In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import numpy as np
import socket, struct
import netaddr
from sklearn.cluster import KMeans

In [2]:
tp = pd.read_csv('capture20110818.pcap.netflow.labeled.csv', chunksize=1000)
df = pd.concat(tp, ignore_index=True)


#preprocessing
df = df[df['Label']!='Background']

ips = df['SrcIPAddr:Port'].str.split(':')
df['SrcIPAddr'] = ips.str[0]
df['SrcIPPort'] = ips.str[-1]
ips = df['DstIPAddr:Port'].str.split(':')
df['DstIPAddr'] = ips.str[0]
df['DstIPPort'] = ips.str[-1]

#get rid of the host
# df = df[df['SrcIPAddr']!='147.32.84.165']

df = df.drop(['Date', 'Flow_start', 'tmp', 'Flows', 'SrcIPAddr:Port', 'DstIPAddr:Port'], axis=1)
df.head()

Unnamed: 0,Durat,Prot,Flags,Tos,Packets,Bytes,Label,Labels,SrcIPAddr,SrcIPPort,DstIPAddr,DstIPPort
20,4.985,TCP,PA_,0,91,86277,LEGITIMATE,,147.32.80.13,80,147.32.85.88,56949
51,0.0,TCP,A_,0,1,66,LEGITIMATE,,147.32.86.110,48102,74.125.232.214,443
57,4.921,TCP,A_,0,49,3234,LEGITIMATE,,147.32.85.88,56949,147.32.80.13,80
73,4.742,TCP,A_,0,118,7080,LEGITIMATE,,147.32.84.59,2768,74.125.108.208,80
75,0.0,TCP,A_,0,1,60,LEGITIMATE,,147.32.84.59,56058,74.125.232.215,443


In [3]:
# malicious IP hosts 
# df[df['Label']=='Botnet']['SrcIPAddr'].value_counts()
# we are taking most frequent ones (also mentioned in README)
train_botnet_IP = '147.32.84.165'

botnet_IPs = np.array(['147.32.84.209', '147.32.84.208', '147.32.84.207', '147.32.84.204', '147.32.84.206', 
              '147.32.84.192', '147.32.84.191', '147.32.84.193', '147.32.84.205'])
normal_IPs = np.array(['147.32.84.170', '147.32.84.134', '147.32.84.164', '147.32.87.36', '147.32.80.9'])

all_IPs = np.concatenate(([train_botnet_IP], botnet_IPs, normal_IPs))

print(all_IPs)

df_all_IPs = df[df['SrcIPAddr'].isin(all_IPs)]

df_all_IPs['SrcIPAddr'].value_counts()

['147.32.84.165' '147.32.84.209' '147.32.84.208' '147.32.84.207'
 '147.32.84.204' '147.32.84.206' '147.32.84.192' '147.32.84.191'
 '147.32.84.193' '147.32.84.205' '147.32.84.170' '147.32.84.134'
 '147.32.84.164' '147.32.87.36' '147.32.80.9']


147.32.84.170    3510
147.32.84.164    2451
147.32.84.134     755
147.32.87.36      336
147.32.84.165     119
147.32.84.209     101
147.32.84.208     100
147.32.84.207      98
147.32.84.204      96
147.32.84.206      96
147.32.84.191      93
147.32.84.192      93
147.32.84.193      87
147.32.84.205      86
147.32.80.9        68
Name: SrcIPAddr, dtype: int64

In [4]:
#elbow value identified from the above graphs
def get_elbows(df_numerical):
    elbow = {'Durat': 3, 'Packets': 3, 'Bytes': 3}
    for column in df_numerical:
        X = df_numerical[column].values.reshape(-1, 1)
        kmeans = KMeans(n_clusters=elbow[column], random_state=0).fit(X)
        df_numerical[column] = kmeans.labels_
    return df_numerical

In [5]:
def discretize(df):
    df_numerical = df[['Durat', 'Packets', 'Bytes']]
    df_numerical = get_elbows(df_numerical)
    df_text = df[['Prot', 'Flags', 'Tos', 'SrcIPPort', 'DstIPAddr', 'DstIPPort']] 
    for column in df_text:
        labels, levels = pd.factorize(df_text[column])
        df_text[column] = labels

    df_discrete = pd.concat([df_text, df_numerical], axis=1)
    df_discrete = df_discrete[['Bytes', 'Packets', 'Flags', 'Durat']] #, 'Prot','Durat'
    code = 0
    spaceSize = 1
    for column in df_discrete:
        spaceSize = spaceSize * df_discrete[column].value_counts(dropna=True).count()

    codes = np.array([])
    spaceSizeCurrent = spaceSize
    for index, row in df_discrete.iterrows():
        for column in df_discrete:
            if (row[column] >= 0):
                code  = code + (row[column] * spaceSizeCurrent / df_discrete[column].value_counts(dropna=True).count())
                spaceSizeCurrent =  spaceSizeCurrent / df_discrete[column].value_counts(dropna=True).count()

        codes = np.append(codes,code)
        spaceSizeCurrent = spaceSize
        code = 0


    df_discrete['codes'] = codes
    df = pd.concat([df_discrete,df[['SrcIPAddr','Label']]], axis=1)
    return df

In [6]:
discret_df = discretize(df_all_IPs)
discret_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,Bytes,Packets,Flags,Durat,codes,SrcIPAddr,Label
127,0,0,0,1,1.0,147.32.84.164,LEGITIMATE
610,0,0,1,0,3.0,147.32.84.164,LEGITIMATE
727,0,0,2,2,8.0,147.32.84.164,LEGITIMATE
1268,0,0,0,0,0.0,147.32.84.164,LEGITIMATE
1302,0,0,0,0,0.0,147.32.84.164,LEGITIMATE


In [7]:
discret_df.groupby(['Bytes', 'Label']).size()

Bytes  Label     
0      Botnet        1029
       LEGITIMATE    7013
1      LEGITIMATE      11
2      LEGITIMATE      36
dtype: int64

In [8]:
def n2_gram_model(n_grams):
    model = dict.fromkeys(n_grams)

    for i in range(len(n_grams)-1):
        if model[n_grams[i]] is None:
            model[n_grams[i]] = {}
        if n_grams[i+1] not in model[n_grams[i]]:
            model[n_grams[i]][n_grams[i+1]] = 1.0
        model[n_grams[i]][n_grams[i + 1]] += 10

    for key in model:
        if model[key] is None:
            continue
        total_count = float(sum(model[key].values()))
        for w2 in model[key]:
            model[key][w2] /= total_count

    return model


## Laplace (add-one) smoothing -- merely adds one to each count 
def n2_gram_model_based(n_grams, base):
    model = dict.fromkeys(base)
    for key in base:
        model[key] = {}
        for w in base[key]:
            model[key][w] = 1.0
    for i in range(len(n_grams)-1):
        if n_grams[i] not in model:
            model[n_grams[i]] = {}
        if n_grams[i+1] not in model[n_grams[i]]:
            model[n_grams[i]][n_grams[i+1]] = 1.0
        model[n_grams[i]][n_grams[i + 1]] += 10

    for key in model:
        if model[key] is None:
            continue
        total_count = float(sum(model[key].values()))
        for w2 in model[key]:
            model[key][w2] /= total_count

    return model

def n2_gram_predict(model, n_grams, threshold):
    probalilites = []
    true_prob = []
    for i in range(len(n_grams)-1):
        if n_grams[i] not in model or n_grams[i+1] not in model[n_grams[i]]:
            prob = 0
        else:
            prob = model[n_grams[i]][n_grams[i+1]]
        true_prob.append(prob)
        if prob > threshold:
            true_prob.append(prob)
            probalilites.append(1)
        else:
            probalilites.append(0)
    return true_prob, np.array(probalilites).transpose()

In [9]:
def n3_gram_model(n_grams):
    model = {}
    for i in range(len(n_grams)-2):
        key = (n_grams[i], n_grams[i+1])
        w3 = n_grams[i+2]
        if key not in model:
            model[key] = {}
        if w3 not in model[key]:
            model[key][w3] = 1.0
        model[key][w3] = model[key][w3] + 10
    for key in model:
        total_count = float(sum(model[key].values()))
        for w3 in model[key]:
            model[key][w3] /= total_count

    return model

## Laplace (add-one) smoothing -- merely adds one to each count 
def n3_gram_model_based(n_grams, base):
    model = dict.fromkeys(base)
    for key in base:
        model[key] = {}
        for w in base[key]:
            model[key][w] = 1.0
    for i in range(len(n_grams)-2):
        key = (n_grams[i], n_grams[i+1])
        w3 = n_grams[i+2]
        if key not in model:
            model[key] = {}
        if w3 not in model[key]:
            model[key][w3] = 1.0
        model[key][w3] = model[key][w3] + 10
    for key in model:
        total_count = float(sum(model[key].values()))
        for w3 in model[key]:
            model[key][w3] /= total_count

    return model

def n3_gram_predict(model, n_grams, threshold):
    probalilites = []
    true_prob = []
    for i in range(len(n_grams)-2):
        key = (n_grams[i], n_grams[i+1])
        w3 = n_grams[i+2]
        if key not in model or w3 not in model[key]:
#             print(key)
            prob = 0
        else:
            prob = model[key][w3]
        true_prob.append(prob)
        if prob > threshold:
            probalilites.append(1)
        else:
            probalilites.append(0)
    return true_prob, np.array(probalilites).transpose()

In [10]:
def normilize(base):
    ret_model = dict.fromkeys(base)
    length = len(ret_model.values())
    sum = 0
    for key in base:
        ret_model[key] = {}
        for w in base[key]:
            ret_model[key][w] = base[key][w] / length
            sum += ret_model[key][w]
    return ret_model
    
def KL_divergence(p, q):
    q = normilize(q)
    dist = 0
    for key in p:     
        for w in p[key]:
            dist += (p[key][w] * np.log(p[key][w]/q[key][w]))
    return dist
            

## N2 gram


In [136]:
model = n2_gram_model(discret_df[discret_df['SrcIPAddr'] == train_botnet_IP]['codes'].values)
print(model)

threshold = 0.005

{9.0: {9.0: 0.3263157894736842, 39.0: 0.22105263157894736, 11.0: 0.22105263157894736, 24.0: 0.11578947368421053, 25.0: 0.11578947368421053}, 39.0: {9.0: 1.0}, 11.0: {30.0: 0.5, 9.0: 0.5}, 30.0: {30.0: 0.9420131291028446, 9.0: 0.022975929978118162, 0.0: 0.012035010940919038, 6.0: 0.022975929978118162}, 24.0: {30.0: 1.0}, 25.0: {24.0: 1.0}, 0.0: {0.0: 0.65625, 8.0: 0.34375}, 8.0: {8.0: 0.5, 6.0: 0.5}, 6.0: {6.0: 0.6612903225806451, 30.0: 0.3387096774193548}}


In [137]:
norm_model = normilize(model)
for ip in botnet_IPs:
    test_model = discret_df[discret_df['SrcIPAddr'] == ip]['codes'].values
    true_prob, prob = n2_gram_predict(model, test_model, threshold)
    test_ngram = n2_gram_model_based(test_model, model)
    dist = KL_divergence(norm_model, test_ngram)
    print(sum(true_prob)/len(true_prob), dist)

0.7989261904858567 0.3322647752931
0.8025219670073066 0.2217563177196304
0.8148774721935385 0.39254355988801803
0.8219216162811699 0.2916172686646126
0.8253523549979814 0.22376768989199136
0.8298263207946293 0.2582376089442656
0.83346759675127 0.39693980400587303
0.8577282587410933 0.29092334839857015
0.8630984753937424 0.26452543080847957


In [138]:
for ip in normal_IPs:
    test_model = discret_df[discret_df['SrcIPAddr'] == ip]['codes'].values
    true_prob, prob = n2_gram_predict(model, test_model, threshold)
    test_ngram = n2_gram_model_based(test_model, model)
    dist = KL_divergence(norm_model, test_ngram)
    print(sum(true_prob)/len(true_prob), dist)

0.19584392179991353 3.4736338243880995
0.4770598627358514 1.6891206245595158
0.3227817689164173 3.2404925264757143
0.05834137487363295 2.920779814631303
0.3263157894736841 0.4516502192274682


## N3 gram

In [139]:
model = n3_gram_model(discret_df[discret_df['SrcIPAddr'] == train_botnet_IP]['codes'].values)
print(model)
threshold = 0.005

{(9.0, 9.0): {39.0: 0.65625, 24.0: 0.34375}, (9.0, 39.0): {9.0: 1.0}, (39.0, 9.0): {11.0: 1.0}, (9.0, 11.0): {30.0: 0.5, 9.0: 0.5}, (11.0, 30.0): {30.0: 1.0}, (30.0, 30.0): {30.0: 0.9502314814814815, 9.0: 0.01273148148148148, 0.0: 0.01273148148148148, 6.0: 0.024305555555555556}, (30.0, 9.0): {9.0: 0.5, 25.0: 0.5}, (11.0, 9.0): {9.0: 1.0}, (9.0, 24.0): {30.0: 1.0}, (24.0, 30.0): {9.0: 0.5, 30.0: 0.5}, (9.0, 25.0): {24.0: 1.0}, (25.0, 24.0): {30.0: 1.0}, (30.0, 0.0): {0.0: 1.0}, (0.0, 0.0): {0.0: 0.5, 8.0: 0.5}, (0.0, 8.0): {8.0: 1.0}, (8.0, 8.0): {6.0: 1.0}, (8.0, 6.0): {6.0: 1.0}, (6.0, 6.0): {30.0: 0.34375, 6.0: 0.65625}, (6.0, 30.0): {30.0: 1.0}, (30.0, 6.0): {30.0: 0.5, 6.0: 0.5}}


In [141]:
norm_model = normilize(model)
print('MLE           KL')
for ip in botnet_IPs:
    test_model = discret_df[discret_df['SrcIPAddr'] == ip]['codes'].values
    true_prob, prob = n3_gram_predict(model, test_model, threshold)
    test_ngram = n3_gram_model_based(test_model, model)
    dist = KL_divergence(norm_model, test_ngram)
    print(sum(true_prob)/len(prob), dist)


MLE           KL
0.7792625327347548 0.78555249331489
0.8041147014361298 0.6841539319886952
0.8000337577160491 0.5264875080504203
0.8276940504334119 0.4734733037386963
0.8383323483057523 0.4409439254316388
0.8549806674806673 0.4092335064197044
0.8656262718762715 0.3589649589238714
0.8895969498910673 0.3606675288404676
0.876515652557319 0.3774147984901173


In [143]:
for ip in normal_IPs:
    test_model = discret_df[discret_df['SrcIPAddr'] == ip]['codes'].values
    true_prob, prob = n3_gram_predict(model, test_model, threshold)
    test_ngram = n3_gram_model_based(test_model, model)
    dist = KL_divergence(norm_model, test_ngram)
    print(sum(true_prob)/len(prob), dist)


0.026608822690992018 3.0082585121577035
0.2450199203187251 1.5913795677527551
0.11359228256431196 3.3576284082063954
0.012443862275449101 2.495675599753367
0.0 0.3519633650302155
