# Data processing and Sampling

In [1]:
# modules used
from collections import Counter
import numpy as np
import pandas as pd
import pickle
import os
import re
import mmh3
import operator

In [2]:
# Load the labelled NetFlow capture into the DataFrame `data`.
# Parsing the raw text file takes a decent amount of time, so the parsed
# frame is cached in "data.pkl" and the cache is used whenever it exists.
if os.path.exists("data.pkl"):
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that this notebook wrote itself.
    data = pd.read_pickle("data.pkl")

else:
    rows_list = []
    # `with` guarantees the capture file is closed, even if a line fails to parse.
    with open('capture20110815.pcap.netflow.labeled', 'r') as capture_file:
        # The first line is consumed here so it is not parsed as a flow record.
        header = capture_file.readline().split()
        for line in capture_file:
            # Split on any run of whitespace so fields can be addressed by position.
            words = line.split()
            if not words:
                # Skip blank lines instead of failing on a missing field.
                continue
            # Split "address:port" into address and port; the port may be absent.
            # NOTE(review): an address that itself contains ':' (e.g. IPv6)
            # would be cut at its first colon - confirm the capture is IPv4 only.
            src = words[4].split(':')
            dest = words[6].split(':')
            row = {
                "Date": words[0],
                "Start": words[1],
                "Dur": words[2],
                "Prot": words[3],
                "Src": src[0],
                "PortSrc": src[1] if len(src) > 1 else None,
                "Dest": dest[0],
                "PortDest": dest[1] if len(dest) > 1 else None,
            }
            # A full record has 13 fields. Otherwise the Flags column is taken
            # to be present only when field 7 is one of the two known flag
            # strings; in every other case Flags is treated as missing and the
            # remaining columns sit one position earlier.
            has_flags = len(words) == 13 or words[7] in ("S_", "SRPA_")
            shift = 1 if has_flags else 0
            row["Flags"] = words[7] if has_flags else None
            row["Tos"] = words[7 + shift]
            row["Packets"] = words[8 + shift]
            row["Bytes"] = words[9 + shift]
            row["Flows"] = words[10 + shift]
            row["Label"] = words[11 + shift]
            rows_list.append(row)
    # Build the frame once from the collected records (all values stay strings).
    data = pd.DataFrame(rows_list, columns=["Date", "Start", "Dur", "Prot", "Src", "PortSrc", "Dest",
                                            "PortDest", "Flags", "Tos", "Packets", "Bytes", "Flows", "Label"])
    data.to_pickle("data.pkl")

data.head(100)

Unnamed: 0,Date,Start,Dur,Prot,Src,PortSrc,Dest,PortDest,Flags,Tos,Packets,Bytes,Flows,Label
0,Date,Start,Dur,Prot,Src,PortSrc,Dest,PortDest,Flags,Tos,Packets,Bytes,Flows,Label
1,2011-08-15,11:00:05.704,0.000,TCP,147.32.84.134,45266,205.188.10.203,443,A_,0,1,60,1,LEGITIMATE
2,2011-08-15,11:00:05.704,4.996,TCP,62.44.1.18,80,147.32.84.118,51463,PA_,0,58,84327,1,Background
3,2011-08-15,11:00:05.706,4.892,TCP,147.32.84.118,51463,62.44.1.18,80,A_,0,29,1740,1,Background
4,2011-08-15,11:00:05.707,0.000,UDP,147.32.80.9,53,147.32.85.103,18006,INT,0,1,175,1,Background
5,2011-08-15,11:00:05.709,0.000,UDP,147.32.85.103,41094,147.32.80.9,53,INT,0,1,71,1,Background
6,2011-08-15,11:00:05.712,0.000,UDP,147.32.80.9,53,147.32.85.103,41094,INT,0,1,152,1,Background
7,2011-08-15,11:00:05.715,0.000,UDP,147.32.85.103,27423,147.32.80.9,53,INT,0,1,73,1,Background
8,2011-08-15,11:00:05.716,4.115,TCP,212.194.183.145,49584,147.32.85.56,44076,PA_,0,12,891,1,Background
9,2011-08-15,11:00:05.718,4.113,TCP,147.32.85.56,44076,212.194.183.145,49584,PA_,0,7,477,1,Background


In [3]:
# Print how often each source IP address occurs in the data set.
# The call form of print behaves the same on Python 2 and Python 3.
print(data["Src"].value_counts())

147.32.84.229      4488
147.32.80.9        3487
147.32.84.59       3214
147.32.84.138      1333
147.32.84.118       831
147.32.86.50        428
147.32.85.34        235
147.32.86.20        231
147.32.85.25        222
147.32.86.165       217
147.32.86.187       217
147.32.86.194       195
147.32.85.56        177
147.32.85.103       174
147.32.84.130       161
147.32.84.184       149
147.32.86.182       143
133.1.74.163        136
147.32.86.155       116
147.32.80.13        104
147.32.86.168       102
147.32.85.8         102
147.32.84.170       101
147.32.84.171        97
147.32.85.114        95
147.32.85.30         94
74.125.232.213       88
147.32.86.186        86
74.125.39.125        85
147.32.85.123        81
                   ... 
221.146.124.208       1
85.251.70.15          1
124.40.51.146         1
219.126.158.29        1
213.112.243.52        1
77.248.249.31         1
110.74.204.66         1
115.78.96.124         1
147.32.84.227         1
188.242.78.244        1
175.39.72.198   

# Min-Wise Sampling

In [5]:
# Min-wise sampling: give every row an independent uniform random tag in
# [0, 1), keep the rows with the k smallest tags, and print the ten most
# frequent source IPs of each sample to check the IP frequencies again.
reservoir_sizes = [100, 1000, 10000, 100000]
# Fixed seed so the sampled counts are the same on every Restart & Run All.
rng = np.random.RandomState(0)
chances = rng.uniform(low=0.0, high=1.0, size=data.shape[0])
# The ordering of the tags does not depend on the reservoir size: sort once.
tag_order = np.argsort(chances)
for size in reservoir_sizes:
    # Slicing past the end is safe: a size larger than the data keeps every row.
    k_smallest = tag_order[:size]
    # argsort yields row positions, so select positionally rather than by label.
    sampled_data = data.iloc[k_smallest, :]
    print(sampled_data["Src"].value_counts()[:10])

147.32.84.229    19
147.32.80.9      11
147.32.84.59     11
147.32.84.138     5
147.32.85.114     2
147.32.86.187     2
147.32.86.168     2
147.32.84.118     2
147.32.84.68      2
147.32.86.50      1
Name: Src, dtype: int64
147.32.84.229    161
147.32.80.9      117
147.32.84.59     105
147.32.84.138     51
147.32.84.118     23
147.32.86.50      16
147.32.84.184      8
147.32.86.165      7
147.32.86.187      7
147.32.86.106      6
Name: Src, dtype: int64
147.32.84.229    1526
147.32.80.9      1190
147.32.84.59     1107
147.32.84.138     428
147.32.84.118     268
147.32.86.50      124
147.32.85.34       91
147.32.85.25       82
147.32.86.20       80
147.32.86.187      77
Name: Src, dtype: int64
147.32.84.229    4488
147.32.80.9      3487
147.32.84.59     3214
147.32.84.138    1333
147.32.84.118     831
147.32.86.50      428
147.32.85.34      235
147.32.86.20      231
147.32.85.25      222
147.32.86.187     217
Name: Src, dtype: int64


# Count-min sketch

In [25]:
# Count-min sketch of the source-IP frequencies.
# The table has d rows (one per hash seed) and w columns of counters.
w = 10000
d = 100
cells = w*d
sketch = [[0]*w for i in range(d)]
nmbrIPs = len(data["Src"])


def sketch_column(ip, seed):
    """Return the column index that `ip` maps to in sketch row `seed`."""
    # `% w` folds the hash value into a valid column index in [0, w).
    return mmh3.hash(ip, seed) % w


# Add every flow's source IP to the table: one counter per row is incremented.
for ip in data["Src"]:
    for seed in range(d):
        sketch[seed][sketch_column(ip, seed)] += 1

# Estimate each IP's frequency as the smallest of its d counters. Counters
# are only ever incremented, so the estimate can overcount (collisions) but
# never undercount. One query per distinct IP is enough: the estimate does
# not depend on how many rows carry that IP.
sketchFrequencies = {}
for ip in data["Src"].unique():
    sketchFrequencies[ip] = min(sketch[seed][sketch_column(ip, seed)] for seed in range(d))

# Look at the 10 most common source IPs according to the sketch.
sortedFreq = sorted(sketchFrequencies.items(), key=operator.itemgetter(1), reverse = True)
print(sortedFreq[:10])

[('147.32.84.229', 4488), ('147.32.80.9', 3487), ('147.32.84.59', 3214), ('147.32.84.138', 1333), ('147.32.84.118', 831), ('147.32.86.50', 428), ('147.32.85.34', 235), ('147.32.86.20', 231), ('147.32.85.25', 222), ('147.32.86.187', 217)]
