In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from mlxtend.preprocessing import TransactionEncoder

In [3]:
#load data
df=pd.read_csv('week1processed.csv',  parse_dates=True)

In [4]:
#add an date column that is rounded to nearest hour, so we can use this as a timestep to see how frequently IP pairs occur in each timestep
df['Date']=pd.to_datetime(df['Date'], format="%Y-%m-%d %H:%M:%S.%f", errors = 'coerce')
df['date_hr']=pd.Series(df['Date']).dt.round("H")

In [5]:
#create a pair column, which is a touple of the src and dst IP, sorted. 
#It does not matter which call came first, we simply want to know which pair occurs most frequently.

df['pairs']=list(zip(df.Src_IP, df.Dst_IP))
df['pairs']=df['pairs'].apply(sorted)
df['pairs2']=tuple(df['pairs'])

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Duration,Src_IP,Src_pt,Dst_IP,Dst_pt,Packets,Bytes,date_hr,pairs,pairs2
0,0,2017-08-02 00:00:00.419,0.003,192.168.210.55,44870,192.168.100.11,445.0,2,174,2017-08-02,"[192.168.100.11, 192.168.210.55]","(192.168.100.11, 192.168.210.55)"
1,1,2017-08-02 00:00:00.421,0.0,192.168.100.11,445,192.168.210.55,44870.0,1,108,2017-08-02,"[192.168.100.11, 192.168.210.55]","(192.168.100.11, 192.168.210.55)"
2,2,2017-08-02 00:00:02.593,0.004,192.168.220.47,55101,192.168.100.11,445.0,2,174,2017-08-02,"[192.168.100.11, 192.168.220.47]","(192.168.100.11, 192.168.220.47)"
3,3,2017-08-02 00:00:02.859,0.0,10000_34,443,192.168.210.54,59628.0,1,100,2017-08-02,"[10000_34, 192.168.210.54]","(10000_34, 192.168.210.54)"
4,4,2017-08-02 00:00:02.594,0.0,192.168.100.11,445,192.168.220.47,55101.0,1,108,2017-08-02,"[192.168.100.11, 192.168.220.47]","(192.168.100.11, 192.168.220.47)"


In [22]:
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

In [23]:
dataset

[['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
 ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
 ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

In [24]:
#wants things in a 1 hot encoded pandas dataframe for the library models we want to run
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
data_coded = pd.DataFrame(te_ary, columns=te.columns_)
data_coded

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [24]:
dataset[1][1]

'Onion'

In [16]:
#we only want a list of all the individual pairs at each timestamp. Think of this where each timestamp is a 'transaction' and we chose to buy which 2 items (IP addresses)
data_l=list(df['pairs'])

In [17]:
data_l

[['192.168.100.11', '192.168.210.55'],
 ['192.168.100.11', '192.168.210.55'],
 ['192.168.100.11', '192.168.220.47'],
 ['10000_34', '192.168.210.54'],
 ['192.168.100.11', '192.168.220.47'],
 ['10000_34', '192.168.210.54'],
 ['192.168.100.11', '192.168.220.42'],
 ['192.168.100.11', '192.168.220.42'],
 ['192.168.100.11', '192.168.220.46'],
 ['192.168.100.11', '192.168.220.46'],
 ['192.168.100.11', '192.168.210.46'],
 ['192.168.100.11', '192.168.210.46'],
 ['192.168.100.11', '192.168.210.46'],
 ['192.168.100.11', '192.168.210.46'],
 ['192.168.100.11', '192.168.220.43'],
 ['192.168.100.11', '192.168.220.43'],
 ['192.168.100.11', '192.168.220.44'],
 ['192.168.100.11', '192.168.220.44'],
 ['192.168.100.11', '192.168.210.50'],
 ['192.168.100.11', '192.168.210.50'],
 ['192.168.100.11', '192.168.220.44'],
 ['192.168.100.11', '192.168.220.44'],
 ['192.168.100.11', '192.168.220.48'],
 ['192.168.100.11', '192.168.220.48'],
 ['192.168.100.11', '192.168.220.44'],
 ['192.168.100.11', '192.168.220.44']

In [18]:
type(data_l)

list

In [9]:
df_pairs=df['pairs']
df_pairs=pd.DataFrame(df_pairs)
df_pairs.columns

Index(['pairs'], dtype='object')

In [42]:
df_pairs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8185992 entries, 0 to 8185991
Empty DataFrame

In [12]:
v = df_pairs.pairs.values
l = [len(x) for x in v.tolist()]
f, u = pd.factorize(np.concatenate(v))
n, m = len(v), u.size
i = np.arange(n).repeat(l)

dummies = pd.DataFrame(
    np.bincount(i * m + f, minlength=n * m).reshape(n, m),
    df.index, u
)

df_pairs.drop('pairs', 1).join(dummies)

MemoryError: 

In [38]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df_pairs2 = df_pairs.join(pd.DataFrame(mlb.fit_transform(df_pairs.pop('pairs')),
                          columns=mlb.classes_,
                          index=df_pairs.index))

MemoryError: 

In [25]:
#wants things in a 1 hot encoded pandas dataframe for the library models we want to run
te = TransactionEncoder()
te_ary = te.fit(data_l).transform(data_l)
data_coded = pd.DataFrame(te_ary, columns=te.columns_)
data_coded

MemoryError: 