In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from mlxtend.preprocessing import TransactionEncoder
import random
from mlxtend.frequent_patterns import apriori
#from apyori import apriori 

In [2]:
#load data

client = boto3.client('s3')
obj = client.get_object(Bucket='manifolddata', Key='week1.csv')
df = pd.read_csv(BytesIO(obj['Body'].read()))

#df=pd.read_csv('week1processed.csv',  parse_dates=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df=df.iloc[:,[0,1,3,4,5,6,7,8]]
df.columns=['Date', 'Duration', 'Src_IP', 'Src_pt', 'Dst_IP', 'Dst_pt','Packets', 'Bytes']
#add an date column that is rounded to nearest hour, so we can use this as a timestep to see how frequently IP pairs occur in each timestep
df['Date']=pd.to_datetime(df['Date'], format="%Y-%m-%d %H:%M:%S.%f", errors = 'coerce')
df['date_hr']=pd.Series(df['Date']).dt.round("H")

In [4]:
#create a pair column, which is a touple of the src and dst IP, sorted. 
#It does not matter which call came first, we simply want to know which pair occurs most frequently.

df['pairs']=list(zip(df.Src_IP, df.Dst_IP))
df['pairs']=df['pairs'].apply(sorted)
df['pairs2']=tuple(df['pairs'])

In [5]:
df.head()

Unnamed: 0,Date,Duration,Src_IP,Src_pt,Dst_IP,Dst_pt,Packets,Bytes,date_hr,pairs,pairs2
0,2017-08-02 00:00:00.419,0.003,192.168.210.55,44870,192.168.100.11,445.0,2,174,2017-08-02,"[192.168.100.11, 192.168.210.55]","(192.168.100.11, 192.168.210.55)"
1,2017-08-02 00:00:00.421,0.0,192.168.100.11,445,192.168.210.55,44870.0,1,108,2017-08-02,"[192.168.100.11, 192.168.210.55]","(192.168.100.11, 192.168.210.55)"
2,2017-08-02 00:00:02.593,0.004,192.168.220.47,55101,192.168.100.11,445.0,2,174,2017-08-02,"[192.168.100.11, 192.168.220.47]","(192.168.100.11, 192.168.220.47)"
3,2017-08-02 00:00:02.859,0.0,10000_34,443,192.168.210.54,59628.0,1,100,2017-08-02,"[10000_34, 192.168.210.54]","(10000_34, 192.168.210.54)"
4,2017-08-02 00:00:02.594,0.0,192.168.100.11,445,192.168.220.47,55101.0,1,108,2017-08-02,"[192.168.100.11, 192.168.220.47]","(192.168.100.11, 192.168.220.47)"


## Implement Frequent Pattern (FP) - Growth algorithm

https://fp-growth.readthedocs.io/en/latest/usage.html

This algorithm
1. Counts occurence of items in dataset in 1st pass
2. Build FP-tree by inserting instances

This allows the frequentest set to be generated organically, instead of creating a list of each itemset and checking if it does/does not pass minimum threshold (like Apriori algorithm)

#### Vocabulary
 - itemset = all items in 1 transaction
A "pattern" is a conjunction of items, or the unique itemset.
A "rule" X --> Y means if you buy X you are likely to buy Y, or in this case if X IP address is used Y IP address is likely also


#### Evaluation metrics include:

1. Support = how frequently it occurs. The number of transactions of that unique itemset / all transactions. In this case: number of times IP pair occurs/all requests

2. Confidence = how often rule is likely to be true. frequency of X and Y occuring/ frequency of X occuring in entire dataset. Conditional probability of Y given X. P(Ey|Ex)

3. Lift = How likely is item Y given item X occurs, controlling for how frequent Y occurs in the entire dataset. For rule X-->Y, lift = P(Y|X)/P(Y).
    Lift = 1 means X and Y are independent
    Lift >1 = X and Y are positively correlated
    Lift <1 = X and Y are negatively correlated

In [6]:
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

In [7]:
dataset

[['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
 ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
 ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

Loop through in random chuncks, continue until no rules added

In [7]:
#we only want a list of all the individual pairs at each timestamp. Think of this where each timestamp is a 'transaction' and we chose to buy which 2 items (IP addresses)
data_l=list(df['pairs'])

In [8]:
#wants things in a 1 hot encoded pandas dataframe for the library models we want to run. use Sprase data
te = TransactionEncoder()
te_ary = te.fit(data_l).transform(data_l, sparse=True)
data_coded = pd.SparseDataFrame(te_ary, columns=te.columns_, default_fill_value=False)
data_coded.head()


Unnamed: 0,0.0.0.0,10000_163,10000_34,10000_35,10000_40,10000_42,10000_66,10000_67,10000_72,10000_74,...,192.168.220.44,192.168.220.45,192.168.220.46,192.168.220.47,192.168.220.48,192.168.220.49,192.168.220.50,192.168.220.51,255.255.255.255,DNS
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [9]:
data_subset=data_coded.sample(n=150000)
data_subset2=data_coded.sample(n=150000)
data_subset3=data_coded.sample(n=150000)
data_subset4=data_coded.sample(n=150000)

In [32]:
data_subset3=data_coded.sample(n=150000)
data_subset4=data_coded.sample(n=150000)

In [100]:
ap1=apriori(data_subset, min_support=0.001,  use_colnames=True)
ap2=apriori(data_subset2, min_support=0.001,  use_colnames=True)

In [101]:
ap3=apriori(data_subset3, min_support=0.001, use_colnames=True)
ap4=apriori(data_subset4, min_support=0.001, use_colnames=True)

In [102]:
for i in ap2['itemsets']:
    if i not in list(ap1.itemsets):
        print(ap2[ap2['itemsets']==i])
        ap1.append(ap2[ap2['itemsets']==i])

     support     itemsets
25  0.001033  (10028_162)
     support     itemsets
29  0.001147  (10030_132)
     support     itemsets
68  0.001013  (10085_210)
     support    itemsets
77  0.001107  (10098_60)
     support    itemsets
83  0.001027  (10123_90)
     support   itemsets
106  0.00104  (10312_7)
      support    itemsets
141  0.001067  (14170_51)
      support                    itemsets
190  0.001087  (10042_29, 192.168.220.50)
      support                    itemsets
198  0.001187  (192.168.220.50, 10048_27)
      support                   itemsets
201  0.001027  (10056_1, 192.168.220.51)
      support                    itemsets
226  0.001067  (14170_51, 192.168.220.51)


In [103]:
for i in ap3['itemsets']:
    if i not in list(ap1.itemsets):
        print(ap3[ap3['itemsets']==i])
        ap1.append(ap3[ap3['itemsets']==i])

     support     itemsets
23  0.001107  (10028_162)
    support     itemsets
27  0.00116  (10030_132)
     support     itemsets
36  0.001047  (10045_159)
    support    itemsets
37    0.001  (10045_17)
     support     itemsets
78  0.001093  (10104_100)
      support   itemsets
102  0.001033  (10312_7)
      support    itemsets
135  0.001013  (14170_51)
      support     itemsets
136  0.001153  (14185_238)
     support                    itemsets
186  0.00112  (10042_29, 192.168.220.50)
      support                    itemsets
194  0.001113  (192.168.220.50, 10048_27)
      support                    itemsets
212  0.001087  (192.168.220.47, 10286_33)
      support                    itemsets
224  0.001013  (14170_51, 192.168.220.51)
      support                     itemsets
225  0.001153  (14185_238, 192.168.220.51)


In [104]:
for i in ap4['itemsets']:
    if i not in list(ap1.itemsets):
        print(ap4[ap4['itemsets']==i])
        ap1.append(ap4[ap4['itemsets']==i])

    support     itemsets
24    0.001  (10028_162)
    support     itemsets
28  0.00118  (10030_132)
     support   itemsets
99  0.001073  (10312_7)
      support    itemsets
116  0.001087  (10481_10)
      support    itemsets
134  0.001067  (14170_51)
      support                    itemsets
170  0.001067  (10002_14, 192.168.220.47)
      support                    itemsets
185  0.001007  (10042_29, 192.168.220.49)
      support                    itemsets
186  0.001107  (10042_29, 192.168.220.50)
      support                    itemsets
194  0.001133  (192.168.220.50, 10048_27)
     support                   itemsets
196  0.00102  (10056_1, 192.168.220.51)
      support                    itemsets
203  0.001173  (192.168.220.51, 10085_98)
      support                    itemsets
214  0.001147  (192.168.220.47, 10286_33)
      support                   itemsets
216  0.001087  (10298_9, 192.168.220.47)
      support                    itemsets
227  0.001067  (14170_51, 192.168.220.51

This shows that even though we take 4 random subsamples and conduct the Apriori algorithm, we are still adding new itemsets. Ideally, we would keep going until we are adding no new itemsets. But, for simplicity sake let's move on.
This did apply a threshold of support = 0.001 or the itemset had to appear at minimum 0.1% in the data.

We only care about itemsets with 2 values, so we will filter out the single items

In [None]:
ap1.head()

In [93]:
ap1_test=ap1
ap1_test['length'] = ap1_test['itemsets'].apply(lambda x: len(x))


In [None]:
ap1.head()

In [95]:
ap1

Unnamed: 0,support,itemsets,length
0,0.001073,(10000_34),1
1,0.001047,(10000_66),1
2,0.001420,(10001_213),1
3,0.003073,(10002_14),1
4,0.001113,(10002_162),1
5,0.001300,(10002_2),1
6,0.002973,(10003_224),1
7,0.002907,(10003_226),1
8,0.001213,(10008_24),1
9,0.008173,(10008_33),1


### Generate Rules from the most frequent items we found

In [96]:
#remove length column
ap1=ap1[['support', 'itemsets']]
ap1

Unnamed: 0,support,itemsets
0,0.001073,(10000_34)
1,0.001047,(10000_66)
2,0.001420,(10001_213)
3,0.003073,(10002_14)
4,0.001113,(10002_162)
5,0.001300,(10002_2)
6,0.002973,(10003_224)
7,0.002907,(10003_226)
8,0.001213,(10008_24)
9,0.008173,(10008_33)


In [97]:
from mlxtend.frequent_patterns import association_rules

rules_001=association_rules(ap1,  metric="support", min_threshold=0.01)

In [99]:
rules_001

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(192.168.220.43),(10008_33),0.042960,0.008173,0.006120,0.142458,17.429621,0.005769,1.156593
1,(10008_33),(192.168.220.43),0.008173,0.042960,0.006120,0.748777,17.429621,0.005769,3.809516
2,(10008_33),(192.168.220.45),0.008173,0.061187,0.001027,0.125612,2.052927,0.000527,1.073680
3,(192.168.220.45),(10008_33),0.061187,0.008173,0.001027,0.016779,2.052927,0.000527,1.008753
4,(10008_9),(192.168.210.55),0.022840,0.060747,0.020913,0.915645,15.073174,0.019526,11.134540
5,(192.168.210.55),(10008_9),0.060747,0.022840,0.020913,0.344271,15.073174,0.019526,1.490189
6,(10008_9),(192.168.220.46),0.022840,0.027393,0.001053,0.046118,1.683545,0.000428,1.019630
7,(192.168.220.46),(10008_9),0.027393,0.022840,0.001053,0.038452,1.683545,0.000428,1.016237
8,(192.168.220.51),(10012_14),0.219620,0.005567,0.001180,0.005373,0.965195,-0.000043,0.999805
9,(10012_14),(192.168.220.51),0.005567,0.219620,0.001180,0.211976,0.965195,-0.000043,0.990300


now we will set some thresholds for creating rules and go through our first chunk of data


min_support : float (default: 0.5)
A float between 0 and 1 for minumum support of the itemsets returned. The support is computed as the fraction transactions_where_item(s)_occur / total_transactions.

We don't want to bother making rules for infrequent items, so we will set the support = 0.5 meaning this IP address is present in 50% of the requests


In [13]:
from mlxtend.frequent_patterns import apriori

ap1=apriori(data1_coded, min_support=0.05)


In [39]:
ap1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
support     6 non-null float64
itemsets    6 non-null object
dtypes: float64(1), object(1)
memory usage: 176.0+ bytes


In [21]:
ap1.itemsets[3]==({4855})

True

In [22]:
if ({4855}) in ap1.itemsets:
    print (ap1.itemsets)

TypeError: unhashable type: 'set'

In [24]:
df_pairs=df[['Date','pairs']]
df_pairs=pd.DataFrame(df_pairs)
df_pairs.columns

Index(['Date', 'pairs'], dtype='object')

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df_pairs2 = df_pairs.join(pd.DataFrame(mlb.fit_transform(df_pairs.pop('pairs')),
                          columns=mlb.classes_,
                          index=df_pairs.index))

MemoryError: 