### Source Info

Source: http://archive.ics.uci.edu/ml/datasets/KDD+Cup+1999+Data

Data Description: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data/kddcup.data_0_1_percent.csv', sep=',')
df.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,32,33,34,35,36,37,38,39,40,41
0,22650,1,tcp,smtp,SF,1022,389,0,0,0,...,175,0.64,0.02,0.01,0.02,0.0,0.0,0.0,0.0,normal.
1,5765,0,tcp,http,SF,376,285,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2,241826,0,icmp,ecr_i,SF,1032,0,0,0,0,...,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,smurf.


### Data Wrangling & Define Variables

In [3]:
## Data Wrangling ##

labels = df['41']
label_counts = labels.value_counts()

# label must encompass > 5% of the data
mask = label_counts > df.shape[0] * 0.05

# get passing labels
valid_labels = label_counts.index[mask]
print('Passing labels =', valid_labels)

# wrangle dataset with passing labels only
model_df = df[df['41'].isin(valid_labels)]

# redefine labels
labels = model_df['41']

# df to use
model_df = model_df.drop(columns = ['Unnamed: 0', '0', '1', '2', '3'])
model_df.head(3)

Passing labels = Index(['smurf.', 'neptune.', 'normal.'], dtype='object')


Unnamed: 0,4,5,6,7,8,9,10,11,12,13,...,32,33,34,35,36,37,38,39,40,41
0,1022,389,0,0,0,0,0,1,0,0,...,175,0.64,0.02,0.01,0.02,0.0,0.0,0.0,0.0,normal.
1,376,285,0,0,0,0,0,1,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2,1032,0,0,0,0,0,0,0,0,0,...,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,smurf.


In [4]:
## Defining global variables ##

# columns to use (numeric ones)
features = ['4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21',
           '22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40']

# specify the metrics column names to be modeled
to_model_columns = df.filter(items = features).columns
to_model_columns

Index(['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16',
       '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28',
       '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40'],
      dtype='object')

### kNN

Below are some functions to be used throughout the code

In [5]:
## subset ##

def subset(model, n):
    df = model[0:len(model)//n]
    labels = nndf['41']
    knn(df, to_model_columns, labels, 3, 'auto', 30, 'minkowski')

In [6]:
## kNN ##

import time
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

def knn(model_data, model_columns, labels, K, alg, ls, mtc):
    start_time = time.time()
    
    # set seed
    random_state = 0
    random.seed(random_state)
    np.random.seed(0)

    # split data into train/test sets
    X, y = model_data.select_dtypes(['number']), labels
    params = train_test_split(X, y, test_size = 1/4, random_state = random_state, shuffle = True, stratify = y)
    X_train, X_test, y_train, y_test = params

    # fit model
    model = KNeighborsClassifier(n_neighbors=K, algorithm=alg, leaf_size=ls, metric=mtc, n_jobs=-1)
    model.fit(X_train, y_train)
    
    # predict on model_df
    pred = model.predict(model_data[model_columns])
    
    # compute f1-score
    score = f1_score(labels, pred, average='micro')
#     print(score) 

    # print time
    time_taken = str(round(time.time() - start_time,5))
#     print("Time taken: " + str(round(time.time() - start_time,2)) + " seconds") 
    
    return [score, time_taken]

In [75]:
r = knn(nndf, to_model_columns, nnlabels, 3, 'ball_tree', 3000, 'euclidean')
r[0]

0.9997856836690956

### Export Collected Data

In [7]:
# dicts
K = [1, 2, 3, 4, 5]
algorithm = ['ball_tree', 'kd_tree']
leaf_size = [30, 300, 3000]
metrics = ['minkowski', 'manhattan', 'euclidean']
Ks = {}
algs = {}
lss = {}
mets = {}
fs = {}
sec = {}

In [8]:
# subset (knn takes forever with the entire dataset)
ndf = model_df[0:len(model_df)//8]
nlabels = ndf['41']
ndf['41'].value_counts()

smurf.      35109
neptune.    13381
normal.     12168
Name: 41, dtype: int64

In [13]:
# collect data

count = 1
for k in K:
    for alg in algorithm:
        for lf in leaf_size:
            for mt in metrics:
                output = knn(ndf, to_model_columns, nlabels, k, alg, lf, mt)
                Ks[str(count)] = k
                algs[str(count)] = alg
                lss[str(count)] = lf
                mets[str(count)] = mt
                fs[str(count)] = output[0]
                sec[str(count)] = output[1]
                print(str(count)+' of 90 iterations completed. '+str(90-count)+' iterations left. Estimated time is '+str(round(float(output[1])*(90-count)//60,2))+' minutes.')
                count = count+1

1 of 90 iterations completed. 89 iterations left. Estimated time is 40.0 minutes.
2 of 90 iterations completed. 88 iterations left. Estimated time is 39.0 minutes.
3 of 90 iterations completed. 87 iterations left. Estimated time is 38.0 minutes.
4 of 90 iterations completed. 86 iterations left. Estimated time is 54.0 minutes.
5 of 90 iterations completed. 85 iterations left. Estimated time is 57.0 minutes.
6 of 90 iterations completed. 84 iterations left. Estimated time is 52.0 minutes.
7 of 90 iterations completed. 83 iterations left. Estimated time is 105.0 minutes.
8 of 90 iterations completed. 82 iterations left. Estimated time is 112.0 minutes.
9 of 90 iterations completed. 81 iterations left. Estimated time is 100.0 minutes.
10 of 90 iterations completed. 80 iterations left. Estimated time is 31.0 minutes.
11 of 90 iterations completed. 79 iterations left. Estimated time is 30.0 minutes.
12 of 90 iterations completed. 78 iterations left. Estimated time is 29.0 minutes.
13 of 90 i

In [14]:
score_data = {'K': Ks, 'Algorithm': algs, 'Leaf Size': lss, 'Metric': mets, 'f1-score': fs, 'Time': sec}

In [15]:
score_data

{'K': {'1': 1,
  '2': 1,
  '3': 1,
  '4': 1,
  '5': 1,
  '6': 1,
  '7': 1,
  '8': 1,
  '9': 1,
  '10': 1,
  '11': 1,
  '12': 1,
  '13': 1,
  '14': 1,
  '15': 1,
  '16': 1,
  '17': 1,
  '18': 1,
  '19': 2,
  '20': 2,
  '21': 2,
  '22': 2,
  '23': 2,
  '24': 2,
  '25': 2,
  '26': 2,
  '27': 2,
  '28': 2,
  '29': 2,
  '30': 2,
  '31': 2,
  '32': 2,
  '33': 2,
  '34': 2,
  '35': 2,
  '36': 2,
  '37': 3,
  '38': 3,
  '39': 3,
  '40': 3,
  '41': 3,
  '42': 3,
  '43': 3,
  '44': 3,
  '45': 3,
  '46': 3,
  '47': 3,
  '48': 3,
  '49': 3,
  '50': 3,
  '51': 3,
  '52': 3,
  '53': 3,
  '54': 3,
  '55': 4,
  '56': 4,
  '57': 4,
  '58': 4,
  '59': 4,
  '60': 4,
  '61': 4,
  '62': 4,
  '63': 4,
  '64': 4,
  '65': 4,
  '66': 4,
  '67': 4,
  '68': 4,
  '69': 4,
  '70': 4,
  '71': 4,
  '72': 4,
  '73': 5,
  '74': 5,
  '75': 5,
  '76': 5,
  '77': 5,
  '78': 5,
  '79': 5,
  '80': 5,
  '81': 5,
  '82': 5,
  '83': 5,
  '84': 5,
  '85': 5,
  '86': 5,
  '87': 5,
  '88': 5,
  '89': 5,
  '90': 5},
 'Algorithm':

In [16]:
import json

j_score_data = json.dumps(score_data, indent = 4)
  
with open('data/score.json', 'w') as f:
    f.write(j_score_data)