# CSE-CIC-IDS 2017 K-Nearest Neighbors Classifier

In [1]:
# Identifier for this model variant. It is the prefix of `model_filename`,
# which names the pickled model, the test confusion-matrix CSV, and the
# entry appended to the model log.
model_id = "knearestneighbors-distance"

In [2]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [3]:
# Root directory that holds the dataset CSVs, the "Models/" and
# "Confusion Matrices/" folders, and model_log.txt.
# The original machine-specific location stays the default so existing runs
# are unaffected; set the SCIFAIR_CODE_DIR environment variable to run the
# notebook from another checkout or machine.
_DEFAULT_NOTEBOOK_PATH = "C:/Users/Xetrov/Desktop/SciFair20/Code/"
NOTEBOOK_PATH = os.environ.get("SCIFAIR_CODE_DIR") or _DEFAULT_NOTEBOOK_PATH

# Later cells build paths with plain string concatenation, so the root must
# end with a path separator.
if not NOTEBOOK_PATH.endswith(("/", "\\")):
    NOTEBOOK_PATH += "/"

In [4]:
# Load the feature matrix first, then the label frame, from the dataset
# folder under NOTEBOOK_PATH (the generator is consumed in that order).
x_scaled, y_df_enc = (
    pd.read_csv(NOTEBOOK_PATH + relative_path)
    for relative_path in (
        "IDS2017/x_scaled_powertransform.csv",
        "IDS2017/y_grouped_1henc.csv",
    )
)

# Split data into train, validation, and test

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# First split: keep 60% for training and hold out 40%, which is divided
# into validation and test sets in the next cell. Fixed seed for
# reproducibility.
x_train, x_valtest, y_train, y_valtest = train_test_split(
    x_scaled,
    y_df_enc,
    test_size=0.4,
    random_state=42,
)

In [7]:
# Second split: halve the 40% hold-out, giving validation and test sets of
# roughly 20% of the full data each. Same fixed seed as the first split.
x_val, x_test, y_val, y_test = train_test_split(
    x_valtest,
    y_valtest,
    test_size=0.5,
    random_state=42,
)

In [8]:
# The intermediate hold-out frames have been split into validation and test
# sets, so drop the names.
del x_valtest, y_valtest

In [9]:
# Column sums of each label frame, i.e. the per-class sample count in each
# split. The leading "\n" on the later headings separates the sections.
for heading, label_frame in (
    ("Train:", y_train),
    ("\nValidation:", y_val),
    ("\nTest:", y_test),
):
    print(heading)
    print(label_frame.sum(axis=0))

Train:
BENIGN          1364100
Botnet             1185
Brute Force        8273
DoS/DDoS         228292
Infiltration         24
PortScan          95267
Web Attack         1304
dtype: int64

Validation:
BENIGN          454207
Botnet             397
Brute Force       2810
DoS/DDoS         76510
Infiltration         7
PortScan         31787
Web Attack         431
dtype: int64

Test:
BENIGN          454790
Botnet             384
Brute Force       2752
DoS/DDoS         75897
Infiltration         5
PortScan         31876
Web Attack         445
dtype: int64


# Train model

In [10]:
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# Hyperparameters for the classifier: 5 neighbours, votes weighted by
# distance (the variant named in `model_id`), 12 parallel jobs.
knn_params = dict(n_neighbors=5, weights='distance', n_jobs=12)

# Fit on the training split; the labels are passed as the multi-column
# frame y_train, so predictions come back with one column per label.
clf = KNeighborsClassifier(**knn_params).fit(x_train, y_train)

Timing note: started at 2:12; finish time not recorded.

**Save model**

In [12]:
import pickle

In [13]:
import time

# Base name shared by the saved model, the confusion-matrix CSV, and the log
# entry: "<model_id> [YYYYMMDD HHMM]", stamped with the local time at which
# this cell runs.
run_stamp = time.strftime("%Y%m%d %H%M")
model_filename = "".join([model_id, " [", run_stamp, "]"])

In [14]:
# Persist the fitted classifier under "Models/<model_filename>.pkl".
# The context manager closes the file even if pickling raises, so a failed
# dump no longer leaks an open handle. The previous `saved_model` name was
# dropped: pickle.dump returns None, so it never held the model.
with open(NOTEBOOK_PATH + "Models/" + model_filename + ".pkl", "wb") as save_file:
    pickle.dump(clf, save_file)

**Evaluate model on the validation set**

In [16]:
# Predict labels for the validation split. The result is indexed with
# argmax(1) in the next cell, i.e. treated as one row per sample.
pred = clf.predict(x_val)

In [17]:
# Collapse each row to a single class index (position of its largest entry).
# NOTE(review): if a predicted row is all zeros, argmax returns 0, i.e. the
# first label column - confirm such rows cannot occur or that this fallback
# is acceptable.
pred_index = list(pred.argmax(1))
y_index = list(np.argmax(y_val.to_numpy(), axis=1))

# The index -> label lookup must follow the real column order, because that
# is what the argmax indices refer to. Sorting the names is only correct when
# the columns already happen to be in alphabetical order.
atktypes = list(y_val.columns)
index_to_label = dict(enumerate(atktypes))
pred_series = pd.Series(pred_index, name="Pred").map(index_to_label)
y_series = pd.Series(y_index, name="Actual").map(index_to_label)

# Rows are predicted classes, columns are actual classes; margins=True adds
# the "All" totals row and column.
matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,BENIGN,Botnet,Brute Force,DoS/DDoS,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BENIGN,453759,100,4,54,5,3,6,453931
Botnet,119,297,0,0,0,0,0,416
Brute Force,5,0,2806,0,0,0,0,2811
DoS/DDoS,127,0,0,76454,0,5,4,76590
Infiltration,0,0,0,0,2,0,0,2
PortScan,193,0,0,1,0,31775,0,31969
Web Attack,4,0,0,1,0,4,421,430
All,454207,397,2810,76510,7,31787,431,566149


# Test Set

In [15]:
from sklearn.metrics import f1_score

In [16]:
# Score the held-out test split: reduce predictions and ground truth to one
# class index per sample, then compute micro- and macro-averaged F1.
pred = clf.predict(x_test)
pred_index = list(np.argmax(pred, axis=1))
y_index = list(y_test.to_numpy().argmax(axis=1))

f1_micro, f1_macro = [
    f1_score(y_index, pred_index, average=averaging)
    for averaging in ('micro', 'macro')
]

# One score per line: micro first, then macro.
print(f1_micro, f1_macro, sep="\n")

0.9987812395676756
0.8709402358871187


In [17]:
# Build the test-set confusion matrix from the class indices computed in the
# previous cell (`pred_index`, `y_index`).
# Labels come from y_test itself, in its real column order: the indices were
# produced by argmax over those columns, so sorting the names (or reading
# them from another frame) is only correct by coincidence.
atktypes = list(y_test.columns)
index_to_label = dict(enumerate(atktypes))
pred_series = pd.Series(pred_index, name="Pred").map(index_to_label)
y_series = pd.Series(y_index, name="Actual").map(index_to_label)

# Rows are predicted classes, columns are actual classes; margins=True adds
# the "All" totals. The matrix is also written next to the other run
# artefacts under the shared `model_filename`.
matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix.to_csv(NOTEBOOK_PATH + "Confusion Matrices/test_" + model_filename + ".csv")
matrix

Actual,BENIGN,Botnet,Brute Force,DoS/DDoS,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BENIGN,454378,77,8,50,4,125,3,454645
Botnet,91,307,0,0,0,0,0,398
Brute Force,5,0,2744,0,0,0,1,2750
DoS/DDoS,122,0,0,75844,0,4,2,75972
Infiltration,0,0,0,0,1,0,0,1
PortScan,192,0,0,1,0,31746,0,31939
Web Attack,2,0,0,2,0,1,439,444
All,454790,384,2752,75897,5,31876,445,566149


# Log results

In [18]:
# Append this run's test-set F1 scores to the shared model log. Each entry is
# the model filename followed by two tab-indented score lines.
# The context manager closes the log even if a write raises, so a failure
# cannot leave the file open.
with open(NOTEBOOK_PATH + "model_log.txt", "a") as model_log:
    model_log.write("\n" + model_filename)
    model_log.write("\n\tF1 Micro: " + str(f1_micro))
    model_log.write("\n\tF1 Macro: " + str(f1_macro))