# CIC-IDS2017: multiclass intrusion detection with AdaBoost

In [2]:
# Identifier used to build the saved-model filename and the model_log.txt entry.
# The estimator trained in this notebook is an AdaBoostClassifier over depth-3
# decision trees, so the id names that model rather than gradient-boosted trees.
model_id = "adaboost-multiclass-max_depth3"

In [3]:
import numpy as np
np.random.seed(42)
np.set_printoptions(suppress=True)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import glob, pickle, time, sklearn

In [4]:
# Print the scikit-learn version this notebook is running against.
print(sklearn.__version__)


0.21.3


In [5]:
import os

# Root folder that holds the IDS2017 data, the Models directory and model_log.txt.
# Set the SCIFAIR_NOTEBOOK_PATH environment variable to run on another machine;
# when it is unset the original location is used.
NOTEBOOK_PATH = os.environ.get(
    "SCIFAIR_NOTEBOOK_PATH", "C:/Users/Xetrov/Desktop/SciFair20/Code/"
)
# Later cells build file paths by plain string concatenation, so the root must
# end with a path separator.
if not NOTEBOOK_PATH.endswith(("/", "\\")):
    NOTEBOOK_PATH += "/"

In [6]:
# Read the feature table and the label table from the IDS2017 folder, in that
# order (file names suggest power-transformed features and one-hot labels).
x_scaled, y_df_enc = map(
    pd.read_csv,
    (
        NOTEBOOK_PATH + "IDS2017/x_scaled_powertransform.csv",
        NOTEBOOK_PATH + "IDS2017/y_grouped_1henc.csv",
    ),
)

# Split data into train and test

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# First split: keep 60% for training and hold out 40% for validation + test.
x_train, x_valtest, y_train, y_valtest = train_test_split(
    x_scaled,
    y_df_enc,
    test_size=0.4,
    random_state=42,
)

In [9]:
# Second split: divide the held-out 40% evenly into validation and test halves.
x_val, x_test, y_val, y_test = train_test_split(
    x_valtest,
    y_valtest,
    test_size=0.5,
    random_state=42,
)

In [10]:
# Release frames that are no longer needed: the model is fitted on
# x_train_res / y_train_res instead of this training partition, and the
# combined val+test frames have already been split.
del x_train, y_train, x_valtest, y_valtest

# Load ADASYN-resampled training data

In [11]:
# Training features read from x_adasyn.csv, converted to a NumPy array.
x_train_res = (
    pd.read_csv(NOTEBOOK_PATH + "IDS2017/x_adasyn.csv")
    .to_numpy()
)

In [12]:
# Training labels read from y_adasyn.csv; still a DataFrame at this point.
y_train_res = pd.read_csv(NOTEBOOK_PATH + "IDS2017/y_adasyn.csv")

In [13]:
# Reduce the label frame to a 1-D NumPy array taken from its first column.
# The isinstance guard keeps this cell safe to re-run: after the first run
# y_train_res is already an ndarray, which has no `.columns` attribute.
if isinstance(y_train_res, pd.DataFrame):
    y_train_res = y_train_res[y_train_res.columns[0]].to_numpy()

# Train model

In [1]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [14]:
# AdaBoost ensemble: 50 boosting rounds over depth-3 decision trees, seeded for
# repeatable fits.
clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=50,
    random_state=42,
)

In [15]:
%%time
# Fit the classifier on the training arrays read from the ADASYN CSV files.
# Slow cell: the recorded run below reports a wall time of about 1.5 hours.
clf.fit(x_train_res, y_train_res)

Wall time: 1h 31min 24s


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

**Save model**

In [16]:
import pickle

In [17]:
import time

# File stem for this run: the model id followed by the current local time in
# square brackets, formatted as "YYYYMMDD HHMM".
model_filename = "".join([model_id, " [", time.strftime("%Y%m%d %H%M"), "]"])

In [19]:
# Pickle the fitted classifier into the Models folder. The context manager
# closes the file even if pickling fails part-way. pickle.dump returns None,
# so its result is not kept.
# NOTE: only unpickle model files from a trusted source; loading a pickle can
# execute arbitrary code.
with open(NOTEBOOK_PATH + "Models/" + model_filename + ".pkl", "wb") as save_file:
    pickle.dump(clf, save_file)

**Evaluate on the validation split**

In [20]:
# Predict a class label for every row of the validation split.
pred = clf.predict(x_val)

In [21]:
# Confusion matrix for the validation split (rows: predicted, columns: actual).
pred_index = pred
# Integer class index per row = position of the largest value in that row of y_val.
y_index = list(np.argmax(y_val.to_numpy(), axis=1))

# argmax index i refers to y_val.columns[i], so class names are taken in column
# order. Sorting them would mislabel classes whenever the CSV columns are not
# already alphabetical. Predictions are decoded with the same table because the
# notebook compares them directly against these indices when computing F1.
atktypes = list(y_val.columns)
pred_series = pd.Series(pred_index, name="Pred").replace(dict(enumerate(atktypes)))
y_series = pd.Series(y_index, name="Actual").replace(dict(enumerate(atktypes)))

# margins=True appends the "All" row and column totals.
matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,BENIGN,Botnet,Brute Force,DoS/DDoS,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BENIGN,443441,243,1429,15350,0,15659,23,476145
Botnet,1452,154,0,0,0,0,0,1606
Brute Force,4,0,1380,0,0,0,0,1384
DoS/DDoS,6833,0,0,60894,0,39,34,67800
Infiltration,0,0,0,0,7,0,0,7
PortScan,2256,0,1,7,0,16088,1,18353
Web Attack,221,0,0,259,0,1,373,854
All,454207,397,2810,76510,7,31787,431,566149


# F1 score on the test split

In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [23]:
# Predict a class label for every row of the test split. This overwrites the
# validation predictions held in `pred`.
pred = clf.predict(x_test)
# Plain Python list copy of the predictions, used for scoring below.
pred_index = list(pred)

In [24]:
# Integer class index per row = position of the largest value in that row of y_test.
y_index = list(y_test.to_numpy().argmax(axis=1))

# Micro averaging pools every sample; macro averaging weights each class equally.
f1_micro, f1_macro = (
    f1_score(y_index, pred_index, average=averaging)
    for averaging in ("micro", "macro")
)

print(f1_micro, f1_macro, sep="\n")

0.922278410807049
0.6763801853945411


In [25]:
# Confusion matrix for the test split (rows: predicted, columns: actual).
# y_index holds argmax positions into y_test's columns, so class names are read
# from y_test in column order. Sorting them would mislabel classes whenever the
# CSV columns are not already alphabetical. Predictions are decoded with the
# same table because they are scored directly against these indices.
atktypes = list(y_test.columns)
pred_series = pd.Series(pred_index, name="Pred").replace(dict(enumerate(atktypes)))
y_series = pd.Series(y_index, name="Actual").replace(dict(enumerate(atktypes)))

# margins=True appends the "All" row and column totals.
matrix = pd.crosstab(pred_series, y_series, margins=True)
# matrix.to_csv(NOTEBOOK_PATH + "Confusion Matrices/test_" + model_filename + ".csv")
matrix

Actual,BENIGN,Botnet,Brute Force,DoS/DDoS,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BENIGN,443923,244,1384,15258,1,15878,28,476716
Botnet,1426,140,0,0,0,0,0,1566
Brute Force,4,0,1368,0,0,0,0,1372
DoS/DDoS,6969,0,0,60374,0,49,27,67419
Infiltration,0,0,0,0,4,0,0,4
PortScan,2267,0,0,16,0,15948,0,18231
Web Attack,201,0,0,249,0,1,390,841
All,454790,384,2752,75897,5,31876,445,566149


# Log results

In [72]:
# Append this run's name and test-split F1 scores to the shared log file.
# The context manager closes the file even if one of the writes fails.
with open(NOTEBOOK_PATH + "model_log.txt", "a") as model_log:
    model_log.write("\n" + model_filename)
    model_log.write("\n\tF1 Micro: " + str(f1_micro))
    model_log.write("\n\tF1 Macro: " + str(f1_macro))