# CIC-IDS2017: gradient boosted trees (binary attack detection)

In [1]:
# Short identifier for this model; a later cell uses it as the prefix of the
# saved model's file name.
model_id = "gradientboostedtrees1"

In [2]:
import numpy as np
# Seed NumPy's global random state for repeatable runs (the scikit-learn
# calls in this notebook additionally pass random_state=42 explicitly).
np.random.seed(42)
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import glob

In [6]:
import os

# Root folder that holds the IDS2017/ data, the Models/ output directory and
# model_log.txt. A single user's absolute Windows path makes the notebook
# unrunnable on any other machine, so the SCIFAIR_NOTEBOOK_PATH environment
# variable may override it; the original location remains the default.
NOTEBOOK_PATH = (
    os.environ.get("SCIFAIR_NOTEBOOK_PATH")
    or "C:/Users/Xetrov/Desktop/SciFair20/Code/"
)

# Other cells build file paths by plain string concatenation, so the value
# must end with a path separator.
if not NOTEBOOK_PATH.endswith(("/", "\\")):
    NOTEBOOK_PATH += "/"

In [7]:
# Full feature table (file name suggests power-transform scaling) and the
# matching binary labels, both read from the IDS2017 data folder.
x_scaled = pd.read_csv(
    NOTEBOOK_PATH + "IDS2017/x_scaled_powertransform.csv"
)
y_df_enc = pd.read_csv(
    NOTEBOOK_PATH + "IDS2017/y_all_binary.csv"
)

# Split data into train, validation, and test sets

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Keep 60% of the rows as the training split and hold out the remaining 40%;
# the hold-out is divided into validation and test in the following cell.
x_train, x_valtest, y_train, y_valtest = train_test_split(
    x_scaled,
    y_df_enc,
    test_size=0.4,
    random_state=42,
)

In [10]:
# Halve the 40% hold-out into equally sized validation and test splits.
x_val, x_test, y_val, y_test = train_test_split(
    x_valtest,
    y_valtest,
    test_size=0.5,
    random_state=42,
)

In [11]:
# Training uses the ADASYN-resampled CSVs loaded further down, so the raw
# training split and the intermediate hold-out are dropped here
# (presumably to release memory).
del x_train, y_train
del x_valtest, y_valtest

# Load ADASYN-resampled training data

In [12]:
# Training features from the pre-computed ADASYN CSV.
x_train_res = pd.read_csv(
    NOTEBOOK_PATH + "IDS2017/x_adasyn_binary.csv"
)

In [13]:
# Training labels from the pre-computed ADASYN CSV; only the 'IsAttack'
# column is kept, so the result is a Series rather than a DataFrame.
y_train_res = pd.read_csv(
    NOTEBOOK_PATH + "IDS2017/y_adasyn_binary.csv"
)['IsAttack']

# Train model

In [14]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [27]:
%%time
clf = GradientBoostingClassifier(n_estimators=300, random_state=42, verbose=2).fit(x_train_res, y_train_res)

      Iter       Train Loss   Remaining Time 
         1           1.2491           62.23m
         2           1.1369           62.82m
         3           1.0437           62.34m
         4           0.9535           61.88m
         5           0.8887           61.52m
         6           0.8288           61.25m
         7           0.7753           60.73m
         8           0.7298           60.33m
         9           0.6904           59.93m
        10           0.6523           59.74m
        11           0.6119           59.52m
        12           0.5751           59.34m
        13           0.5512           59.21m
        14           0.5224           58.99m
        15           0.4934           58.71m
        16           0.4779           58.57m
        17           0.4546           58.30m
        18           0.4350           58.09m
        19           0.4155           57.90m
        20           0.3986           57.64m
        21           0.3858           57.47m
        2

In [37]:
import copy

In [38]:
# Continue training on a deep copy so the fitted 300-tree `clf` stays intact.
clf2 = copy.deepcopy(clf)

In [39]:
# With warm_start enabled, the next fit() keeps the trees that are already
# fitted and only adds new ones (the training log resumes at iteration 301).
clf2.warm_start = True

In [40]:
# Raise the target ensemble size from 300 to 1000 trees.
clf2.n_estimators = 1000

In [41]:
# Resume boosting on the same training data until the ensemble reaches
# n_estimators trees. As the cell's last expression, the fitted estimator's
# repr is displayed.
clf2.fit(x_train_res, y_train_res)

      Iter       Train Loss   Remaining Time 
       301           0.0462          192.44m
       302           0.0461          191.94m
       303           0.0459          195.20m
       304           0.0458          195.01m
       305           0.0457          194.18m
       306           0.0455          194.00m
       307           0.0455          192.93m
       308           0.0454          192.39m
       309           0.0454          191.18m
       310           0.0452          191.47m
       311           0.0452          190.69m
       312           0.0451          190.93m
       313           0.0449          190.65m
       314           0.0448          190.39m
       315           0.0447          190.12m
       316           0.0445          190.17m
       317           0.0442          190.43m
       318           0.0441          190.12m
       319           0.0440          190.08m
       320           0.0437          190.10m
       321           0.0436          190.12m
       32

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=2, warm_start=True)

**Save model**

In [42]:
import pickle

In [43]:
import time

# File stem of the form "<model_id> [YYYYMMDD HHMM]", stamped with the local
# time at which this cell runs.
model_filename = "".join((model_id, " [", time.strftime("%Y%m%d %H%M"), "]"))

In [54]:
# Persist the extended model to Models/<model_filename>.pkl.
# The context manager closes the file even when pickling raises. The former
# `saved_model = pickle.dump(...)` binding is dropped because pickle.dump
# always returns None, so the name never held a model.
with open(NOTEBOOK_PATH + "Models/" + model_filename + ".pkl", "wb") as save_file:
    pickle.dump(clf2, save_file)

**Test model**

In [50]:
# Class predictions for the validation split. Note that `pred` is reassigned
# to the test-set predictions in a later cell.
pred = clf2.predict(x_val)

In [51]:
# Validation confusion matrix: rows are predictions, columns are true labels.
# The 0/1 class codes are swapped for readable names first.
pred_series = (
    pd.Series(pred, name="Pred")
    .replace({0: 'Benign', 1: 'Attack'})
)
y_series = (
    pd.Series(y_val.to_numpy().ravel(), name="Actual")
    .replace({0: 'Benign', 1: 'Attack'})
)

# margins=True appends the "All" totals row and column.
matrix = pd.crosstab(index=pred_series, columns=y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,111931,944,112875
Benign,11,453263,453274
All,111942,454207,566149


# Precision, recall, and F1 score (test set)

In [45]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [52]:
# Score the model on the test split. `pred` now holds the test-set
# predictions, replacing the validation predictions from the earlier cell.
y_test_npy = y_test.to_numpy().ravel()
pred = clf2.predict(x_test)

# Compute all three metrics first, then report them in the same order.
precision = precision_score(y_test_npy, pred)
recall = recall_score(y_test_npy, pred)
f1 = f1_score(y_test_npy, pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Precision: 0.9921674508126604
Recall: 0.9998742804802485
F1: 0.9960059575191317


In [53]:
# Test-set confusion matrix: rows are predictions, columns are true labels.
# The 0/1 class codes are swapped for readable names first.
pred_series = (
    pd.Series(pred, name="Pred")
    .replace({0: 'Benign', 1: 'Attack'})
)
y_series = (
    pd.Series(y_test.to_numpy().ravel(), name="Actual")
    .replace({0: 'Benign', 1: 'Attack'})
)

# margins=True appends the "All" totals row and column.
matrix = pd.crosstab(index=pred_series, columns=y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,111345,879,112224
Benign,14,453911,453925
All,111359,454790,566149


# Log results

In [72]:
# f1_micro / f1_macro were never assigned anywhere in this notebook, so this
# cell raised NameError on a fresh kernel (after the log file had already
# been opened). They are computed here from the test-set labels and
# predictions created in the metrics cell.
# NOTE(review): assumes the logged scores are meant to be test-set scores — confirm.
f1_micro = f1_score(y_test_npy, pred, average="micro")
f1_macro = f1_score(y_test_npy, pred, average="macro")

# Append mode preserves the entries of earlier runs; the context manager
# closes the log even if one of the writes fails.
with open(NOTEBOOK_PATH + "model_log.txt", "a") as model_log:
    model_log.write("\n" + model_filename)
    model_log.write("\n\tF1 Micro: " + str(f1_micro))
    model_log.write("\n\tF1 Macro: " + str(f1_macro))