# CSE-CIC-IDS 2017 Naive Bayes

In [1]:
model_id = "bagging2-naivebayes"

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import glob

In [3]:
NOTEBOOK_PATH = "C:/Users/Xetrov/Desktop/SciFair20/Code/"

In [4]:
x_scaled = pd.read_csv(NOTEBOOK_PATH + "IDS2017/x_scaled_powertransform.csv")

y_df_enc = pd.read_csv(NOTEBOOK_PATH + "IDS2017/y_all_binary.csv")

# Split data into train and test

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
x_train, x_valtest, y_train, y_valtest = train_test_split(x_scaled, y_df_enc, test_size = 0.4, random_state = 42)

In [7]:
x_val, x_test, y_val, y_test = train_test_split(x_valtest, y_valtest, test_size = 0.5, random_state = 42)

In [8]:
del x_train
del y_train
del x_valtest 
del y_valtest

In [9]:
print("\nValidation:")
print(y_val['IsAttack'].value_counts())
print("\nTest:")
print(y_test['IsAttack'].value_counts())


Validation:
0    454207
1    111942
Name: IsAttack, dtype: int64

Test:
0    454790
1    111359
Name: IsAttack, dtype: int64


# ADASYN

In [10]:
x_train_res = pd.read_csv(NOTEBOOK_PATH + "IDS2017/x_adasyn_binary.csv")

In [11]:
y_train_res = pd.read_csv(NOTEBOOK_PATH + "IDS2017/y_adasyn_binary.csv")['IsAttack']

# Train model

In [12]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.ensemble import BaggingClassifier

In [13]:
nb = GaussianNB()

In [14]:
bagclf = BaggingClassifier(nb, n_estimators=8100, n_jobs=1, verbose=3)

In [15]:
%%time
bagclf.fit(x_train_res, y_train_res)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Building estimator 1 of 8100 for this parallel run (total 8100)...
Building estimator 2 of 8100 for this parallel run (total 8100)...
Building estimator 3 of 8100 for this parallel run (total 8100)...
Building estimator 4 of 8100 for this parallel run (total 8100)...
Building estimator 5 of 8100 for this parallel run (total 8100)...
Building estimator 6 of 8100 for this parallel run (total 8100)...
Building estimator 7 of 8100 for this parallel run (total 8100)...
Building estimator 8 of 8100 for this parallel run (total 8100)...
Building estimator 9 of 8100 for this parallel run (total 8100)...
Building estimator 10 of 8100 for this parallel run (total 8100)...
Building estimator 11 of 8100 for this parallel run (total 8100)...
Building estimator 12 of 8100 for this parallel run (total 8100)...
Building estimator 13 of 8100 for this parallel run (total 8100)...
Building estimator 14 of 8100 for this parallel run (total 8100)...
Building estimator 15 of 8100 for this parallel run (tota

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 822.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 822.0min finished


Wall time: 13h 42min 1s


BaggingClassifier(base_estimator=GaussianNB(priors=None, var_smoothing=1e-09),
                  bootstrap=True, bootstrap_features=False, max_features=1.0,
                  max_samples=1.0, n_estimators=8100, n_jobs=1, oob_score=False,
                  random_state=None, verbose=3, warm_start=False)

**Save model**

In [16]:
import pickle

In [17]:
import time

model_filename = model_id  + " [" + time.strftime("%Y%m%d %H%M") + "]"

In [18]:
save_file = open(NOTEBOOK_PATH + "Models/" + model_filename + ".pkl", "wb")
saved_model = pickle.dump(bagclf, save_file)
save_file.close()

**Test model**

In [13]:
pred = bagclf.predict(x_val)

In [14]:
pred_series = pd.Series(pred, name="Pred").replace({0: 'Benign', 1: 'Attack'})
y_series = pd.Series(y_val.to_numpy().ravel(), name="Actual").replace({0: 'Benign', 1: 'Attack'})

matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,110121,386758,496879
Benign,1821,67449,69270
All,111942,454207,566149


# Test Set

In [19]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [20]:
pred = bagclf.predict(x_test)
y_test_npy = y_test.to_numpy().ravel()

precision = precision_score(y_test_npy, pred)
print("Precision:", precision)

recall = recall_score(y_test_npy, pred)
print("Recall:", recall)

f1 = f1_score(y_test_npy, pred)
print("F1:", f1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 138.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 138.8min finished


Precision: 0.22036438093858865
Recall: 0.9838360617462442
F1: 0.3600769722495649


In [22]:
f1

0.3600769722495649

In [23]:
pred_series = pd.Series(pred, name="Pred").replace({0: 'Benign', 1: 'Attack'})
y_series = pd.Series(y_test.to_numpy().ravel(), name="Actual").replace({0: 'Benign', 1: 'Attack'})

matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,109559,387613,497172
Benign,1800,67177,68977
All,111359,454790,566149


# Log results

In [22]:
model_log = open(NOTEBOOK_PATH + "model_log.txt", "a")

model_log.write("\n" + model_filename)
model_log.write("\n\tF1 Micro: " + str(f1_micro))
model_log.write("\n\tF1 Macro: " + str(f1_macro))

model_log.close()