# CSE-CIC-IDS 2017 Ensembles - Voting Classifier

In [1]:
model_id = "voting1-rf-dnn-knn"

In [18]:
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import tensorflow as tf

import glob, pickle, time, keras

In [3]:
NOTEBOOK_PATH = "C:/Users/Xetrov/Desktop/SciFair20/Code/"

In [4]:
x_scaled = pd.read_csv(NOTEBOOK_PATH + "IDS2017/x_scaled_powertransform.csv")

y_df_enc = pd.read_csv(NOTEBOOK_PATH + "IDS2017/y_all_binary.csv")

# Split data into train and test

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
x_train, x_valtest, y_train, y_valtest = train_test_split(x_scaled, y_df_enc, test_size = 0.4, random_state = 42)

In [7]:
x_val, x_test, y_val, y_test = train_test_split(x_valtest, y_valtest, test_size = 0.5, random_state = 42)

In [8]:
del x_valtest 
del y_valtest

# Ensemble Models
*Avengers, ensemble!*

In [9]:
from sklearn.ensemble import VotingClassifier

In [10]:
rf = pickle.load(open(NOTEBOOK_PATH + "Models/randomforest5-binary [20191027 1909].pkl", "rb"))

In [11]:
knn = pickle.load(open(NOTEBOOK_PATH + "Models/knn-distance-binary [20191027 2128].pkl", "rb"))

In [13]:
dnn = keras.models.load_model(NOTEBOOK_PATH + "Models/dnn7-autoencodershape/dnn(441).h5")

In [19]:
dnnsk = tf.keras.wrappers.scikit_learn.KerasClassifier(lambda: dnn)

In [34]:
class CustomVotingClassifier(object):
    """ Implements a voting classifier for pre-trained classifiers"""
    def __init__(self, estimators):
        if type(estimators) is list:
            self.names = None
            self.estimators = estimators
        elif type(estimators) is dict:
            self.names = list(estimators.keys())
            self.estimators = list(estimators.values())
        else:
            raise TypeError("Must pass a List or Dictionary")

    def predict(self, X):
        # get values
        Y = np.zeros([X.shape[0], len(self.estimators)], dtype=int)
        for i, clf in enumerate(self.estimators):
            if self.names is not None: print(f"Predicting with {self.names[i]}...")
            Y[:, i] = np.squeeze(clf.predict(X))
        # apply voting 
        y = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            y[i] = np.argmax(np.bincount(Y[i,:]))
        return y

In [38]:
ensemble = CustomVotingClassifier({"Random Forest": rf, "Dense Neural Network": dnn, "K-Nearest Neighbors": knn})

**Validate Model**

In [36]:
pred = ensemble.predict(x_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:   16.0s finished


In [37]:
pred_series = pd.Series(pred, name="Pred").replace({0: 'Benign', 1: 'Attack'})
y_series = pd.Series(y_val.to_numpy().ravel(), name="Actual").replace({0: 'Benign', 1: 'Attack'})

matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,111764,468,112232
Benign,178,453739,453917
All,111942,454207,566149


# Test Set

In [39]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [42]:
pred = ensemble.predict(x_test)
y_test_npy = y_test.to_numpy().ravel()

precision = precision_score(y_test_npy, pred)
print("Precision:", precision)

recall = recall_score(y_test_npy, pred)
print("Recall:", recall)

f1 = f1_score(y_test_npy, pred)
print("F1:", f1)

Predicting with Random Forest...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:   16.2s finished


Predicting with Dense Neural Network...
Predicting with K-Nearest Neighbors...
Precision: 0.9959698364649197
Recall: 0.9986440251798238
F1: 0.9973051381733231


In [43]:
pred_series = pd.Series(pred, name="Pred").replace({0: 'Benign', 1: 'Attack'})
y_series = pd.Series(y_test.to_numpy().ravel(), name="Actual").replace({0: 'Benign', 1: 'Attack'})

matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,111208,450,111658
Benign,151,454340,454491
All,111359,454790,566149


# Log results

In [22]:
model_log = open(NOTEBOOK_PATH + "model_log.txt", "a")

model_log.write("\n" + model_filename)
model_log.write("\n\tF1 Micro: " + str(f1_micro))
model_log.write("\n\tF1 Macro: " + str(f1_macro))

model_log.close()