# CSE-CIC-IDS 2017

In [2]:
model_id = "boosting1-dnnae-100e12m"

In [3]:
import numpy as np
np.random.seed(42)
np.set_printoptions(suppress=True)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tqdm import tqdm

import glob, time, os, pickle

import keras

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [4]:
NOTEBOOK_PATH = "C:/Users/Xetrov/Desktop/SciFair20/Code/"

In [5]:
x_scaled = pd.read_csv(NOTEBOOK_PATH + "IDS2017/x_scaled_powertransform.csv")

In [6]:
y_df_enc = pd.read_csv(NOTEBOOK_PATH + "IDS2017/y_all_binary.csv")

# Split data into train and test

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x_train, x_valtest, y_train, y_valtest = train_test_split(x_scaled, y_df_enc, test_size = 0.4, random_state = 42)

In [9]:
x_val, x_test, y_val, y_test = train_test_split(x_valtest, y_valtest, test_size = 0.5, random_state = 42)

In [10]:
del x_train
del y_train
del x_valtest 
del y_valtest

# Oversampling


In [11]:
x_train_res = pd.read_csv(NOTEBOOK_PATH + "IDS2017/x_adasyn_binary.csv")

In [12]:
y_train_res = pd.read_csv(NOTEBOOK_PATH + "IDS2017/y_adasyn_binary.csv")['IsAttack']

# Train model

In [13]:
from keras.models import Model, Sequential
from keras.layers import Dense, BatchNormalization, Dropout, InputLayer
from keras.optimizers import SGD, Adam

In [14]:
def addDenseBlock(model, units, **params):
    model.add(Dense(units=units, activation=params['hidden_activation'], input_dim=x_train_res.shape[1]))
    if params['batch_normalization']: model.add(BatchNormalization())
    if params['dropout_rate'] > 0: model.add(Dropout(params['dropout_rate']))

def createModel(**in_params):
    """
    Supported parameters:
    batch_normalization - True or False
    dropout_rate - 0 to 1
    learning_rate - float
    hidden_activation - string
    final_activation - string
    """
    
    model = Sequential()

    # Set default values
    params = {
        'batch_normalization': True,
        'dropout_rate': 0,
        'learning_rate': 0.001,
        'hidden_activation': 'relu',
        'final_activation': 'sigmoid'
    }
    
    # Replace defaults with specified parameters
    for param in in_params:
        params[param] = in_params[param]    

    addDenseBlock(model, 256, **params)
    addDenseBlock(model, 128, **params)
    addDenseBlock(model, 64, **params)
    addDenseBlock(model, 32, **params)
    addDenseBlock(model, 64, **params)
    addDenseBlock(model, 128, **params)
    addDenseBlock(model, 256, **params)

    model.add(Dense(units=1, activation=params['final_activation']))

    # optim = SGD(lr=0.001, nesterov=True)
    optim = Adam(lr=params['learning_rate'])

    model.compile(loss='binary_crossentropy', optimizer=optim, metrics=['accuracy'])
    
#     print(params)
    
    return model

### Callbacks

In [15]:
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint

def timestamp():
    return time.strftime("%m-%d-%Y, %I%M%p")

# csv_callback = CSVLogger(NOTEBOOK_PATH + 'Loss Logs/%s (%s)' % (model_id, timestamp()), append=True)
# early_stop = EarlyStopping(monitor = 'val_loss', min_delta=0.0001, patience=5)

savedir = NOTEBOOK_PATH + "Models/%s/" % model_id
if not os.path.exists(savedir):
    os.makedirs(savedir)
    print("Created " + savedir)
else:
    print("Using " + savedir)

Created C:/Users/Xetrov/Desktop/SciFair20/Code/Models/boosting1-dnnae-100e12m/


### Training

In [16]:
import itertools

trainEpochs = 100
numModels = 12

parameters = {'batch_size': 1000, 'batch_normalization':True,'dropout_rate':0, 'final_activation':'sigmoid'}

In [17]:
%%time
print(model_id)
model = createModel(**parameters)
model.summary()
# model_checkpoint = ModelCheckpoint(NOTEBOOK_PATH + "Models/%s/dnn({epoch}).h5" % (model_id), monitor='val_loss')

boosting1-dnnae-100e12m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               17664     
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
batch_normalization_2 (Batch (None, 128)               512       
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
batch_normalization_3 (Batch (None, 64)                256       
_________________________________________________________________
dense_4 (Dense)              (None, 32)             

In [18]:
modelsk = tf.keras.wrappers.scikit_learn.KerasClassifier(lambda: model, epochs=trainEpochs, batch_size=parameters['batch_size'], verbose=2)

In [19]:
from sklearn.ensemble import AdaBoostClassifier

In [22]:
clf = AdaBoostClassifier(modelsk, n_estimators=numModels, random_state=42)

In [23]:
clf.fit(x_train_res, y_train_res)

ValueError: KerasClassifier doesn't support sample_weight.

# Save model

In [28]:
model_id

'bagging1-dnnae-10e90m'

In [29]:
for i in range(len(bagclf.estimators_)):
    bagclf.estimators_[i].model.save(f"{NOTEBOOK_PATH}Models/{model_id}/dnn({i}).h5")

# View confusion matrix

In [34]:
bagclf.n_jobs = 1

In [35]:
print("Predicting...")
pred = bagclf.predict(x_val)

print("Plotting...")
pred_series = pd.Series(pred.round().astype('int').ravel(), name="Pred").replace({0: 'Benign', 1: 'Attack'})
y_series = pd.Series(y_val.to_numpy().ravel(), name="Actual").replace({0: 'Benign', 1: 'Attack'})

matrix = pd.crosstab(pred_series, y_series, margins=True)
print("Done!")
matrix

Predicting...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min finished


Plotting...
Done!


Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,110899,1257,112156
Benign,1043,452950,453993
All,111942,454207,566149


# F1 Score

In [36]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [37]:
pred = bagclf.predict(x_test).round().astype('int').ravel()
y_test_npy = y_test.to_numpy().ravel()

precision = precision_score(y_test_npy, pred)
print("Precision:", precision)

recall = recall_score(y_test_npy, pred)
print("Recall:", recall)

f1 = f1_score(y_test_npy, pred)
print("F1:", f1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min finished


Precision: 0.9891849234604658
Recall: 0.9905351161558563
F1: 0.9898595593843945


In [38]:
pred_series = pd.Series(pred, name="Pred").replace({0: 'Benign', 1: 'Attack'})
y_series = pd.Series(y_test.to_numpy().ravel(), name="Actual").replace({0: 'Benign', 1: 'Attack'})

matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,Attack,Benign,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attack,110305,1206,111511
Benign,1054,453584,454638
All,111359,454790,566149
