# CSE-CIC-IDS 2017

In [1]:
model_id = "cnnfeatures1"

In [2]:
import numpy as np
np.random.seed(42)
np.set_printoptions(suppress=True)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import glob, time, os

import keras

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
NOTEBOOK_PATH = "D:/Delta Stuff/Scifair20/"

In [4]:
x_scaled = pd.read_csv(NOTEBOOK_PATH + "IDS2017/x_scaled_powertransform.csv")

In [5]:
y_df_enc = pd.read_csv(NOTEBOOK_PATH + "IDS2017/y_1henc.csv")

# Reshape into images

In [12]:
x_img = x_scaled.to_numpy().reshape((len(x_scaled), 4, 17))

In [33]:
x_img = np.expand_dims(x_img, -1)

# Split data into train and test

In [6]:
from sklearn.model_selection import train_test_split

In [34]:
x_train, x_val, y_train, y_val = train_test_split(x_img, y_df_enc, test_size = 0.1, random_state = 42)

In [16]:
y_train.sum(axis=0)

BENIGN             2045803
Bot                   1772
Brute Force FTP       7113
Brute Force SSH       5326
DDoS                115176
DoS                 227435
Heartbleed               9
Infiltration            33
PortScan            143020
Web Attack            1981
dtype: int64

In [17]:
y_val.sum(axis=0)

BENIGN             227294
Bot                   194
Brute Force FTP       825
Brute Force SSH       571
DDoS                12851
DoS                 25226
Heartbleed              2
Infiltration            3
PortScan            15910
Web Attack            199
dtype: int64

# Train model

In [44]:
from keras.models import Model, Sequential
from keras.layers import Dense, BatchNormalization, Dropout, InputLayer, Conv2D, Flatten
from keras.optimizers import SGD, Adam

In [45]:
def addDenseBlock(model, units, **params):
    model.add(Dense(units=units, activation=params['hidden_activation'], input_dim=x_train.shape[1]))
    if params['batch_normalization']: model.add(BatchNormalization())
    if params['dropout_rate'] > 0: model.add(Dropout(params['dropout_rate']))

def addConvBlock(model, units, **params):
    model.add(Conv2D(filters=units, kernel_size=params['kernel_size'], activation=params['hidden_activation'], padding=params['padding'],input_shape=(4,17,1)))
    if params['batch_normalization']: model.add(BatchNormalization())
    if params['dropout_rate'] > 0: model.add(Dropout(params['dropout_rate']))
        
def createModel(**in_params):
    """
    Supported parameters:
    batch_normalization - True or False
    dropout_rate - 0 to 1
    num_units - integer
    learning_rate - float
    activation_function - string
    """
    
    model = Sequential()

    # Set default values
    params = {
        'batch_normalization': False,
        'dropout_rate': 0,
        'num_layers': 6,
        'num_units': 128,
        'learning_rate': 0.001,
        'hidden_activation': 'relu',
        'final_activation': 'softmax',
        'kernel_size': (2,2),
        'padding':'same'
    }
    
    # Replace defaults with specified parameters
    for param in in_params:
        params[param] = in_params[param]    
    
    # InputLayer causes serialization issues
#     model.add( InputLayer(input_shape = (x_train.shape[1],) ) )
    
#     for i in range(params['num_layers']):
#         addDenseBlock(model, params['num_units'], **params)

    addConvBlock(model, 64, **params)
    addConvBlock(model, 128, **params)
    addConvBlock(model, 256, **params)
    addConvBlock(model, 128, **params)
    addConvBlock(model, 64, **params)
    
    model.add(Flatten())

    model.add(Dense(units=y_train.shape[1], activation=params['final_activation']))

    # optim = SGD(lr=0.001, nesterov=True)
    optim = Adam(lr=params['learning_rate'])

    model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])
    
#     print(params)
    
    return model

Ensure a GPU is available

In [21]:
import keras.backend as K

K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

### Callbacks

In [22]:
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint

def timestamp():
    return time.strftime("%m-%d-%Y, %I%M%p")

csv_callback = CSVLogger(NOTEBOOK_PATH + 'Loss Logs/%s (%s)' % (model_id, timestamp()), append=True)
early_stop = EarlyStopping(monitor = 'val_loss', min_delta=0.0001, patience=5)

### Model Saving

In [23]:
savedir = NOTEBOOK_PATH + "Models/%s/" % model_id
if not os.path.exists(savedir):
    os.makedirs(savedir)
    print("Created " + savedir)
else:
    print("Using " + savedir)

Created D:/Delta Stuff/Scifair20/Models/cnnfeatures1/


### Training

In [47]:
import itertools

trainEpochs = 10

parameters = {'batch_size': 250, 'num_layers':5, 'num_units':256, 'dropout_rate':0}

In [48]:
%%time
print(model_id)
model = createModel(**parameters)
model.summary()
model_checkpoint = ModelCheckpoint(NOTEBOOK_PATH + "Models/%s/cnn({epoch}).h5" % (model_id), monitor='val_loss')
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=parameters['batch_size'], epochs=trainEpochs, verbose=2, callbacks=[csv_callback, model_checkpoint])

cnnfeatures1
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_21 (Conv2D)           (None, 4, 17, 64)         320       
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 4, 17, 128)        32896     
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 4, 17, 256)        131328    
_________________________________________________________________
conv2d_24 (Conv2D)           (None, 4, 17, 128)        131200    
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 4, 17, 64)         32832     
_________________________________________________________________
flatten_2 (Flatten)          (None, 4352)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                43530   

<keras.callbacks.History at 0x21111bc6048>

In [43]:
y_train.shape

(2547668, 10)

# F1 Score

In [49]:
from sklearn.metrics import f1_score

In [50]:
model_names = glob.glob(NOTEBOOK_PATH + f"Models/{model_id}/*.h5")
# print(model_names)
print(len(model_names))

10


In [51]:
f1_micro = np.zeros((len(model_names)))
f1_macro = np.zeros((len(model_names)))
for i in tqdm(range(len(model_names))):
    epoch_model = keras.models.load_model(model_names[i])
    pred = epoch_model.predict(x_val).argmax(axis=1)
    pred_f1 = np.zeros((len(x_val), y_val.shape[1]))
    for j in range(len(pred)):
        pred_f1[j,pred[j]] = 1
    f1_micro[i-1] = f1_score(y_val, pred_f1.astype('uint8'), average='micro')
    f1_macro[i-1] = f1_score(y_val, pred_f1.astype('uint8'), average='macro')

  'precision', 'predicted', average, warn_for)
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:52<00:00, 23.23s/it]


In [52]:
print("F1 Micro")
print(model_names[f1_micro.argmax()])
print(f1_micro.max())

print("F1 Macro")
print(model_names[f1_macro.argmax()])
print(f1_macro.max())

F1 Micro
D:/Delta Stuff/Scifair20/Models/cnnfeatures1\cnn(7).h5
0.9984597721451912
F1 Macro
D:/Delta Stuff/Scifair20/Models/cnnfeatures1\cnn(1).h5
0.7622687713179819


In [53]:
model = keras.models.load_model("D:/Delta Stuff/Scifair20/Models/cnnfeatures1\cnn(1).h5")
pred = model.predict(x_val)

pred_index = list(pred.argmax(1))
y_index = list(np.argmax(y_val.to_numpy(), axis=1))

atktypes = sorted(y_val.columns)
pred_series = pd.Series(pred_index, name="Pred").replace(dict(enumerate(atktypes)))
y_series = pd.Series(y_index, name="Actual").replace(dict(enumerate(atktypes)))

matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,BENIGN,Bot,Brute Force FTP,Brute Force SSH,DDoS,DoS,Heartbleed,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BENIGN,227032,127,1,28,10,510,2,3,9,6,227728
Bot,0,67,0,0,0,0,0,0,0,0,67
Brute Force FTP,0,0,822,1,0,0,0,0,0,0,823
Brute Force SSH,6,0,1,537,0,0,0,0,0,10,554
DDoS,1,0,0,0,12841,0,0,0,0,0,12842
DoS,159,0,1,5,0,24716,0,0,6,1,24888
PortScan,91,0,0,0,0,0,0,0,15895,0,15986
Web Attack,5,0,0,0,0,0,0,0,0,182,187
All,227294,194,825,571,12851,25226,2,3,15910,199,283075


In [56]:
model = keras.models.load_model("D:/Delta Stuff/Scifair20/Models/dnn1\dnn(17).h5")
pred = model.predict(x_val)

pred_index = list(pred.argmax(1))
y_index = list(np.argmax(y_val.to_numpy(), axis=1))

atktypes = sorted(y_val.columns)
pred_series = pd.Series(pred_index, name="Pred").replace(dict(enumerate(atktypes)))
y_series = pd.Series(y_index, name="Actual").replace(dict(enumerate(atktypes)))

matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

ValueError: Error when checking input: expected dense_15_input to have 2 dimensions, but got array with shape (283075, 4, 17, 1)

# Log results

In [55]:
model_log = open(NOTEBOOK_PATH + "model_log.txt", "a")

model_log.write("\n" + model_id)
model_log.write("\n\tF1 Micro: " + str(f1_micro))
model_log.write("\n\tF1 Macro: " + str(f1_macro))

model_log.close()