# CSE-CIC-IDS 2017

In [1]:
model_id = "rnn1"

In [4]:
import numpy as np
np.random.seed(42)
np.set_printoptions(suppress=True)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()

import glob, time, os

import keras

In [5]:
NOTEBOOK_PATH = "D:/Delta Stuff/Scifair20/"

In [12]:
x_scaled = pd.read_csv(NOTEBOOK_PATH + "IDS2017/x_scaled_powertransform.csv")
y_df_enc = pd.read_csv(NOTEBOOK_PATH + "IDS2017/y_grouped_1henc.csv")

In [13]:
x_scaled_npy = np.expand_dims(x_scaled.to_numpy(), axis=-1)

In [14]:
x_scaled_npy.shape

(2830743, 68, 1)

# Split data into train and test

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train, x_val, y_train, y_val = train_test_split(x_scaled_npy, y_df_enc, test_size = 0.4, random_state = 42)

In [17]:
y_train.sum(axis=0)

BENIGN          1364100
Botnet             1185
Brute Force        8273
DoS/DDoS         228292
Infiltration         24
PortScan          95267
Web Attack         1304
dtype: int64

In [18]:
y_val.sum(axis=0)

BENIGN          908997
Botnet             781
Brute Force       5562
DoS/DDoS        152407
Infiltration        12
PortScan         63663
Web Attack         876
dtype: int64

# Train model

In [19]:
from keras.models import Model, Sequential
from keras.layers import Dense, LSTM, BatchNormalization, Dropout, InputLayer
from keras.optimizers import SGD, Adam

In [20]:
x_train.shape[1:]

(68, 1)

In [31]:
def addDenseBlock(model, units, **params):
    model.add(Dense(units=units, activation=params['hidden_activation'], input_dim=x_train.shape[1]))
    if params['batch_normalization']: model.add(BatchNormalization())
    if params['dropout_rate'] > 0: model.add(Dropout(params['dropout_rate']))

def addLSTMBlock(model, units, return_sequences=False, **params):
    model.add(LSTM(units=units, return_sequences=return_sequences, input_shape=x_train.shape[1:]))
    if params['batch_normalization']: model.add(BatchNormalization())
    if params['dropout_rate'] > 0: model.add(Dropout(params['dropout_rate']))
        
def createModel(**in_params):
    """
    Supported parameters:
    batch_normalization - True or False
    dropout_rate - 0 to 1
    num_units - integer
    learning_rate - float
    activation_function - string
    """
    
    model = Sequential()

    # Set default values
    params = {
        'batch_normalization': False,
        'dropout_rate': 0,
        'num_layers': 6,
        'num_units': 128,
        'learning_rate': 0.001,
        'hidden_activation': 'relu',
        'final_activation': 'softmax'
    }
    
    # Replace defaults with specified parameters
    for param in in_params:
        params[param] = in_params[param]    
    
    # InputLayer causes serialization issues
#     model.add( InputLayer(input_shape = (x_train.shape[1],) ) )
    
#     for i in range(params['num_layers']):
#         addDenseBlock(model, params['num_units'], **params)

    addLSTMBlock(model, 64, **params)

    model.add(Dense(units=y_train.shape[1], activation=params['final_activation']))

    # optim = SGD(lr=0.001, nesterov=True)
    optim = Adam(lr=params['learning_rate'])

    model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])
    
#     print(params)
    
    return model

Ensure a GPU is available

In [26]:
import keras.backend as K

K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

### Callbacks

In [24]:
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint

def timestamp():
    return time.strftime("%m-%d-%Y, %I%M%p")

csv_callback = CSVLogger(NOTEBOOK_PATH + 'Loss Logs/%s (%s)' % (model_id, timestamp()), append=True)
early_stop = EarlyStopping(monitor = 'val_loss', min_delta=0.0001, patience=5)

### Model Saving

In [25]:
savedir = NOTEBOOK_PATH + "Models/%s/" % model_id
if not os.path.exists(savedir):
    os.makedirs(savedir)
    print("Created " + savedir)
else:
    print("Using " + savedir)

Created D:/Delta Stuff/Scifair20/Models/rnn1/


### Training

In [27]:
import itertools

trainEpochs = 10

parameters = {'batch_size': 250, 'num_layers':5, 'num_units':256, 'dropout_rate':0}

In [32]:
%%time
print(model_id)
model = createModel(**parameters)
model.summary()
model_checkpoint = ModelCheckpoint(NOTEBOOK_PATH + "Models/%s/rnn({epoch}).h5" % (model_id), monitor='val_loss')
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=parameters['batch_size'], epochs=trainEpochs, verbose=2, callbacks=[csv_callback, model_checkpoint])

rnn1
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 64)                16896     
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 455       
Total params: 17,351
Trainable params: 17,351
Non-trainable params: 0
_________________________________________________________________
Train on 1698445 samples, validate on 1132298 samples
Epoch 1/10
 - 769s - loss: 0.1600 - acc: 0.9492 - val_loss: 0.0688 - val_acc: 0.9804
Epoch 2/10
 - 776s - loss: 0.0338 - acc: 0.9904 - val_loss: 0.0210 - val_acc: 0.9941
Epoch 3/10
 - 1059s - loss: 0.0201 - acc: 0.9944 - val_loss: 0.0150 - val_acc: 0.9959
Epoch 4/10
 - 1521s - loss: 0.0165 - acc: 0.9953 - val_loss: 0.0130 - val_acc: 0.9963
Epoch 5/10
 - 917s - loss: 0.0144 - acc: 0.9958 - val_loss: 0.0187 - val_acc: 0.9944
Epoch 6/10
 - 770s - loss: 0.0128 - acc: 0.9963 

<keras.callbacks.History at 0x149fa06ec18>

# F1 Score

In [33]:
from sklearn.metrics import f1_score

In [34]:
model_names = glob.glob(NOTEBOOK_PATH + f"Models/{model_id}/*.h5")
# print(model_names)
print(len(model_names))

10


In [35]:
f1_micro = np.zeros((len(model_names)))
f1_macro = np.zeros((len(model_names)))
for i in tqdm(range(len(model_names))):
    epoch_model = keras.models.load_model(model_names[i])
    pred = epoch_model.predict(x_val).argmax(axis=1)
    pred_f1 = np.zeros((len(x_val), y_val.shape[1]))
    for j in range(len(pred)):
        pred_f1[j,pred[j]] = 1
    f1_micro[i-1] = f1_score(y_val, pred_f1.astype('uint8'), average='micro')
    f1_macro[i-1] = f1_score(y_val, pred_f1.astype('uint8'), average='macro')

  'precision', 'predicted', average, warn_for)
 40%|██████████████████████████████▊                                              | 4/10 [1:31:45<2:17:29, 1374.84s/it]

KeyboardInterrupt: 

In [37]:
f1_macro

array([0.7748019 , 0.62315587, 0.63599698, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.5108828 ])

In [36]:
print("F1 Micro")
print(model_names[f1_micro.argmax()])
print(f1_micro.max())

print("F1 Macro")
print(model_names[f1_macro.argmax()])
print(f1_macro.max())

F1 Micro
D:/Delta Stuff/Scifair20/Models/rnn1\rnn(1).h5
0.9974900600371986
F1 Macro
D:/Delta Stuff/Scifair20/Models/rnn1\rnn(1).h5
0.7748019001909506


In [38]:
model = keras.models.load_model("D:/Delta Stuff/Scifair20/Models/rnn1/rnn(1).h5")
pred = model.predict(x_val)

pred_index = list(pred.argmax(1))
y_index = list(np.argmax(y_val.to_numpy(), axis=1))

atktypes = sorted(y_val.columns)
pred_series = pd.Series(pred_index, name="Pred").replace(dict(enumerate(atktypes)))
y_series = pd.Series(y_index, name="Actual").replace(dict(enumerate(atktypes)))

matrix = pd.crosstab(pred_series, y_series, margins=True)
matrix

Actual,BENIGN,Botnet,Brute Force,DoS/DDoS,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BENIGN,904965,771,2774,13005,12,602,874,923003
Brute Force,120,0,2788,0,0,0,0,2908
DoS/DDoS,2331,0,0,139370,0,45,2,141748
PortScan,1581,10,0,32,0,63016,0,64639
All,908997,781,5562,152407,12,63663,876,1132298


In [49]:
# Dense neural network

Actual,BENIGN,Bot,Brute Force FTP,Brute Force SSH,DDoS,DoS,Heartbleed,Infiltration,PortScan,Web Attack,All
Pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BENIGN,226915,118,0,3,9,37,2,3,0,7,227094
Bot,0,76,0,0,0,0,0,0,0,0,76
Brute Force FTP,1,0,823,5,0,0,0,0,0,0,829
Brute Force SSH,12,0,1,563,0,0,0,0,0,0,576
DDoS,1,0,0,0,12842,0,0,0,0,0,12843
DoS,261,0,1,0,0,25189,0,0,6,2,25459
PortScan,91,0,0,0,0,0,0,0,15904,0,15995
Web Attack,13,0,0,0,0,0,0,0,0,190,203
All,227294,194,825,571,12851,25226,2,3,15910,199,283075


# Log results

In [72]:
model_log = open(NOTEBOOK_PATH + "model_log.txt", "a")

model_log.write("\n" + model_filename)
model_log.write("\n\tF1 Micro: " + str(f1_micro))
model_log.write("\n\tF1 Macro: " + str(f1_macro))

model_log.close()