In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import tensorflow as tf
import pickle

## Load Data
- change column names
- drop null values

In [None]:
def csvToDataset(first, last, data_dir="/kaggle/input/cic-iot-2023"):
    """Load consecutive CSV parts of the dataset into a single DataFrame.

    Parameters
    ----------
    first : int
        Index of the first part file to read (inclusive).
    last : int
        Index one past the last part file to read (exclusive, as in ``range``).
    data_dir : str, optional
        Directory that contains the ``part-XXXXX-...csv`` files. Defaults to
        the Kaggle input location used by this notebook.

    Returns
    -------
    pandas.DataFrame
        All rows of the requested parts, in file order, with a fresh
        0..n-1 index so that row labels are unique across files.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the range ``first..last`` is empty.
    """
    paths = [
        f"{data_dir}/part-{str(index).zfill(5)}-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv"
        for index in range(first, last)
    ]
    # ignore_index avoids repeated 0..k row labels coming from each file.
    return pd.concat((pd.read_csv(path) for path in paths), ignore_index=True)

# Read the CSV parts with indices 0..4.
df = csvToDataset(0, 5)

# Normalise column names: spaces become underscores, everything lower-case.
df.columns = [col.replace(' ', '_').lower() for col in df.columns]

# Remove rows that contain NULLs and renumber the remaining rows from zero.
entries_before = len(df)
print(f"entries before: {entries_before}")
df = df.dropna().reset_index(drop=True)
entries_after = len(df)
print(f"entries after: {entries_after}")
print(f"null values in dataframe: {(entries_before - entries_after)*100 / entries_before}%")

df.head()

# Data preprocessing

## Encoding labels
Labels are strings, so we have to encode them as integers. <br>
New dataframes will be created for training and testing the different models:
- Dataframe with binary labels (0: no-attack 1:attack)
- Dataframe of 8 categories (7 attack categories plus benign traffic)
- Dataframe of 34 specific types of attacks

In [None]:
# Label encoding for attack types: each distinct string in df.label gets an
# integer code; le_type.classes_[code] gives the original string back.
le_type = preprocessing.LabelEncoder()
label_en_type = le_type.fit_transform(df.label)
attack_types = le_type.classes_
print('attack types:')
for i, attack_type in enumerate(attack_types):
    print(f"{i}: {attack_type}")

In [None]:
# extract attack category from label
# Maps every specific label string to its coarse category. The values used
# below are: DDoS, DoS, BruteForce, Spoofing, Recon, Web-based, Mirai, Benign.
category_dict = {
    'DDoS-ACK_Fragmentation' : 'DDoS',
    'DDoS-HTTP_Flood' : 'DDoS',
    'DDoS-ICMP_Flood': 'DDoS',
    'DDoS-PSHACK_Flood': 'DDoS',
    'DDoS-RSTFINFlood': 'DDoS',
    'DDoS-SYN_Flood': 'DDoS',
    'DDoS-SlowLoris': 'DDoS',
    'DDoS-SynonymousIP_Flood': 'DDoS',
    'DDoS-TCP_Flood': 'DDoS',
    'DDoS-UDP_Flood': 'DDoS',
    'DDoS-UDP_Fragmentation': 'DDoS',
    'DDoS-ICMP_Fragmentation' : 'DDoS',
    
    'DoS-HTTP_Flood' : 'DoS',
    'DoS-SYN_Flood' : 'DoS',
    'DoS-TCP_Flood' : 'DoS',
    'DoS-UDP_Flood' : 'DoS',
  
    'DictionaryBruteForce' : 'BruteForce',
    
    'MITM-ArpSpoofing' : 'Spoofing',
    'DNS_Spoofing' : 'Spoofing',
    
    'Recon-HostDiscovery' : 'Recon',
    'Recon-OSScan' : 'Recon',
    'Recon-PingSweep' : 'Recon',
    'Recon-PortScan' : 'Recon',
    'VulnerabilityScan' : 'Recon',
   
    'SqlInjection' : 'Web-based',
    'CommandInjection' : 'Web-based',
    'Backdoor_Malware' : 'Web-based',
    'Uploading_Attack' : 'Web-based',
    'XSS' : 'Web-based',
    'BrowserHijacking' : 'Web-based',
    
    'Mirai-greeth_flood' : 'Mirai',
    'Mirai-greip_flood' : 'Mirai',
    'Mirai-udpplain' : 'Mirai',
    
    'BenignTraffic' : 'Benign'
}

# Label encoding for attack categories.
# Translate each specific label to its category via category_dict.
df_label_cat = df.label.map(category_dict)

# Fail loudly if the data contains a label the dictionary does not know about;
# otherwise the missing category would only surface later as an obscure
# encoder error.
unmapped = df.label[df_label_cat.isna()].unique()
if len(unmapped) > 0:
    raise ValueError(f"labels missing from category_dict: {sorted(unmapped)}")

le_cat = preprocessing.LabelEncoder()
label_en_cat = le_cat.fit_transform(df_label_cat)
attack_categories = le_cat.classes_
print('attack categories:')
for i, attack_category in enumerate(attack_categories):
    print(f"{i}: {attack_category}")

In [None]:
# Binary target: 0 for rows labelled 'BenignTraffic', 1 for every other label.
label_en_bin = (df.label != 'BenignTraffic').astype('int64')

## Data presentation
- Attack types distribution
- Attack categories distribution

In [None]:
def percentage_above_bar_relative_to_xgroup(ax, total):
    """Annotate every bar on ``ax`` with its height as a percentage of ``total``.

    Parameters
    ----------
    ax : object
        Axes-like object exposing ``containers`` (iterables of bars with
        ``get_x``/``get_width``/``get_height``) and ``annotate``.
    total : number
        Denominator of the percentage, e.g. the number of plotted samples.

    Returns
    -------
    None
        The axes are annotated in place.
    """
    for bars in ax.containers:
        for p in bars:
            height = p.get_height()
            percentage = f'{height*100/total :.1f}%'
            # Anchor the text at the horizontal centre of the bar, on its top edge.
            ax.annotate(percentage, (p.get_x() + p.get_width() / 2, height), size=11, ha='center', va='bottom')

### Attack Types Distribution

In [None]:
# Count plot of the raw label strings, one bar per attack type (labels on the y axis).
# The figure size is set first so the figure implicitly created by plt.title uses it.
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.title('Attack Types Distribution')
plot = sns.countplot(y=df.label)

### Attack Categories Distribution

In [None]:
# Count plot of the category strings, annotated with each bar's share of all rows.
sns.set(rc={'figure.figsize':(9.7,4.27)})
plt.title('Attack Categories Distribution')
ax = sns.countplot(x=df_label_cat)
percentage_above_bar_relative_to_xgroup(ax, len(df_label_cat))

In [None]:
# Count plot of the binary target (0 = benign, 1 = any attack) with percentage annotations.
sns.set(rc={'figure.figsize':(5,5)})
plt.title('0: Benign, 1: Abnormal')
ax = sns.countplot(x=label_en_bin)
percentage_above_bar_relative_to_xgroup(ax, len(label_en_bin))

## Split data and data scaling
Now we have to split the data into training and test sets. The usual split is 80% training data and 20% test data; the training part is then split again (80/20) to obtain a validation set.

In [None]:
# Build one labelled frame per task (attack type, attack category, binary)
# together with the targets for the whole, unbalanced data.

# Attack type: integer codes in the frame, one-hot rows in y3w.
df3 = df.assign(label=label_en_type)
lb3 = preprocessing.LabelBinarizer()
y3w = lb3.fit_transform(label_en_type)

# Attack category: integer codes in the frame, one-hot rows in y2w.
df2 = df.assign(label=label_en_cat)
lb2 = preprocessing.LabelBinarizer()
y2w = lb2.fit_transform(label_en_cat)

# Binary task: the 0/1 series is used directly as the target.
df1 = df.assign(label=label_en_bin)
y1w = label_en_bin

In [None]:
# Occurrences of label 0 (benign) and label 1 (attack) in the binary frame.
label_counts = np.bincount(df1['label'])
neg, pos = label_counts
total = neg + pos
summary_template = 'Examples:\n    Total: {}\n    Positive(attack packages): {} ({:.2f}% of total)\n'
print(summary_template.format(total, pos, 100 * pos / total))

### Dealing with imbalanced dataset
- **Undersampling**
- Oversampling (not implemented here)

Also useful in imbalanced dataset case is to **set initial bias** to the output layer: <br>
This way the model doesn't need to spend the first few epochs just learning that negative
examples are unlikely. <br>
It also makes it easier to read plots of the loss during training.

In [None]:
# Undersample the binary frame: keep as many rows per label as the rarest
# label has. The seed makes the balanced subset reproducible on re-run, and
# GroupBy.sample keeps the 'label' column without relying on groupby.apply.
g = df1.groupby('label')
df1_balanced = g.sample(n=int(g.size().min()), random_state=42).reset_index(drop=True)

In [None]:
# Check the result of undersampling: label counts of the balanced binary frame.
sns.set(rc={'figure.figsize':(3,3)})
ax = sns.countplot(x=df1_balanced['label'])
percentage_above_bar_relative_to_xgroup(ax, len(df1_balanced))

In [None]:
# Undersample the category frame: keep as many rows per category as the
# rarest category has. The seed makes the balanced subset reproducible, and
# GroupBy.sample keeps the 'label' column without relying on groupby.apply.
g = df2.groupby('label')
df2_balanced = g.sample(n=int(g.size().min()), random_state=42).reset_index(drop=True)

In [None]:
# Check the result of undersampling: label counts of the balanced category frame.
sns.set(rc={'figure.figsize':(7, 3)})
ax = sns.countplot(x=df2_balanced['label'])
percentage_above_bar_relative_to_xgroup(ax, len(df2_balanced))

In [None]:
# Undersample the attack-type frame: keep as many rows per type as the
# rarest type has. The seed makes the balanced subset reproducible, and
# GroupBy.sample keeps the 'label' column without relying on groupby.apply.
g = df3.groupby('label')
df3_balanced = g.sample(n=int(g.size().min()), random_state=42).reset_index(drop=True)

### Features filtering

### Split data to train and test

In [None]:
def _split_train_val_test(X, y):
    """Split into train / validation / test using two seeded 80/20 splits.

    The first split holds out 20% as the test set; the second holds out 20%
    of the remainder as the validation set.
    """
    X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.2, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test


# Feature matrices of the whole (unbalanced) frames.
X1w = df1.drop('label', axis=1)
X2w = df2.drop('label', axis=1)
X3w = df3.drop('label', axis=1)

# Feature matrices and targets of the balanced (undersampled) frames.
X1 = df1_balanced.drop('label', axis=1)
X2 = df2_balanced.drop('label', axis=1)
X3 = df3_balanced.drop('label', axis=1)
y1 = df1_balanced['label']
y2 = lb2.transform(df2_balanced['label'])
y3 = lb3.transform(df3_balanced['label'])

# Whole datasets.
X1w_train, X1w_val, X1w_test, y1w_train, y1w_val, y1w_test = _split_train_val_test(X1w, y1w)
X2w_train, X2w_val, X2w_test, y2w_train, y2w_val, y2w_test = _split_train_val_test(X2w, y2w)
X3w_train, X3w_val, X3w_test, y3w_train, y3w_val, y3w_test = _split_train_val_test(X3w, y3w)

# Balanced datasets.
X1_train, X1_val, X1_test, y1_train, y1_val, y1_test = _split_train_val_test(X1, y1)
X2_train, X2_val, X2_test, y2_train, y2_val, y2_test = _split_train_val_test(X2, y2)
X3_train, X3_val, X3_test, y3_train, y3_val, y3_test = _split_train_val_test(X3, y3)

### Data scaling
As we can see above, some features (e.g. rate, srate) have values close to zero, while other features (duration, header_length) are on a much larger scale. This will confuse the neural network. We assume that all features have the same importance, so we standardize every feature (zero mean, unit variance) using statistics computed on the training split only.

In [None]:
# Standardise each dataset. The scaler is fitted on the training split ONLY;
# validation and test splits are transformed with those same statistics so
# that all three splits live in the same feature space and no information
# from held-out data leaks into preprocessing.
scaler = StandardScaler()
X1w_train = scaler.fit_transform(X1w_train)
X1w_val = scaler.transform(X1w_val)
X1w_test = scaler.transform(X1w_test)

scaler = StandardScaler()
X2w_train = scaler.fit_transform(X2w_train)
X2w_val = scaler.transform(X2w_val)
X2w_test = scaler.transform(X2w_test)

scaler = StandardScaler()
X3w_train = scaler.fit_transform(X3w_train)
X3w_val = scaler.transform(X3w_val)
X3w_test = scaler.transform(X3w_test)

scaler = StandardScaler()
X1_train = scaler.fit_transform(X1_train)
X1_val = scaler.transform(X1_val)
X1_test = scaler.transform(X1_test)

scaler = StandardScaler()
X2_train = scaler.fit_transform(X2_train)
X2_val = scaler.transform(X2_val)
X2_test = scaler.transform(X2_test)

scaler = StandardScaler()
X3_train = scaler.fit_transform(X3_train)
X3_val = scaler.transform(X3_val)
X3_test = scaler.transform(X3_test)

# Show one standardised training row as a sanity check.
print(X1_train[0])

# Abnormal Detection
Finding the optimum deep learning model<br>
We first use the first dataframe, where the records are labeled as 
- 0: no-abnormal 
- 1: abnormal.

<b>This is a binary classification problem.</b>

## Neural Network basic architecture - not optimal
First we try a basic neural network architecture 

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

def basic_nn(X_train, y_train, epochs_, output_neurons=1, validation_data=None, output_bias=None):
    """Build, compile and fit a three-hidden-layer dense network.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed straight to ``model.fit``.
        For ``output_neurons > 1`` the loss used is categorical cross-entropy,
        so targets are expected to be one-hot rows.
    epochs_ : int
        Maximum number of epochs.
    output_neurons : int, optional
        1 builds a sigmoid output with binary cross-entropy; any larger value
        builds a softmax output with categorical cross-entropy.
    validation_data : tuple, optional
        ``(X_val, y_val)`` forwarded to ``model.fit``. Early stopping on
        ``val_loss`` is only enabled when this is given.
    output_bias : initializer, optional
        Bias initializer of the output layer; zeros when omitted.

    Returns
    -------
    tf.keras.Model
        The fitted model.
    """
    tf.random.set_seed(42)

    multiclass = output_neurons > 1

    # Dense's own default bias initializer is 'zeros'. An explicit None is not
    # equivalent to omitting the argument.
    # NOTE(review): Keras appears to fall back to its generic weight
    # initializer for None - confirm against the installed version.
    bias_initializer = 'zeros' if output_bias is None else output_bias
    output_layer = tf.keras.layers.Dense(
        output_neurons,
        activation='softmax' if multiclass else 'sigmoid',
        bias_initializer=bias_initializer,
    )

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        output_layer
    ])

    if multiclass:
        loss = tf.keras.losses.categorical_crossentropy
        # BinaryAccuracy would score every one-hot element separately, so the
        # many correct zeros inflate it; CategoricalAccuracy compares argmax.
        accuracy = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
    else:
        loss = tf.keras.losses.binary_crossentropy
        accuracy = tf.keras.metrics.BinaryAccuracy(name='accuracy')

    model.compile(
        loss=loss,
        # `lr` is a deprecated alias; `learning_rate` is the supported name.
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.03),
        metrics=[
            accuracy,
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )

    # val_loss only exists when validation data is supplied.
    callbacks = []
    if validation_data is not None:
        callbacks.append(
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25)
        )
    model.fit(X_train, y_train, epochs=epochs_, validation_data=validation_data, callbacks=callbacks)
    return model

In [None]:
def plot_metrics(history, metrics=None):
    """Plot training (solid) and validation (dashed) curves, one subplot per metric.

    Parameters
    ----------
    history
        Object exposing ``epoch`` (x values) and ``history`` (dict mapping a
        metric name to its per-epoch values; validation curves are looked up
        under ``'val_' + name``).
    metrics : sequence of str, optional
        Metric names to plot. Defaults to loss, accuracy, precision, recall.
        Subplots are arranged two per row.

    Notes
    -----
    A metric without a ``val_`` entry (model fitted without validation data)
    is plotted with its training curve only.
    """
    if metrics is None:
        metrics = ['loss', 'accuracy', 'precision', 'recall']

    mpl.rcParams['figure.figsize'] = (10, 8)
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    n_rows = -(-len(metrics) // 2)  # ceiling division: two subplots per row

    for n, metric in enumerate(metrics):
        name = metric.replace("_"," ").capitalize()
        plt.subplot(n_rows, 2, n+1)
        plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
        val_key = 'val_' + metric
        if val_key in history.history:
            plt.plot(history.epoch, history.history[val_key],
                     color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            # Loss is unbounded above: keep the automatic upper limit.
            plt.ylim([0, plt.ylim()[1]])
        elif metric == 'auc':
            plt.ylim([0.8,1])
        else:
            plt.ylim([0,1])

        plt.legend()

In [None]:
def plot_cm(labels, predictions, threshold=0.5, multiclass=False, target_names=None):
    """Draw a confusion-matrix heatmap and print a classification report.

    Parameters
    ----------
    labels
        True labels.
    predictions
        For the binary case: scores that are compared against ``threshold``.
        For the multi-class case: predicted class labels, used as given.
    threshold : float, optional
        Decision threshold for the binary case (ignored when ``multiclass``).
    multiclass : bool, optional
        Selects the multi-class branch.
    target_names : sequence of str, optional
        Display names forwarded to ``classification_report``.
    """
    if multiclass:
        cm = confusion_matrix(labels, predictions)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt="d")
        plt.title('Confusion matrix for multi classification model')
        plt.ylabel('Actual label')
        plt.xlabel('Predicted label')
        print(classification_report(labels, predictions, target_names=target_names))
        return

    predicted = predictions > threshold
    cm = confusion_matrix(labels, predicted)
    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(threshold))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    print(classification_report(labels, predicted, target_names=target_names))
    print()
    # Row = actual class, column = predicted class; class 0 is benign traffic
    # and class 1 is attack traffic in this notebook.
    print('Benign Traffic Detected (True Negatives): ', cm[0][0])
    print('Benign Traffic Incorrectly Flagged (False Positives): ', cm[0][1])
    print('Attacks Missed (False Negatives): ', cm[1][0])
    print('Attacks Detected (True Positives): ', cm[1][1])
    print('Total Attacks: ', np.sum(cm[1]))
    print()

## Training with balanced dataset
- Advantages: non-biased model
- Disadvantages: Less data (because of undersampling)

In [None]:
from tensorflow.keras.initializers import Constant
#initial_bias = Constant(np.log([pos/neg]))
# Fit the binary model on the balanced split, validating on the balanced
# validation split. The initial output bias above is currently disabled.
training1 = basic_nn(X1_train, y1_train, epochs_=600, validation_data=(X1_val, y1_val))#, output_bias = initial_bias)

In [None]:
# Metric curves of the balanced binary model.
plot_metrics(training1.history)

In [None]:
# Score the held-out balanced test split and report at a 0.5 decision threshold.
predictions = training1.predict(X1_test)
plot_cm(y1_test, predictions, 0.5, target_names=['Benign', 'Abnormal'])

## Training with Imbalanced Dataset
- Advantages: More data
- Disadvantages: Biased Model

In [None]:
# from tensorflow.keras.initializers import Constant
# #initial_bias = Constant(np.log([pos/neg]))
# training1w = basic_nn(X1w_train, y1w_train, epochs_=600, validation_data=(X1w_val, y1w_val))#, output_bias = initial_bias)

In [None]:
# plot_metrics(training1w.history)

In [None]:
# predictions = training1w.predict(X1w_test)
# plot_cm(y1w_test, predictions, 0.5, target_names=['Benign', 'Abnormal'])

# Attack Category Detection
Finding the optimum deep learning model<br>
Next we use the second dataframe, where the records are labeled with one of 8 classes: 7 categories of attack traffic plus benign traffic.

<b>This is an 8-class classification problem (7 attack categories + benign).</b>

In [None]:
# Validate (and early-stop) on the validation split so that the test split
# stays unseen until the final evaluation. The number of output neurons is
# taken from the one-hot target width instead of being hard-coded.
training2 = basic_nn(X2_train, y2_train, 600, y2_train.shape[1], validation_data=(X2_val, y2_val))

In [None]:
# Metric curves of the basic category model.
plot_metrics(training2.history)

In [None]:
# Model outputs for the held-out category test split (one row per sample).
predictions = training2.predict(X2_test)

In [None]:
# lb2.inverse_transform turns one-hot targets and model outputs back into category codes.
plot_cm(lb2.inverse_transform(y2_test), lb2.inverse_transform(predictions), multiclass=True, target_names = le_cat.classes_)

We see that the results are not satisfying enough. We want the diagonal to be as light-colored as possible and the rest of the matrix black. <br>
So we will try another ANN architecture, with more hidden layers and more neurons

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

def moderate_nn(X_train, y_train, epochs_, output_neurons=1, validation_data=None, output_bias=None):
    """Build, compile and fit the wider dense network (128-256-256 hidden units).

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed straight to ``model.fit``.
        For ``output_neurons > 1`` the loss used is categorical cross-entropy,
        so targets are expected to be one-hot rows.
    epochs_ : int
        Maximum number of epochs.
    output_neurons : int, optional
        1 builds a sigmoid output with binary cross-entropy; any larger value
        builds a softmax output with categorical cross-entropy.
    validation_data : tuple, optional
        ``(X_val, y_val)`` forwarded to ``model.fit``. Early stopping on
        ``val_loss`` is only enabled when this is given.
    output_bias : initializer, optional
        Bias initializer of the output layer; zeros when omitted.

    Returns
    -------
    tf.keras.Model
        The fitted model.
    """
    tf.random.set_seed(42)

    multiclass = output_neurons > 1

    # Dense's own default bias initializer is 'zeros'. An explicit None is not
    # equivalent to omitting the argument.
    # NOTE(review): confirm the None fallback against the installed Keras version.
    bias_initializer = 'zeros' if output_bias is None else output_bias

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        # NOTE(review): two Dropout layers in a row - possibly a Dense layer is
        # missing between them. Left unchanged; confirm the intended architecture.
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        # Categorical cross-entropy needs a probability distribution over the
        # classes, i.e. softmax; sigmoid is only right for the single-output case.
        tf.keras.layers.Dense(
            output_neurons,
            activation='softmax' if multiclass else 'sigmoid',
            bias_initializer=bias_initializer,
        )
    ])

    if multiclass:
        loss = tf.keras.losses.categorical_crossentropy
        # BinaryAccuracy would score every one-hot element separately, so the
        # many correct zeros inflate it; CategoricalAccuracy compares argmax.
        accuracy = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
    else:
        loss = tf.keras.losses.binary_crossentropy
        accuracy = tf.keras.metrics.BinaryAccuracy(name='accuracy')

    model.compile(
        loss=loss,
        # `lr` is a deprecated alias; `learning_rate` is the supported name.
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.03),
        metrics=[
            accuracy,
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )

    # val_loss only exists when validation data is supplied.
    callbacks = []
    if validation_data is not None:
        callbacks.append(
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25)
        )
    model.fit(X_train, y_train, epochs=epochs_, validation_data=validation_data, callbacks=callbacks)
    return model

In [None]:
# Train the wider architecture defined by moderate_nn (the original call used
# basic_nn, so the "moderate" results were those of the basic network again).
# Validation/early stopping uses the validation split, not the test split.
training2_moderate = moderate_nn(X2_train, y2_train, 600, y2_train.shape[1], validation_data=(X2_val, y2_val))

In [None]:
# Metric curves of the second category model.
plot_metrics(training2_moderate.history)

In [None]:
# Model outputs for the held-out category test split.
predictions = training2_moderate.predict(X2_test)

In [None]:
# Convert one-hot targets and model outputs back to category codes before comparing them.
plot_cm(lb2.inverse_transform(y2_test), lb2.inverse_transform(predictions), multiclass=True, target_names = le_cat.classes_)

# Attack Type Detection
Finally we use the third dataframe, where the records are labeled with one of 34 specific traffic types (33 attack types plus benign traffic).

<b>This is a 34-class classification problem.</b>

In [None]:
# Validate (and early-stop) on the validation split so that the test split
# stays unseen until the final evaluation. The number of output neurons is
# taken from the one-hot target width instead of being hard-coded.
training3 = basic_nn(X3_train, y3_train, 600, y3_train.shape[1], validation_data=(X3_val, y3_val))

In [None]:
# Metric curves of the basic attack-type model.
plot_metrics(training3.history)

In [None]:
# Model outputs for the held-out attack-type test split.
predictions = training3.predict(X3_test)

In [None]:
# Convert one-hot targets and model outputs back to attack-type codes before comparing them.
plot_cm(lb3.inverse_transform(y3_test), lb3.inverse_transform(predictions), multiclass=True, target_names = le_type.classes_)

We see that the results are not satisfying enough. We want the diagonal to be as light-colored as possible and the rest of the matrix black. <br>
So we will try another ANN architecture, with more hidden layers and more neurons

In [None]:
# Train the wider architecture on the attack-type task. Validation and early
# stopping use the validation split, so the test split stays unseen until the
# final evaluation; the output width follows the one-hot target width.
training3_moderate = moderate_nn(X3_train, y3_train, 600, y3_train.shape[1], validation_data=(X3_val, y3_val))

In [None]:
# Metric curves of the wider attack-type model.
plot_metrics(training3_moderate.history)

In [None]:
# Model outputs for the held-out attack-type test split.
predictions = training3_moderate.predict(X3_test)

In [None]:
# Convert one-hot targets and model outputs back to attack-type codes before comparing them.
plot_cm(lb3.inverse_transform(y3_test), lb3.inverse_transform(predictions), multiclass=True, target_names = le_type.classes_)

### Find the optimal ANN architecture
Still the results are not satisfying enough. We have to try more different ANN architectures. We can do this with an automated process:
<br>
**TO DO**

# Convolutional Networks - Type and Category Classification

## Convert data to images
In this section we will convert data rows to images and group them by category, so that we can see the differences between the categories. <br>

This does not help in building the CNN model itself; it is only meant to give us a better understanding of how our tabular data can be fed to a CNN, which is designed to work on images.

In [None]:
# Rows are scaled to [0, 1] first; each row is then padded with three trailing
# zeros so that its 49 values can be laid out as a 7x7 grid.
def helper(array):
    """Append three zeros to ``array`` and return the result reshaped to (7, 7)."""
    padded = np.append(np.asarray(array), [0, 0, 0])
    return padded.reshape(7, 7)

# Rescale every feature of the balanced category data to the [0, 1] range.
scaler = MinMaxScaler()
X2_scaled = scaler.fit_transform(X2)

# One 7x7 "image" per row.
X2_images = np.array([helper(row) for row in X2_scaled])

# Show the first sample before and after the conversion.
print('Before scaling and reshaping')
print(X2.iloc[0])
print('After rescaling and reshaping')
print(X2_images[0])

In [None]:
def plot_images_category(images, category_name):
    """Show ``images`` side by side in one row, each titled ``category_name``.

    Parameters
    ----------
    images : sequence of 2-D arrays
        Images to display with ``imshow``. Nothing is drawn for an empty
        sequence.
    category_name : str
        Title placed above every image.
    """
    len_ = len(images)
    if len_ == 0:
        return

    # squeeze=False always yields a 2-D array of axes, so a single image
    # (one subplot) is handled like any other count.
    fig, axes = plt.subplots(1, len_, squeeze=False)
    for axis, image in zip(axes[0], images):
        axis.set_title(category_name)
        axis.imshow(image)
        axis.grid(False)

In [None]:
# Decode the one-hot targets once (loop-invariant), then show up to four
# sample images for every category code.
y2_codes = lb2.inverse_transform(y2)
for n, category_name in enumerate(le_cat.classes_):
    plot_images_category(X2_images[y2_codes == n][0:4], category_name)

In [None]:
# Re-split the category data, this time using the 7x7 images as features.
# Same seed and proportions as the tabular split above.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2_images, y2, test_size=0.2, random_state=42
)
X2_train, X2_val, y2_train, y2_val = train_test_split(
    X2_train, y2_train, test_size=0.2, random_state=42
)

# Add a trailing channel axis: (n, 7, 7) -> (n, 7, 7, 1).
X2_train = np.expand_dims(X2_train, axis=-1)
X2_val = np.expand_dims(X2_val, axis=-1)
X2_test = np.expand_dims(X2_test, axis=-1)

In [None]:
def basic_cnn(X_train, y_train, epochs_, output_neurons=1, validation_data=None, output_bias=None):
    """Build, compile and fit a small CNN for 7x7x1 inputs.

    Parameters
    ----------
    X_train, y_train
        Training images of shape ``(n, 7, 7, 1)`` (the ``input_shape`` of the
        first layer) and targets, passed straight to ``model.fit``. For
        ``output_neurons > 1`` the loss used is categorical cross-entropy, so
        targets are expected to be one-hot rows.
    epochs_ : int
        Maximum number of epochs.
    output_neurons : int, optional
        1 builds a sigmoid output with binary cross-entropy; any larger value
        builds a softmax output with categorical cross-entropy.
    validation_data : tuple, optional
        ``(X_val, y_val)`` forwarded to ``model.fit``. Early stopping on
        ``val_loss`` is only enabled when this is given.
    output_bias : initializer, optional
        Bias initializer of the output layer; zeros when omitted.

    Returns
    -------
    tf.keras.Model
        The fitted model.
    """
    tf.random.set_seed(42)

    multiclass = output_neurons > 1

    # Dense's own default bias initializer is 'zeros'. An explicit None is not
    # equivalent to omitting the argument.
    # NOTE(review): confirm the None fallback against the installed Keras version.
    bias_initializer = 'zeros' if output_bias is None else output_bias
    output_layer = tf.keras.layers.Dense(
        output_neurons,
        activation='softmax' if multiclass else 'sigmoid',
        bias_initializer=bias_initializer,
    )

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(filters=32, kernel_size=(4,4), input_shape=(7,7,1), activation='relu'),
        tf.keras.layers.MaxPool2D(pool_size=(2,2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation='relu'),
        output_layer
    ])

    if multiclass:
        loss = tf.keras.losses.categorical_crossentropy
        # BinaryAccuracy would score every one-hot element separately, so the
        # many correct zeros inflate it; CategoricalAccuracy compares argmax.
        accuracy = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
    else:
        loss = tf.keras.losses.binary_crossentropy
        accuracy = tf.keras.metrics.BinaryAccuracy(name='accuracy')

    model.compile(
        loss=loss,
        # `lr` is a deprecated alias; `learning_rate` is the supported name.
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.03),
        metrics=[
            accuracy,
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )

    # val_loss only exists when validation data is supplied.
    callbacks = []
    if validation_data is not None:
        callbacks.append(
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25)
        )
    model.fit(X_train, y_train, epochs=epochs_, validation_data=validation_data, callbacks=callbacks)
    return model

In [None]:
# Validate (and early-stop) on the image validation split so that the test
# split stays unseen until the final evaluation. The number of output neurons
# is taken from the one-hot target width instead of being hard-coded.
training2_cnn = basic_cnn(X2_train, y2_train, 600, y2_train.shape[1], validation_data=(X2_val, y2_val))

In [None]:
# Metric curves of the CNN category model.
plot_metrics(training2_cnn.history)

In [None]:
# Model outputs for the held-out image test split.
predictions = training2_cnn.predict(X2_test)

In [None]:
# Convert one-hot targets and model outputs back to category codes before comparing them.
plot_cm(lb2.inverse_transform(y2_test), lb2.inverse_transform(predictions), multiclass=True, target_names = le_cat.classes_)