<p style="font-family:Arial, serif;font-size:36px;font-style:normal;font-weight:bold;color:#0558ff;background-color:#ffffff;">Network traffic classification using CNN<br>tuning, training and testing the best model</p>

* [Loading and preparation of data](#section-one)
* [Creating model](#section-two)
* [Training the best model](#section-three)
* [Testing and evaluating the best model](#section-four)

In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
np.random.seed(210)
import tensorflow as tf
import keras_tuner as kt
from keras.utils import np_utils
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Activation
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import optimizers
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
import seaborn as sn
import matplotlib
import matplotlib.pyplot as plt
from laplotter import LossAccPlotter
from tensorflow.keras.layers import BatchNormalization
import pickle as pk
import prettytable
from prettytable import PrettyTable
import multiprocessing as mp
from utils import dict_name2label

import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [None]:
# Make sure the output folders used later in the notebook exist
folders = ['./models', './loss_acc_plots', './confusion_matrix']

for folder in folders:
    try:
        os.makedirs(folder)
    except FileExistsError:
        # The folder is already there from an earlier run
        status = " already exists"
    else:
        status = " Created "
    print("Directory ", folder, status)

# Folder that holds the input pickle files
#directory = './data' #for jupyter and colab notebooks
directory = '../input/pickles' #for kaggle notebook

In [None]:
# Module-level multiprocessing primitives: a lock and a shared integer initialised to 0.
# NOTE(review): these two objects are not used by any other code visible in this
# notebook section; presumably left over from a parallel-loading variant -- confirm before removing.
lock = mp.Lock()
counter = mp.Value('i', 0)

# Loading and preparation of data
<a id="section-one"></a>

In [None]:
def gen_todo_list(directory):
    """Return the full paths of the data files in `directory` that should be loaded.

    An entry of `directory` is selected when the part of its name in front of
    ".pickle" is a key of `dict_name2label` and the entry is a regular file.
    The paths are returned in the order produced by `os.listdir`.
    """
    # Keep only the names that belong to an application listed in dict_name2label
    known_paths = (
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.split(".pickle")[0] in dict_name2label
    )
    # Drop anything that is not a regular file (e.g. sub-directories)
    return [path for path in known_paths if os.path.isfile(path)]

In [None]:
def load(filename):
    """Read the pickle file `filename` and return the object stored in it."""
    # NOTE: unpickling executes code embedded in the file -- only use on trusted data.
    with open(filename, 'rb') as handle:
        return pk.load(handle)

In [None]:
def load_data(max_data_nb=10000, train_rate=0.6, val_rate=0.2):
    """Load every selected pickle file and split it into train/validation/test parts.

    Each file returned by `gen_todo_list(directory)` is expected to unpickle to a
    pair `(X, y)` of equally long sequences. Per file, at most `max_data_nb`
    samples are kept, the samples are padded by `processX`, and the file is split
    sequentially (no shuffling): the first `train_rate` share goes to training,
    the next `val_rate` share to validation and the remainder to testing.

    Args:
        max_data_nb: maximum number of samples taken from one file.
        train_rate: share of each file used for training.
        val_rate: share of each file used for validation.

    Returns:
        Six lists: X_train, y_train, X_val, y_val, X_test, y_test.

    Raises:
        ValueError: if the rates are negative or sum to more than 1, or if a file
            holds a different number of samples and labels.
    """
    if train_rate < 0 or val_rate < 0 or train_rate + val_rate > 1:
        raise ValueError(
            'train_rate and val_rate must be non-negative and sum to at most 1, '
            'got {} and {}'.format(train_rate, val_rate))

    todo_list = gen_todo_list(directory)
    X_train, y_train = [], []
    X_val, y_val = [], []
    X_test, y_test = [], []

    # `file_idx` (1-based) is only used for the progress message; it is deliberately
    # not called `counter` so that it cannot be confused with the module-level one.
    for file_idx, filename in enumerate(todo_list, start=1):
        tmpX, tmpy = load(filename)
        tmpX, tmpy = tmpX[:max_data_nb], tmpy[:max_data_nb]
        # Explicit check instead of `assert`, which is stripped when Python runs with -O
        if len(tmpX) != len(tmpy):
            raise ValueError(
                '{}: {} samples but {} labels'.format(filename, len(tmpX), len(tmpy)))
        tmpX = processX(tmpX)

        train_num = int(len(tmpX) * train_rate)
        val_end = train_num + int(len(tmpX) * val_rate)
        X_train.extend(tmpX[:train_num])
        y_train.extend(tmpy[:train_num])
        X_val.extend(tmpX[train_num:val_end])
        y_val.extend(tmpy[train_num:val_end])
        X_test.extend(tmpX[val_end:])
        y_test.extend(tmpy[val_end:])
        print('\rLoading... {}/{}'.format(file_idx, len(todo_list)), end = '')
    print('\r{} Data loaded.               '.format(len(todo_list)))
    return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
def processX(X, maxlen=1500):
    """Zero-pad (or truncate) variable-length sequences into one 2-D array.

    Args:
        X: sequence of 1-D numeric sequences (lists or arrays); the individual
            sequences may have different lengths.
        maxlen: number of columns of the result. Shorter sequences are padded
            with zeros on the right, longer ones are cut off after `maxlen` values.

    Returns:
        A float64 array of shape (len(X), maxlen). For empty `X` the result has
        shape (0, maxlen).
    """
    tmpX = np.zeros((len(X), maxlen))
    if len(X) == 0:
        # np.concatenate cannot handle an empty list of arrays
        return tmpX

    # Work on the individual sequences instead of np.array(X): building an array
    # from ragged input is an error in recent NumPy versions.
    clipped = [x[:maxlen] for x in X]
    lens = np.array([len(x) for x in clipped])
    # mask[i, j] is True for the first lens[i] columns of row i
    mask = np.arange(maxlen) < lens[:, None]
    # Boolean assignment fills the True cells row by row, matching the order of
    # the concatenated sequences.
    tmpX[mask] = np.concatenate(clipped)
    return tmpX

In [None]:
# Load the train / validation / test splits
x_train, y_train, x_val, y_val, x_test, y_test = load_data()

# Append a trailing axis of size 1 (the input shape used by the Conv1D layer)
# and store the samples as float32
x_train, x_val, x_test = (
    np.expand_dims(split, axis=2).astype(np.float32)
    for split in (x_train, x_val, x_test)
)

# Fit a label encoder on the application names seen in the training data
encoder = LabelEncoder().fit(y_train)
class_labels = encoder.classes_

# Number of classes in the model
nb_classes = len(class_labels)

# Table with the number of samples per application in the training,
# validation and testing splits
a = PrettyTable(["", "Application", "For training", "For validation", "For testing", "Total"])
# Column alignment: application names to the left, everything else to the right
for column, side in (("", "r"), ("Application", "l"), ("For training", "r"),
                     ("For validation", "r"), ("For testing", "r"), ("Total", "r")):
    a.align[column] = side
a.padding_width = 1

n = 0
zt_uk = zv_uk = zts_uk = 0  # running totals per split
labels = []
for ime in class_labels:
    labels.append(ime)
    zt = y_train.count(ime)  # for training
    zv = y_val.count(ime)  # for validation
    zts = y_test.count(ime)  # for testing
    zt_uk += zt
    zv_uk += zv
    zts_uk += zts
    uk = zt + zv + zts
    a.add_row([n, ime, zt, zv, zts, uk])
    n += 1
a.add_row([' ', 'All total', zt_uk, zv_uk, zts_uk, zt_uk + zv_uk + zts_uk])

# Print the table with an extra horizontal rule in front of the totals row
list_of_table_lines = a.get_string().split('\n')
horizontal_line = list_of_table_lines[0]
result_lines = 1
split_at = -(result_lines + 1)
print("\n".join(
    list_of_table_lines[:split_at] + [horizontal_line] + list_of_table_lines[split_at:]
))

# One-hot encode the integer class indices of every split.
# num_classes is passed explicitly: without it to_categorical sizes the result
# from the largest index present, so a split that lacks the last class would get
# fewer columns than the model's output layer.
encoded_y_train = encoder.transform(y_train)
y_train = np_utils.to_categorical(encoded_y_train, num_classes=nb_classes)

encoded_y_test = encoder.transform(y_test)
y_test = np_utils.to_categorical(encoded_y_test, num_classes=nb_classes)

encoded_y_val = encoder.transform(y_val)
y_val = np_utils.to_categorical(encoded_y_val, num_classes=nb_classes)

# Creating model
<a id="section-two"></a>

In [None]:
# The optimal model is searched for with KerasTuner
# Length of every input sequence (samples are padded to this length by processX)
input_size = 1500

# Number of training epochs, used both for each tuner trial and for the final training
NUM_EPOCHS = 10

def build_model(hp):
    """Build and compile the 1-D CNN for one set of hyperparameters.

    Tuned hyperparameters:
        dropout: 0.2, 0.3 or 0.5 (shared by all Dropout layers)
        Conv1D_units_: number of Conv1D filters, 50 to 100 in steps of 2
        learning_rate: 0.01, 0.001 or 0.0001 for the Adam optimizer

    Args:
        hp: hyperparameter container supplied by KerasTuner; it must provide
            `Choice` and `Int`.

    Returns:
        The compiled Sequential model with a softmax output of `nb_classes` units.
    """
    # Initialize sequential API and start building model.
    model = Sequential()
    # Tune the dropout rate.
    # Choose an optimal value from 0.2, 0.3, or 0.5
    dropout = hp.Choice('dropout', values=[0.2, 0.3, 0.5])

    # Tune the number of filters in Conv1D (kernel size is fixed to 5).
    # Number of filters: 50 - 100 with step size of 2
    model.add(Conv1D(hp.Int("Conv1D_units_", min_value=50, max_value=100, step=2), 5, input_shape = (input_size,1), activation = 'relu'))
    model.add(Dropout(dropout))
    model.add(MaxPooling1D(2))
    model.add(Flatten())

    # Add dense layers, each followed by dropout
    denses = [200, 100, 50]
    for dense in denses:
        model.add(Dense(dense, activation = 'relu'))
        model.add(Dropout(dropout))

    # Add output layer.
    model.add(Dense(nb_classes, activation = 'softmax'))

    # Tune the learning rate for the optimizer
    # Choose an optimal value from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    # run_eagerly is a boolean flag; the previous string 'true' only worked because
    # any non-empty string is truthy.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate), loss='categorical_crossentropy', metrics=['accuracy'], run_eagerly=True)

    return model

In [None]:
# Create the Bayesian-optimisation tuner that searches build_model's
# hyperparameters for the best validation accuracy.
# overwrite=True: results of an earlier search under kt_dir are not reused.
tuner = kt.BayesianOptimization(
    hypermodel=build_model,
    objective="val_accuracy",
    directory="kt_dir",
    project_name="kt_hyperband",
    overwrite=True,
)

In [None]:
# Display a summary of the search space, i.e. the hyperparameters declared via hp.* in build_model
tuner.search_space_summary()

In [None]:
# This cell can take a long time to run: every tuner trial trains a model for up to NUM_EPOCHS epochs.
# Early stopping: training of a trial is stopped once val_loss has not improved for 5 epochs (patience=5).

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

tuner.search(x_train, y_train, epochs=NUM_EPOCHS, validation_data=(x_val,y_val), callbacks=[stop_early], verbose=0)

In [None]:
# Display a summary of the tuning results (run the tuner.search cell first).
tuner.results_summary()

In [None]:
# Hyperparameters of the best trial, as determined by the tuner's objective.
best_hps = tuner.get_best_hyperparameters()[0]

# Print the hyperparameter values of the best trial
best_trial = tuner.oracle.get_best_trials(num_trials=1)[0]
hps = best_trial.hyperparameters.values
print(f'HyperParameters: {hps}')

In [None]:
# Reinstantiate the (untrained) best model found during the search process:
# the hypermodel is rebuilt from the best hyperparameters and is trained in the next section.
h_model = tuner.hypermodel.build(best_hps)
# Print the layer-by-layer summary of the rebuilt model
h_model.summary()

# Training the best model
<a id="section-three"></a>

In [None]:
# Train the hypertuned model
print("Training CNN model:")

# Location of the saved model (the former .format() call had no placeholder to fill)
saved_model_file = 'models/cnn_model.h5'

# Checkpoint callback: the model is written to saved_model_file whenever
# val_loss improves (save_best_only=True)
checkpoint = ModelCheckpoint(saved_model_file, monitor='val_loss', save_best_only=True, verbose=1)
fit_history = h_model.fit(x_train, y_train, epochs=NUM_EPOCHS, batch_size=32, validation_data=(x_val,y_val),
                    callbacks=[checkpoint])

print("Training of CNN model is over.\n")

In [None]:
# Plot loss and accuracy per epoch for training and validation; the chart is
# written to loss_acc_plots/cnn.png without opening a plot window.
plotter = LossAccPlotter(title = 'Performanses of loss and accuracy for CNN model',
                         save_to_filepath='loss_acc_plots/cnn.png',
                         show_regressions=True,
                         show_averages=False,
                         show_loss_plot=True,
                         show_acc_plot=True,
                         show_plot_window=False,
                         x_label="Epoch")

# Number of epochs actually recorded in the training history
num_epochs = len(fit_history.history['accuracy'])

# Iterate over the recorded epochs rather than NUM_EPOCHS, so a history that is
# shorter than NUM_EPOCHS cannot cause an IndexError.
for epoch in range(num_epochs):
    acc_train = fit_history.history['accuracy'][epoch]
    loss_train = fit_history.history['loss'][epoch]
    acc_val = fit_history.history['val_accuracy'][epoch]
    loss_val = fit_history.history['val_loss'][epoch]

    # redraw=False: collect all points first, draw once after the loop
    plotter.add_values(epoch, loss_train=loss_train, acc_train=acc_train, loss_val=loss_val, acc_val=acc_val, redraw=False)

plotter.redraw()
plotter.block()

# Testing and evaluating the best model
<a id="section-four"></a>

In [None]:
# Evaluate the trained hypertuned model on the test split
print("Performance report for CNN model:")

# Class probabilities predicted for every test sample
preds = h_model.predict(x_test, batch_size=32,  verbose=0)

# Turn one-hot targets and predicted probabilities into class indices
y_true_labels = list(map(np.argmax, y_test))
y_preds_labels = list(map(np.argmax, preds))

# Per-class precision / recall / F1, reported with 4 decimal places
class_metric_report = classification_report(
    y_true_labels,
    y_preds_labels,
    target_names=class_labels,
    digits=4,
)
print(class_metric_report)

In [None]:
def plot_confusion_matrix(y_labels, preds, class_labels):
    """Plot the row-normalised confusion matrix as a heatmap and save it as PDF.

    Args:
        y_labels: one-hot encoded true labels, one row per sample.
        preds: predicted class scores, one row per sample.
        class_labels: class names, ordered by class index; used as tick labels
            on both axes.

    Returns:
        Path of the written PDF file.
    """
    y_true_labels = np.argmax(y_labels, axis=1)
    y_preds_labels = np.argmax(preds, axis=1)

    # Pin the matrix to the full list of classes so that it always matches the
    # tick labels, even when a class is absent from this data set.
    cm = confusion_matrix(y_true_labels, y_preds_labels,
                          labels=np.arange(len(class_labels)), normalize='true')

    df_cm = pd.DataFrame(cm)
    plt.figure(figsize=(20,15))
    # Both axes get the complete label list (previously the y axis received only
    # the first label).
    ax = sn.heatmap(df_cm, cmap='coolwarm', xticklabels=class_labels,
        yticklabels=class_labels, linewidths=.5, annot=True, fmt=".2f")
    # Axis titles are set after the heatmap call, because heatmap assigns its own
    # axis labels and would overwrite titles set earlier.
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Real')
    pdf_filename = 'confusion_matrix/cnn-' + 'confusion_matrix.pdf'
    # Save before showing so that the file does not depend on the figure
    # surviving plt.show().
    ax.get_figure().savefig(pdf_filename, dpi=400)
    plt.show()
    return pdf_filename

In [None]:
plot_confusion_matrix(y_test, preds, class_labels)