# Analysis of Network Traffic for Intrusion Detection

This notebook is dedicated to analyzing network traffic data to detect potential intrusions. The analysis includes data preprocessing, feature exploration, and the application of machine learning models to classify network behavior as normal or suspicious. We used CNN, LSTM and GRU.

## Data Loading

In this section, we load the network traffic dataset from its source. The dataset includes various features related to network activity, such as source IP, destination IP, packet sizes, and timestamps. Understanding the structure of this data is crucial for our analysis and subsequent feature engineering.

In [12]:
import time
import json
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

from src.data.dataset_info import datasets
from src.models import MyCNN, MyLSTM, MyGRU
# from src.models.dense_nn import  MyDenseNN

#specifying main configuration of the experiment
multi_class = True
with_network_features = True

with_sort_timestamp = True
sequence_length = 3
with_cross_validation = True
cross_validation_splits_num = 5

# choosing the dataset
dataset = datasets[0]
name = dataset.name
print("dataset: {}".format(name))
#path = "./datasets/preprocessed/{}.pkl".format(name)
####path 
path = "datasets\\preprocessed\\{}.pkl".format(name)
# graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)

# loading the dataframe
df = pd.read_pickle(path)

dataset: cic_ton_iot


In [13]:

print(df[dataset.class_col].unique())
df[dataset.class_col].value_counts()

['Benign' 'mitm' 'scanning' 'dos' 'ddos' 'injection' 'password' 'backdoor'
 'ransomware' 'xss']


Attack
Benign        2514059
xss           2149308
password       340208
injection      277696
scanning        36205
backdoor        27145
ransomware       5098
mitm              517
ddos              202
dos               145
Name: count, dtype: int64

In [14]:
# the input dimension of the training set
input_dim = df.shape[1] - len(dataset.drop_columns) - len(dataset.weak_columns) - 1  # for the label_column
print(input_dim)
if not with_network_features:
    input_dim = input_dim - len(dataset.network_features)
    
# specifying the number of classes, since it is different from one dataset to another and also if binary or multi-class classification
num_classes = 2
if multi_class:
    num_classes = len(df["Attack"].unique())

num_epochs = 30
    
dropped_columns = dataset.drop_columns
dataset_name = dataset.name
print(input_dim)
print(num_classes)
print(len(dropped_columns))
print(len(dataset.weak_columns))
print(df.shape[1])
print(dataset_name)


59
59
10
7
39
106
cic_ton_iot


In [15]:
print(input_dim)
print(dataset_name)
print(num_classes)
print(multi_class)
print(len(dataset.network_features))

59
cic_ton_iot
10
True
22


## Models intialization

In [16]:
#from importlib import reload
#import src.models
#reload(src.models)
#from src.models import MyCNN
nf = []
if with_network_features:
    nf = dataset.network_features

models = [
    MyCNN(
        input_dim=input_dim,
        dataset_name=dataset_name,
        num_classes=num_classes,
        multi_class=multi_class,
        network_features=nf,
        epochs=num_epochs,
        batch_size=256,
        early_stop_patience=10,
    ),
    # MyLSTM(
    #     sequence_length=sequence_length,
    #     input_dim=input_dim,
    #     dataset_name=dataset_name,
    #     num_classes=num_classes,
    #     multi_class=multi_class,
    #     network_features=nf,
    #     use_generator=True,
    #     epochs=num_epochs,
    #     batch_size=256,,
        # early_stop_patience=10,
    # ),
    # MyGRU(
    #     sequence_length=sequence_length,
    #     input_dim=input_dim,
    #     dataset_name=dataset_name,
    #     num_classes=num_classes,
    #     multi_class=multi_class,
    #     network_features=nf,
    #     use_generator=True,
    #     epochs=num_epochs,
    #     batch_size=256,,
        # early_stop_patience=10,
    # )
]




In [17]:
results = {}  # a dictionary that will contain all the options and results of models
# add all options to the results dictionary, to know what options selected for obtained results
results["configuration"] = "stratified k-fold cross validation - manual sequences"
results["multi_class"] = multi_class
results["with_sort_timestamp"] = with_sort_timestamp
results["sequence_length"] = sequence_length
results["with_cross_validation"] = with_cross_validation
results["cross_validation_splits_num"] = cross_validation_splits_num
results["with_network_features"] = with_network_features
results["network_features"] = dataset.cn_measures

results["dataset_name"] = dataset_name
results["input_dim"] = input_dim
results["dropped_columns"] = dropped_columns
results["num_dropped_columns"] = len(dropped_columns)

results["models"] = {}
results["average_acc"] = {}
results["average"] = {}

In [18]:
df[dataset.timestamp_col]

0          25/04/2019 05:18:52 pm
1          25/04/2019 05:18:49 pm
2          25/04/2019 05:18:37 pm
3          25/04/2019 05:18:42 pm
4          25/04/2019 05:18:42 pm
                    ...          
5351755    25/04/2019 04:34:34 pm
5351756    25/04/2019 04:30:56 pm
5351757    25/04/2019 04:48:39 pm
5351758    25/04/2019 05:01:42 pm
5351759    25/04/2019 04:09:55 pm
Name: Timestamp, Length: 5350583, dtype: object

In [19]:
df.columns = df.columns.str.strip()

## Data Preprocessing

In [20]:
if with_sort_timestamp:
    df[dataset.timestamp_col] = pd.to_datetime(df[dataset.timestamp_col].str.strip(), format=dataset.timestamp_format)
    df.sort_values(dataset.timestamp_col, inplace= True)
labels_names = {0: "benign", 1: "attack"}
print(f"==>> labels_names: {labels_names}")
if multi_class:
    fac = pd.factorize(df[dataset.class_col])
    labels_names = {index: value for index, value in enumerate(fac[1])}
    print(f"==>> labels_names: {labels_names}")
    df[dataset.label_col] = fac[0]  # type: ignore

print()
df.drop(dataset.drop_columns, axis=1, inplace=True)
df.drop(dataset.weak_columns, axis=1, inplace=True)

#if not with_network_features:
 #   df = df.drop(dataset.network_features, axis=1)
if not with_network_features:
    df = df.drop([col for col in dataset.network_features if col in df.columns], axis=1)
df.reset_index(drop=True, inplace=True)
print(f"==>> labels_names: {labels_names}")

==>> labels_names: {0: 'benign', 1: 'attack'}
==>> labels_names: {0: 'Benign', 1: 'xss', 2: 'password', 3: 'scanning', 4: 'injection', 5: 'ransomware', 6: 'backdoor', 7: 'mitm', 8: 'ddos', 9: 'dos'}

==>> labels_names: {0: 'Benign', 1: 'xss', 2: 'password', 3: 'scanning', 4: 'injection', 5: 'ransomware', 6: 'backdoor', 7: 'mitm', 8: 'ddos', 9: 'dos'}


In [11]:
print("Columns to be dropped:", dataset.drop_columns)
print("Weak columns to be dropped:", dataset.weak_columns)
print("Remaining columns after dropping:", df.columns)

Columns to be dropped: ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'Src Port', 'Dst Port', 'Attack']
Weak columns to be dropped: ['Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'URG Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Subflow Bwd Pkts', 'Flow IAT Mean', 'Fwd Pkt Len Max', 'Flow IAT Max', 'Active Std', 'Bwd Header Len', 'Tot Bwd Pkts', 'Bwd Pkt Len Mean', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'CWE Flag Count', 'Bwd IAT Tot', 'Fwd IAT Mean', 'Fwd Pkt Len Std', 'Pkt Len Mean', 'Flow IAT Min', 'TotLen Bwd Pkts', 'Bwd Pkt Len Max', 'Pkt Len Var', 'FIN Flag Cnt', 'Bwd IAT Mean', 'Idle Mean', 'Pkt Len Max', 'Flow Pkts/s', 'Flow Duration', 'Pkt Len Std', 'Fwd IAT Tot', 'PSH Flag Cnt', 'Active Mean', 'Bwd Pkt Len Std', 'Fwd Pkt Len Mean']
Remaining columns after dropping: Index(['Protocol', 'Tot Fwd Pkts', 'TotLen Fwd Pkts', 'Fwd Pkt Len Min',
       'Bwd Pkt Len Min', 'Flow Byts/s', 'Flow IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd

In [12]:
print(len(df.columns))
print("Columns to be dropped:", dataset.drop_columns)
print("Weak columns to be dropped:", dataset.weak_columns)
print("Existing columns in the DataFrame:", df.columns)

60
Columns to be dropped: ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'Src Port', 'Dst Port', 'Attack']
Weak columns to be dropped: ['Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'URG Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Subflow Bwd Pkts', 'Flow IAT Mean', 'Fwd Pkt Len Max', 'Flow IAT Max', 'Active Std', 'Bwd Header Len', 'Tot Bwd Pkts', 'Bwd Pkt Len Mean', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'CWE Flag Count', 'Bwd IAT Tot', 'Fwd IAT Mean', 'Fwd Pkt Len Std', 'Pkt Len Mean', 'Flow IAT Min', 'TotLen Bwd Pkts', 'Bwd Pkt Len Max', 'Pkt Len Var', 'FIN Flag Cnt', 'Bwd IAT Mean', 'Idle Mean', 'Pkt Len Max', 'Flow Pkts/s', 'Flow Duration', 'Pkt Len Std', 'Fwd IAT Tot', 'PSH Flag Cnt', 'Active Mean', 'Bwd Pkt Len Std', 'Fwd Pkt Len Mean']
Existing columns in the DataFrame: Index(['Protocol', 'Tot Fwd Pkts', 'TotLen Fwd Pkts', 'Fwd Pkt Len Min',
       'Bwd Pkt Len Min', 'Flow Byts/s', 'Flow IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 

In [15]:
labels = df['Label'].to_numpy()
df = df.drop([dataset.label_col], axis=1).to_numpy()


In [14]:
df["Label"].unique()

array([0, 1], dtype=int64)

## Time series Training

In [16]:

tscv = TimeSeriesSplit(n_splits=cross_validation_splits_num)
print(tscv)
print(df.shape)
print(df.shape[0])
i = 0
for train_index, test_index in tscv.split(df):
    training_labels = labels[train_index]
    print(f"==>> train_index: {train_index}")
    print(f"==>> training_labels: {training_labels.shape}")
    testing_labels = labels[test_index]
    print(f"==>> test_index: {test_index}")
    print(f"==>> testing_labels: {testing_labels.shape}")

    i += 1
    print("fold: {}".format(i))
    # print("train_index: {}".format(train_index))
    print("=====================================")
    print("=====================================")
    # print("fold: {}/{}".format(i, len(list_of_dfs)))
    print("fold: {}/{}".format(i, cross_validation_splits_num))

    for model in models:
        print("training: {}".format(model.model_name()))
        print("sequential: {}".format(model.sequential))

        training = df[train_index]
        testing = df[test_index]
        
        scaler = MinMaxScaler()
        training = scaler.fit_transform(training)
        testing = scaler.transform(testing)

        model.build()
        print("Shape of input data:", training.shape)
        print("Shape of input data:", training_labels.shape)
        model.train(training,
                    training_labels)  # type: ignore
        predictions, prediction_time = model.predict(
            testing)  # type: ignore
        model_name, scores, class_report = model.evaluate(  # type: ignore
            predictions,
            testing_labels,
            prediction_time
        )
        scores["fold"] = i
        if i == 1:
            results["models"][model_name] = {}
            results["models"][model_name]["scores"] = [scores]
            results["models"][model_name]["class_report"] = [class_report]
        else:
            results["models"][model_name]["scores"].append(scores)
            results["models"][model_name]["class_report"].append(
                class_report)
        # results[str(i) + model_name] = scores
        print("{}: {}".format(model_name, scores))

    for model in models:
        model_name = model.model_name()
        average_acc = 0
        average_recall = 0
        average_precision = 0
        average_f1s = 0
        average_FPR = 0
        average_FNR = 0
        for result in results["models"][model_name]["scores"]:  # type: ignore
            average_acc += result["accuracy"]
            average_recall += result["recall"]
            average_precision += result["precision"]
            average_f1s += result["f1s"]
            average_FPR += result["FPR"]
            average_FNR += result["FNR"]
        average_acc = average_acc / i
        average_recall = average_recall / i
        average_precision = average_precision / i
        average_f1s = average_f1s / i
        average_FPR = average_FPR / i
        average_FNR = average_FNR / i
        if i == 1:
            results["models"][model_name]["average"] = [
                {
                    "average_acc": average_acc,
                    "average_recall": average_recall,
                    "average_precision": average_precision,
                    "average_f1s": average_f1s,
                    "average_FPR": average_FPR,
                    "average_FNR": average_FNR,
                    "fold": i
                }
            ]
            results["average_acc"][model_name] = average_acc
            results["average"][model_name] = {
                "average_acc": average_acc,
                "average_recall": average_recall,
                "average_precision": average_precision,
                "average_f1s": average_f1s,
                "average_FPR": average_FPR,
                "average_FNR": average_FNR
            }
        else:
            results["models"][model_name]["average"].append(
                {
                    "average_acc": average_acc,
                    "average_recall": average_recall,
                    "average_precision": average_precision,
                    "average_f1s": average_f1s,
                    "average_FPR": average_FPR,
                    "average_FNR": average_FNR,
                    "fold": i
                })
            results["average_acc"][model_name] = average_acc
            results["average"][model_name] = {
                "average_acc": average_acc,
                "average_recall": average_recall,
                "average_precision": average_precision,
                "average_f1s": average_f1s,
                "average_FPR": average_FPR,
                "average_FNR": average_FNR
            }
        print("{} average accuracy: {}".format(model_name, average_acc))

results["endtime"] = time.strftime("%Y:%m:%d-%H:%M:%S")

print(f"==>> results: {results}")


TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
(5350583, 59)
5350583
==>> train_index: [     0      1      2 ... 891765 891766 891767]
==>> training_labels: (891768,)
==>> test_index: [ 891768  891769  891770 ... 1783528 1783529 1783530]
==>> testing_labels: (891763,)
fold: 1
fold: 1/5
training: cnn bc nf cnn-64-64
sequential: False


hereeeeeeeeee
59
not hereeeeeeeeeeeee
Shape of input data: (891768, 59)
Shape of input data: (891768,)
Epoch 1/30


Epoch 1: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-01-0.0210.hdf5
Epoch 2/30
   1/3484 [..............................] - ETA: 1:13 - loss: 0.0018 - accuracy: 1.0000

  saving_api.save_model(


Epoch 2: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-02-0.0160.hdf5
Epoch 3/30
Epoch 3: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-03-0.0157.hdf5
Epoch 4/30
Epoch 4: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-04-0.0156.hdf5
Epoch 5/30
Epoch 5: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-05-0.0155.hdf5
Epoch 6/30
Epoch 6: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-06-0.0154.hdf5
Epoch 7/30
Epoch 7: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-07-0.0153.hdf5
Epoch 8/30
Epoch 8: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-08-0.0153.hdf5
Epoch 9/30
Epoch 9: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-09-0.0152.hdf5
Epoch 10/30
Epoch 10: saving model to ./models/weig

  saving_api.save_model(


Epoch 2: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-02-0.0630.hdf5
Epoch 3/30
Epoch 3: saving model to ./models/weights/cic_ton_iot/cnn bc nf cnn-64-64\weights-improvement-03-0.0615.hdf5
Epoch 4/30

KeyboardInterrupt: 

## Saving Results

In [None]:
# creating the directories if they don't exist
if not os.path.isdir('./results'):
    os.mkdir('./results')

if not os.path.isdir('./results/{}'.format(dataset_name)):
    os.mkdir('./results/{}'.format(dataset_name))

class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)

# saving the results to a file for future refernece
filename = ('./results/{}/{}.json'.format(dataset_name,
            time.strftime("%Y%m%d-%H%M%S")))
outfile = open(filename, 'w')
outfile.writelines(json.dumps(results, cls=NumpyEncoder))
outfile.close()
