In [None]:
# Libraries
# Standard library
import time
import warnings
import zipfile

# Third-party (non scikit-learn)
import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Ignore all warnings
# NOTE(review): this is a blanket filter, so any warning raised by the
# estimators during fitting is hidden as well.
warnings.filterwarnings("ignore")

# Intel(R) Extension for Scikit-learn has to patch scikit-learn BEFORE the
# scikit-learn names are imported; names bound by an earlier
# `from sklearn... import ...` keep pointing at the unpatched classes.
#!pip install scikit-learn-intelex -q --progress-bar off
from sklearnex import patch_sklearn
patch_sklearn()

# scikit-learn (imported after patching on purpose)
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)






---


Dataset preparation




In [None]:
# Download dataset
# Google Drive direct-download link of the zipped dataset.
url = 'https://drive.google.com/uc?id=1ELj3s17Zpy5KmMzGSNh8jDYFWJ909K5N'

# Local file name for the downloaded archive (relative to the working directory)
output = 'Bot_iot_0_05.zip'

# Download the file; the call is unconditional, so the archive is fetched again
# on every run. As the last expression of the cell its return value is displayed.
gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1ELj3s17Zpy5KmMzGSNh8jDYFWJ909K5N
From (redirected): https://drive.google.com/uc?id=1ELj3s17Zpy5KmMzGSNh8jDYFWJ909K5N&confirm=t&uuid=18a1c3d2-6a59-4663-8865-027e643c12b1
To: /content/Bot_iot_0_05.zip
100%|██████████| 57.2M/57.2M [00:01<00:00, 32.8MB/s]


'Bot_iot_0_05.zip'

In [None]:
# Unzip dataset into the current working directory
# (extractall() is called without a target path).
with zipfile.ZipFile('Bot_iot_0_05.zip', 'r') as zip_ref:
  zip_ref.extractall()

In [None]:
## Load dataset for learning
# The data is read from four CSV parts: reduced_data_1.csv ... reduced_data_4.csv
file_paths = list(map('reduced_data_{}.csv'.format, range(1, 4+1)))

# One DataFrame per CSV part, in file order
dfs = list(map(pd.read_csv, file_paths))

# Stack the parts into a single frame with a fresh 0..n-1 index, then remove
# the columns that are not used in the rest of the notebook.
data = (
    pd.concat(dfs, ignore_index=True)
    .drop(columns=[
        'pkSeqID', 'stime', 'ltime', 'flgs_number', 'proto_number',
        'saddr', 'sport', 'daddr', 'pkts', 'bytes', 'state_number',
        'seq', 'dur', 'rate',
    ])
)

In [None]:
def balance_class(df, cls_col, cls, cls_size, random_state=42):
    """Resample a single class of ``df`` to exactly ``cls_size`` rows.

    Rows whose ``cls_col`` value differs from ``cls`` are kept unchanged and
    come first in the result; the (re)sampled rows of ``cls`` are appended
    after them and the index is reset to ``0..n-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cls_col : str
        Name of the column holding the class label.
    cls : object
        Label of the class to resample.
    cls_size : int
        Desired number of rows for ``cls``. A larger class is undersampled
        without replacement, a smaller one is oversampled with replacement,
        and a class that already has this size is kept as is.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``. Defaults to 42, the value that
        used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``cls`` has exactly ``cls_size`` rows.

    Raises
    ------
    ValueError
        If ``cls`` has no rows in ``df`` while ``cls_size`` is positive:
        there is nothing to oversample from.
    """
    other_df = df[df[cls_col] != cls]
    cls_df = df[df[cls_col] == cls]
    current_class_size = len(cls_df)

    if current_class_size == 0 and cls_size > 0:
        # Sampling from an empty frame fails with an unrelated-looking error;
        # report the actual problem instead.
        raise ValueError(
            f"cannot resample class {cls!r} to {cls_size} rows: "
            f"column {cls_col!r} contains no such value"
        )

    if current_class_size != cls_size:
        # Replacement is only needed (and only used) when growing the class.
        cls_df = cls_df.sample(
            cls_size,
            replace=current_class_size < cls_size,
            random_state=random_state,
        )

    return pd.concat([other_df, cls_df]).reset_index(drop=True)

In [None]:
# Target row count for each attack category, applied one after another in this
# order. 'Normal' is not listed, so its rows are never resampled.
for attack_cls, target_size in [
    ('DDoS', 240000),
    ('DoS', 242788),
    ('Reconnaissance', 182166),
    ('Theft', 160),
]:
    data = balance_class(data, 'category', attack_cls, target_size)

In [None]:
# Number of columns in `data` at this point; the label columns
# ('attack', 'category', 'subcategory') have not been dropped yet and are counted too.
print('Number of features: ',data.shape[1])

Number of features:  32


In [None]:
# Records per class after balancing, followed by the total number of rows
print('Record per class:\n',data.groupby('category').size())
print('\nSum:\t\t',data['category'].size)

Record per class:
 category
DDoS              240000
DoS               242788
Normal               477
Reconnaissance    182166
Theft                160
dtype: int64

Sum:		 665591





Dataset preparation


---






---

Data preprocessing

In [None]:
# Encode categorical variables into numeric values.
# NOTE: `data` is overwritten in place, so this cell is not idempotent: on a
# second run there are no object columns left, `label_encoders` ends up empty
# and `labels` holds the integer codes instead of the class names.
labels = data['category'].unique()  # class names, captured before encoding
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    # Cast to str first so every value in the column is encoded as text.
    data[column] = encoder.fit_transform(data[column].astype(str))
    label_encoders[column] = encoder  # kept so the codes can be decoded later

In [None]:
# Feature matrix: everything except the three label columns.
X = data.drop(columns=['attack', 'category', 'subcategory'])
# Target: the 'category' codes decoded back through the fitted encoder, so the
# classification reports are keyed by class name rather than by integer code.
y = label_encoders['category'].inverse_transform(data['category'])

In [None]:
# Number of input columns left in X once the label columns are removed
print('Number of train features: ',X.shape[1])

Number of train features:  29


In [None]:
## Min-max scaling of every feature column
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split below, so the per-column min/max also come from rows that later serve
# as test folds. Confirm this is acceptable, or fit the scaler per fold.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X),columns=X.columns)


Data preprocessing


---





---

Classification full feature

In [None]:
# Create an SVM classifier with a linear kernel (default hyper-parameters).
# The same instance is re-fitted on each fold in the training cell below.
clf = svm.LinearSVC()

In [None]:
## Train classifier
# K-fold cross-validation with shuffling. The seed is fixed so that the fold
# assignment, and therefore every metric reported below, is reproducible
# across kernel restarts (42 matches the seed used when balancing the classes).
kf = KFold(shuffle=True, random_state=42)
# Ground truth and predictions pooled over all test folds
Y_test = []
Y_pred = []
# Train the SVM model
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit() starts from scratch, so nothing carries over from the previous fold
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test.extend(y_test)
    Y_pred.extend(y_pred)

# One report over the pooled out-of-fold predictions (every row is tested once)
report = classification_report(Y_test, Y_pred, output_dict=True)

In [None]:
from tabulate import tabulate

# Header row first, then one row per class; the aggregate entries of the
# report ("accuracy", "macro avg", "weighted avg") are left out of the table.
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
table.extend(
    [name, scores['precision'], scores['recall'], scores['f1-score'], scores['support']]
    for name, scores in report.items()
    if name not in ("accuracy", "macro avg", "weighted avg")
)

print(tabulate(table, headers="firstrow", tablefmt="grid"))
# Overall accuracy and the macro-averaged recall
print('\nAccuracy:',report['accuracy'],'\tRecall:',report['macro avg']['recall'])

+----------------+-------------+----------+------------+-----------+
| Class          |   Precision |   Recall |   F1-Score |   Support |
| DDoS           |    0.887121 | 0.868617 |  0.877771  |    240000 |
+----------------+-------------+----------+------------+-----------+
| DoS            |    0.878969 | 0.898623 |  0.888687  |    242788 |
+----------------+-------------+----------+------------+-----------+
| Normal         |    0.974432 | 0.719078 |  0.827503  |       477 |
+----------------+-------------+----------+------------+-----------+
| Reconnaissance |    0.94569  | 0.944781 |  0.945235  |    182166 |
+----------------+-------------+----------+------------+-----------+
| Theft          |    0.108108 | 0.025    |  0.0406091 |       160 |
+----------------+-------------+----------+------------+-----------+

Accuracy: 0.9000978078129062 	Recall: 0.6912197717889326


In [None]:
# Per-class precision taken from the report, in the order of `labels`.
# (These are precision values, not accuracies.)
class_precisions = [report[x]['precision'] for x in labels]

## Export radar chart for report
import plotly.io
plotly.io.renderers.default = "colab"
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatterpolar(
  r=class_precisions,
  theta=labels,
  fill='toself'
))

fig.update_layout(
  title='Per-class precision (all features)',
  polar=dict(
    radialaxis=dict(
      visible=True,
      # Precision lies in [0, 1]; a fixed range keeps this chart directly
      # comparable with other radar charts of the same metric.
      range=[0, 1]
    ),
  ),
  showlegend=False
)

fig.show()





Classification full feature


---





---

Classification 10 features

In [None]:
# Load the saved feature-selection result used to pick the reduced feature set.
# NOTE(review): this .npy file is not created anywhere in the visible part of
# the notebook; it presumably has to be present in the working directory
# beforehand. Confirm where it comes from.
best_chromo = np.load('BOT_IOT_005_best_chromo.npy')

In [None]:
# Reduced feature matrix.
# `iloc` treats an integer array as column POSITIONS. If `best_chromo` is a 0/1
# chromosome with one gene per feature, that would select only columns 0 and 1,
# over and over. Such a vector is therefore converted to a boolean mask first.
# Position arrays and boolean masks pass through unchanged.
feature_selector = np.asarray(best_chromo)
if (feature_selector.dtype != bool
        and feature_selector.shape == (X.shape[1],)
        and np.isin(feature_selector, (0, 1)).all()):
    feature_selector = feature_selector.astype(bool)
Xfs = X.iloc[:,feature_selector]

In [None]:
# Create an SVM classifier with a linear kernel (default hyper-parameters).
# This fresh instance replaces the earlier `clf` and is re-fitted on each fold
# of the reduced-feature run below.
clf = svm.LinearSVC()

In [None]:
## Train classifier
# K-fold cross-validation with shuffling. The seed is fixed so that the fold
# assignment, and therefore every metric reported below, is reproducible
# across kernel restarts (42 matches the seed used when balancing the classes).
kf = KFold(shuffle=True, random_state=42)
# Ground truth and predictions pooled over all test folds
Y_test_fs = []
Y_pred_fs = []
# Train the SVM model
for train_index, test_index in kf.split(Xfs):
    X_train, X_test = Xfs.iloc[train_index], Xfs.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit() starts from scratch, so nothing carries over from the previous fold
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test_fs.extend(y_test)
    Y_pred_fs.extend(y_pred)

# One report over the pooled out-of-fold predictions (every row is tested once)
report_fs = classification_report(Y_test_fs, Y_pred_fs, output_dict=True)

In [None]:
from tabulate import tabulate

# Header row first, then one row per class; the aggregate entries of the
# report ("accuracy", "macro avg", "weighted avg") are left out of the table.
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
table.extend(
    [name, scores['precision'], scores['recall'], scores['f1-score'], scores['support']]
    for name, scores in report_fs.items()
    if name not in ("accuracy", "macro avg", "weighted avg")
)

print(tabulate(table, headers="firstrow", tablefmt="grid"))
# Overall accuracy and the macro-averaged recall
print('\nAccuracy:',report_fs['accuracy'],'\tRecall:',report_fs['macro avg']['recall'])

+----------------+-------------+----------+------------+-----------+
| Class          |   Precision |   Recall |   F1-Score |   Support |
| DDoS           |    0.76764  | 0.835675 |   0.800214 |    240000 |
+----------------+-------------+----------+------------+-----------+
| DoS            |    0.867045 | 0.723372 |   0.788719 |    242788 |
+----------------+-------------+----------+------------+-----------+
| Normal         |    0        | 0        |   0        |       477 |
+----------------+-------------+----------+------------+-----------+
| Reconnaissance |    0.831505 | 0.920957 |   0.873948 |    182166 |
+----------------+-------------+----------+------------+-----------+
| Theft          |    0        | 0        |   0        |       160 |
+----------------+-------------+----------+------------+-----------+

Accuracy: 0.8172511347058479 	Recall: 0.49600070807647156


In [None]:
# Per-class precision taken from the reduced-feature report, in the order of
# `labels`. (These are precision values, not accuracies.)
class_precisions_fs = [report_fs[x]['precision'] for x in labels]

## Export radar chart for report
import plotly.io
plotly.io.renderers.default = "colab"
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatterpolar(
  r=class_precisions_fs,
  theta=labels,
  fill='toself'
))

fig.update_layout(
  title='Per-class precision (selected features)',
  polar=dict(
    radialaxis=dict(
      visible=True,
      # Precision lies in [0, 1]; a fixed range keeps this chart directly
      # comparable with other radar charts of the same metric.
      range=[0, 1]
    ),
  ),
  showlegend=False
)

fig.show()





Classification 10 features


---

