In [None]:
# Library
import time
import warnings
import zipfile

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Ignore all warnings
warnings.filterwarnings("ignore")

# Patch scikit-learn with the Intel(R) Extension BEFORE importing any estimator.
# patch_sklearn() swaps the classes inside the sklearn modules; a name that was
# already bound with `from sklearn... import X` keeps pointing at the stock class.
#!pip install scikit-learn-intelex -q --progress-bar off
from sklearnex import patch_sklearn
patch_sklearn()

# scikit-learn imports must come after patch_sklearn() (see note above)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)




---
Dataset preparation


In [None]:
## Download dataset

# Google Drive location of the dataset archive
url = 'https://drive.google.com/uc?id=1iUC1Pv-1JfYWUKMGZQ2xSOk0nYAM2f6i'

# Local filename the archive is saved under
output = 'UNSW_NB15.zip'

# Fetch the archive with a progress bar; as the last expression of the cell,
# the value returned by gdown is displayed below it
gdown.download(url=url, output=output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1iUC1Pv-1JfYWUKMGZQ2xSOk0nYAM2f6i
From (redirected): https://drive.google.com/uc?id=1iUC1Pv-1JfYWUKMGZQ2xSOk0nYAM2f6i&confirm=t&uuid=8ada942f-abdf-4c13-ab16-88606c471c30
To: /content/UNSW_NB15.zip
100%|██████████| 156M/156M [00:02<00:00, 54.8MB/s]


'UNSW_NB15.zip'

In [None]:
# Unzip dataset
# Extract every member of the downloaded archive into the current working directory
with zipfile.ZipFile('UNSW_NB15.zip', mode='r') as archive:
    archive.extractall()

In [None]:
## Load dataset for learning
# Encode problem: sniff the encoding of the feature-description file from its raw bytes
import chardet
with open('NUSW-NB15_features.csv', 'rb') as f:
    raw_bytes = f.read()
encoding = chardet.detect(raw_bytes)['encoding']

# Column names are taken from the 'Name' column of the feature-description file
feature_table = pd.read_csv('NUSW-NB15_features.csv', encoding=encoding)
cols = feature_table['Name'].tolist()

file_paths = ['UNSW-NB15_1.csv','UNSW-NB15_2.csv','UNSW-NB15_3.csv','UNSW-NB15_4.csv']

# Load each part; names=cols supplies the header, so the first row of every file is read as data
dfs = [pd.read_csv(path, names=cols) for path in file_paths]

# Stack the parts into one frame with a fresh 0..n-1 index
data = pd.concat(dfs, ignore_index=True)

# Normalise the attack category: trim whitespace, fold 'Backdoors' into 'Backdoor',
# and label rows without a category as 'Normal'
attack_cat = data['attack_cat'].str.strip()
attack_cat = attack_cat.str.replace('Backdoors', 'Backdoor')
data['attack_cat'] = attack_cat.fillna(value='Normal')

# Drop exact duplicate rows, then replace every remaining missing value with 0
data = data.drop_duplicates(ignore_index=True).fillna(0)

In [None]:
def balance_class(df, cls_col, cls, cls_size, random_state=42):
    """Resample a single class of ``df`` to exactly ``cls_size`` rows.

    Rows whose ``cls_col`` value differs from ``cls`` are kept unchanged. Rows of
    class ``cls`` are undersampled (without replacement) when there are too many,
    oversampled (with replacement) when there are too few, and kept as-is when
    the class already has ``cls_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cls_col : str
        Name of the column holding the class label.
    cls : object
        Label of the class to resample.
    cls_size : int
        Number of rows the class should have in the result (must be >= 0).
    random_state : int, optional
        Seed passed to ``DataFrame.sample``. Defaults to 42, the value that was
        previously hard-coded, so existing calls give the same result.

    Returns
    -------
    pandas.DataFrame
        The rows of all other classes followed by the resampled rows of ``cls``,
        with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``cls_size`` is negative, or if ``cls`` has no rows in ``df`` while
        ``cls_size`` is positive (nothing to oversample from).
    """
    if cls_size < 0:
        raise ValueError(f"cls_size must be non-negative, got {cls_size}")

    other_df = df[df[cls_col] != cls]
    cls_df = df[df[cls_col] == cls]
    current_class_size = len(cls_df)

    # Fail with a clear message instead of the opaque error raised by numpy
    # when sampling from an empty population.
    if current_class_size == 0 and cls_size > 0:
        raise ValueError(
            f"Cannot resample class {cls!r} to {cls_size} rows: "
            f"column {cls_col!r} contains no rows of that class"
        )

    if current_class_size > cls_size:
        # Undersample: reduce the number of samples
        cls_df_resampled = cls_df.sample(cls_size, random_state=random_state)
    elif current_class_size < cls_size:
        # Oversample: draw with replacement to increase the number of samples
        cls_df_resampled = cls_df.sample(cls_size, replace=True, random_state=random_state)
    else:
        cls_df_resampled = cls_df

    return pd.concat([other_df, cls_df_resampled]).reset_index(drop=True)

In [None]:
# Target number of rows per attack category; classes are resampled one at a
# time, in the order listed here
target_sizes = {
    'Analysis': 677,
    'Backdoor': 577,
    'DoS': 4089,
    'Exploits': 7061,
    'Fuzzers': 12062,
    'Generic': 5016,
    'Normal': 31395,
    'Reconnaissance': 1695,
    'Shellcode': 378,
    'Worms': 44,
}
for category, size in target_sizes.items():
    data = balance_class(data, 'attack_cat', category, size)

In [None]:
# Number of features
# data.shape is (rows, columns); the count covers every column of `data`
_, n_columns = data.shape
print('Number of features: ', n_columns)

Number of features:  49


In [None]:
# Record per class
# Row count of every attack category after balancing, followed by the total row count
class_counts = data.groupby('attack_cat').size()
total_records = data['attack_cat'].size
print('Record per class:\n', class_counts)
print('\nSum:\t\t', total_records)

Record per class:
 attack_cat
Analysis            677
Backdoor            577
DoS                4089
Exploits           7061
Fuzzers           12062
Generic            5016
Normal            31395
Reconnaissance     1695
Shellcode           378
Worms                44
dtype: int64

Sum:		 62994





Dataset preparation


---






---

Data preprocessing

In [None]:
# Encode categorical variables into numeric values
# Keep the original string class names; they are captured before the column is encoded
labels = data['attack_cat'].unique()

# One fitted encoder per object-typed column, keyed by column name, so an
# encoding can be reversed later
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    encoder = LabelEncoder()
    # Cast to str first so every value in the column has the same type
    as_text = data[column].astype(str)
    data[column] = encoder.fit_transform(as_text)
    label_encoders[column] = encoder

In [None]:
# Features: every column except 'attack_cat' and 'Label'
X = data.drop(columns=['attack_cat', 'Label'])

# Target: the attack category, decoded from its integer code back to the original string
attack_cat_encoder = label_encoders['attack_cat']
y = attack_cat_encoder.inverse_transform(data['attack_cat'])

In [None]:
# Column count of the feature matrix X
n_train_features = X.shape[1]
print('Number of train features: ', n_train_features)

Number of train features:  47


In [None]:
## Scaling: min-max rescale every feature column
# NOTE(review): the scaler is fitted on all rows before cross-validation, so the
# held-out folds also contribute to each column's min/max.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
# fit_transform returns a bare array; rebuild the frame with the original column names
X = pd.DataFrame(data=scaled_values, columns=X.columns)


Data preprocessing


---





---

Classification full feature

In [None]:
## Build the classifier (it is fitted inside the cross-validation loop)
# k-nearest neighbours with k=9; n_jobs=-1 requests all available processors
clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=9)

In [None]:
## Cross-validate the classifier on the full feature set
# Seed the shuffle so the fold assignment, and therefore the report, is
# reproducible across runs (the resampling step uses the same seed value)
kf = KFold(shuffle=True, random_state=42)
Y_test = []
Y_pred = []
# Fit the KNN classifier on each training fold and collect the out-of-fold predictions
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test.extend(y_test)
    Y_pred.extend(y_pred)

# Per-class and aggregate metrics over the pooled out-of-fold predictions
report = classification_report(Y_test, Y_pred, output_dict=True)

In [None]:
from tabulate import tabulate

# Entries of the report dict that are aggregates rather than per-class rows
summary_keys = ("accuracy", "macro avg", "weighted avg")
metric_names = ('precision', 'recall', 'f1-score', 'support')

# Header row followed by one row per class
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
table += [
    [class_name] + [scores[metric] for metric in metric_names]
    for class_name, scores in report.items()
    if class_name not in summary_keys
]

print(tabulate(table, headers="firstrow", tablefmt="grid"))
# Overall accuracy and the macro-averaged recall
print('\nAccuracy:',report['accuracy'],'\tRecall:',report['macro avg']['recall'])

+----------------+-------------+-----------+------------+-----------+
| Class          |   Precision |    Recall |   F1-Score |   Support |
| Analysis       |   0.116719  | 0.109306  |  0.112891  |       677 |
+----------------+-------------+-----------+------------+-----------+
| Backdoor       |   0.0538117 | 0.0415945 |  0.0469208 |       577 |
+----------------+-------------+-----------+------------+-----------+
| DoS            |   0.318689  | 0.328198  |  0.323373  |      4089 |
+----------------+-------------+-----------+------------+-----------+
| Exploits       |   0.666097  | 0.71732   |  0.69076   |      7061 |
+----------------+-------------+-----------+------------+-----------+
| Fuzzers        |   0.776846  | 0.891229  |  0.830116  |     12062 |
+----------------+-------------+-----------+------------+-----------+
| Generic        |   0.968     | 0.796053  |  0.873646  |      5016 |
+----------------+-------------+-----------+------------+-----------+
| Normal         |  

In [None]:
# Per-class precision taken from the report, in the order of `labels`
# (the variable keeps its historical name `accuracies`)
accuracies = [report[x]['precision'] for x in labels]

## Export radar chart for report
import plotly.io
plotly.io.renderers.default = "colab"
import plotly.graph_objects as go

# One filled polygon: radius = precision, angle = class name
radar_trace = go.Scatterpolar(r=accuracies, theta=labels, fill='toself')
fig = go.Figure(data=radar_trace)

# Show the radial axis and hide the legend (there is only one trace)
fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
)

fig.show()





Classification full feature


---





---

Classification 10 feature

In [None]:
# Load the saved feature-selection result from disk
# NOTE(review): presumably a boolean mask or index array over the columns of X — confirm
best_chromo = np.load(file='UNSW_best_chromo.npy')

In [None]:
# Keep all rows, but only the columns picked by best_chromo
Xfs = X.iloc[:, best_chromo]

In [None]:
## Build a fresh classifier for the reduced feature set (it is fitted inside the cross-validation loop)
# k-nearest neighbours with k=9; n_jobs=-1 requests all available processors
clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=9)

In [None]:
## Cross-validate the classifier on the selected features
# Seed the shuffle so the fold assignment, and therefore the report, is
# reproducible across runs (the resampling step uses the same seed value)
kf = KFold(shuffle=True, random_state=42)
Y_test_fs = []
Y_pred_fs = []
# Fit the KNN classifier on each training fold and collect the out-of-fold predictions
for train_index, test_index in kf.split(Xfs):
    X_train, X_test = Xfs.iloc[train_index], Xfs.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test_fs.extend(y_test)
    Y_pred_fs.extend(y_pred)

# Per-class and aggregate metrics over the pooled out-of-fold predictions
report_fs = classification_report(Y_test_fs, Y_pred_fs, output_dict=True)

In [None]:
from tabulate import tabulate

# Entries of the report dict that are aggregates rather than per-class rows
summary_keys = ("accuracy", "macro avg", "weighted avg")
metric_names = ('precision', 'recall', 'f1-score', 'support')

# Header row followed by one row per class
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
table += [
    [class_name] + [scores[metric] for metric in metric_names]
    for class_name, scores in report_fs.items()
    if class_name not in summary_keys
]

print(tabulate(table, headers="firstrow", tablefmt="grid"))
# Overall accuracy and the macro-averaged recall
print('\nAccuracy:',report_fs['accuracy'],'\tRecall:',report_fs['macro avg']['recall'])

+----------------+-------------+-----------+------------+-----------+
| Class          |   Precision |    Recall |   F1-Score |   Support |
| Analysis       |   0.0866667 | 0.249631  | 0.128664   |       677 |
+----------------+-------------+-----------+------------+-----------+
| Backdoor       |   0.0651387 | 0.0935875 | 0.0768137  |       577 |
+----------------+-------------+-----------+------------+-----------+
| DoS            |   0.233838  | 0.291025  | 0.259316   |      4089 |
+----------------+-------------+-----------+------------+-----------+
| Exploits       |   0.608493  | 0.61082   | 0.609654   |      7061 |
+----------------+-------------+-----------+------------+-----------+
| Fuzzers        |   0.70073   | 0.763472  | 0.730757   |     12062 |
+----------------+-------------+-----------+------------+-----------+
| Generic        |   0.924636  | 0.645734  | 0.760418   |      5016 |
+----------------+-------------+-----------+------------+-----------+
| Normal         |  

In [None]:
# Per-class precision taken from the feature-selected report, in the order of `labels`
# (the variable keeps its historical name `accuracies`)
accuracies = [report_fs[x]['precision'] for x in labels]

## Export radar chart for report
import plotly.io
plotly.io.renderers.default = "colab"
import plotly.graph_objects as go

# One filled polygon: radius = precision, angle = class name
radar_trace = go.Scatterpolar(r=accuracies, theta=labels, fill='toself')
fig = go.Figure(data=radar_trace)

# Show the radial axis and hide the legend (there is only one trace)
fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
)

fig.show()





Classification 10 feature


---

