In [None]:
import matplotlib.pyplot as plt
import numpy as np
import gdown
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report
import warnings
import zipfile
from xgboost import XGBClassifier
import chardet
from tabulate import tabulate
import plotly.io
import plotly.graph_objects as go

# Ignore all warnings
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category, so warnings raised by any later cell are hidden as well.
warnings.filterwarnings("ignore")


In [None]:
# Download the dataset archive from Google Drive
url = 'https://drive.google.com/uc?id=1iUC1Pv-1JfYWUKMGZQ2xSOk0nYAM2f6i'
output = 'UNSW_NB15.zip'
gdown.download(url, output, quiet=False)

# Extract every member of the archive into the current working directory
with zipfile.ZipFile(output) as zip_ref:
    zip_ref.extractall()


Downloading...
From (original): https://drive.google.com/uc?id=1iUC1Pv-1JfYWUKMGZQ2xSOk0nYAM2f6i
From (redirected): https://drive.google.com/uc?id=1iUC1Pv-1JfYWUKMGZQ2xSOk0nYAM2f6i&confirm=t&uuid=fcf5d761-fedf-40c8-be5e-2b3fb6a0d74b
To: /content/UNSW_NB15.zip
100%|██████████| 156M/156M [00:06<00:00, 22.9MB/s]


In [None]:
## Load dataset for learning
# Detect the encoding of the feature-description file instead of assuming one.
features_path = 'NUSW-NB15_features.csv'
with open(features_path, 'rb') as f:
    encoding = chardet.detect(f.read())['encoding']

# The 'Name' column of the description file supplies the column names
# for the header-less data files.
cols = pd.read_csv(features_path, encoding=encoding)['Name'].tolist()

file_paths = [f'UNSW-NB15_{part}.csv' for part in range(1, 5)]

# One DataFrame per CSV part, all sharing the same column names
dfs = [pd.read_csv(file, names=cols) for file in file_paths]

# Stack the parts into a single frame
data = pd.concat(dfs, ignore_index=True)

# Normalise the category text, then label rows without a category as 'Normal'
attack_cat = data['attack_cat'].str.strip().str.replace('Backdoors', 'Backdoor')
data['attack_cat'] = attack_cat.fillna(value='Normal')

# Remove exact duplicate rows and replace the remaining missing values with 0
data = data.drop_duplicates(ignore_index=True).fillna(0)

In [None]:
def balance_class(df, cls_col, cls, cls_size, random_state=42):
    """Resample a single class of ``df`` so that it has ``cls_size`` rows.

    Rows whose ``cls_col`` value differs from ``cls`` are kept unchanged and
    placed first in the result. The rows of ``cls`` are appended after them:
    sampled without replacement when the class is larger than ``cls_size``,
    sampled with replacement when it is smaller, and kept as they are when
    the size already matches. The returned frame gets a fresh 0..n-1 index.

    Args:
        df: Input DataFrame; it is not modified.
        cls_col: Name of the column holding the class label.
        cls: Label of the class to resample.
        cls_size: Number of rows of ``cls`` wanted in the result.
        random_state: Seed passed to ``DataFrame.sample``. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        A new DataFrame containing the untouched classes followed by the
        resampled class.

    Raises:
        ValueError: If ``cls`` does not occur in ``df[cls_col]`` while
            ``cls_size`` is positive (there is nothing to oversample from).
    """
    other_df = df[df[cls_col] != cls]
    cls_df = df[df[cls_col] == cls]
    current_class_size = len(cls_df)

    if current_class_size == 0 and cls_size > 0:
        # Sampling from an empty frame fails with an obscure message;
        # report the real cause instead.
        raise ValueError(
            f"Cannot resample class {cls!r} to {cls_size} rows: "
            f"no rows with {cls_col} == {cls!r} were found"
        )

    if current_class_size != cls_size:
        # Replacement is only needed when growing the class (oversampling).
        cls_df = cls_df.sample(
            cls_size,
            replace=current_class_size < cls_size,
            random_state=random_state,
        )

    return pd.concat([other_df, cls_df]).reset_index(drop=True)

# Target number of rows for every attack category. The categories are
# resampled one after another, in the order listed here.
target_class_sizes = {
    'Analysis': 677,
    'Backdoor': 577,
    'DoS': 4089,
    'Exploits': 7061,
    'Fuzzers': 12062,
    'Generic': 5016,
    'Normal': 31395,
    'Reconnaissance': 1695,
    'Shellcode': 378,
    'Worms': 44,
}

for category, target_size in target_class_sizes.items():
    data = balance_class(data, 'attack_cat', category, target_size)


In [None]:
# `X` is only created in a later cell, so reading it here fails on a fresh
# kernel. Derive the count from `data` instead: every column except the two
# target columns ('attack_cat' and 'Label') is used as a training feature.
n_train_features = len(data.columns.drop(['attack_cat', 'Label']))
print('Number of train features: ', n_train_features)

Number of train features:  47


In [None]:
# Number of records in each attack category, followed by the overall total
class_counts = data.groupby('attack_cat').size()
total_records = data['attack_cat'].size

print('Record per class:\n', class_counts)
print('\nSum:\t\t', total_records)

Record per class:
 attack_cat
Analysis            677
Backdoor            577
DoS                4089
Exploits           7061
Fuzzers           12062
Generic            5016
Normal            31395
Reconnaissance     1695
Shellcode           378
Worms                44
dtype: int64

Sum:		 62994


In [None]:
# Keep the category values as they are before being replaced by integer codes
labels = data['attack_cat'].unique()

# Fit one LabelEncoder per object-typed column and overwrite the column with
# its integer codes. Values are cast to str first so every entry in a column
# is encoded as text. The fitted encoders are kept so that codes can be
# mapped back to their original values later.
label_encoders = {}
object_columns = data.select_dtypes(include=['object']).columns
for column in object_columns:
    encoder = LabelEncoder()
    data[column] = data[column].astype(str)
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Features: everything except the two target columns. Target: attack category.
X = data.drop(columns=['attack_cat', 'Label'])
y = data['attack_cat']

In [None]:
# Min-max scale every feature column
# NOTE(review): the scaler is fitted on the complete feature matrix, i.e.
# before any train/test split is made in the cells that follow.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)


In [None]:
# XGBoost hyperparameters
xgb_params = {
    'max_depth': 5,           # Maximum depth of each decision tree
    'learning_rate': 0.1,     # Learning rate of the model
    'n_estimators': 20,       # Number of trees in the model
    'objective': 'multi:softmax'  # Objective: multi-class classification
}

# Create the XGBoost model with the parameters above
clf = XGBClassifier(**xgb_params)

# Seed the shuffle so the folds, and therefore the reported metrics, are the
# same on every run of the notebook.
kf = KFold(shuffle=True, random_state=42)
Y_test = []
Y_pred = []

# Collect the held-out predictions of every fold so that a single report
# covers each sample exactly once.
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test.extend(y_test)
    Y_pred.extend(y_pred)

report = classification_report(Y_test, Y_pred, output_dict=True)

# Only the per-class entries go into the table; the aggregate entries are
# printed separately below.
summary_keys = ("accuracy", "macro avg", "weighted avg")
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
for key, value in report.items():
    if key not in summary_keys:
        table.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])

print(tabulate(table, headers="firstrow", tablefmt="grid"))
print('\nAccuracy:', report['accuracy'], '\tRecall:', report['macro avg']['recall'])


+---------+-------------+-----------+------------+-----------+
|   Class |   Precision |    Recall |   F1-Score |   Support |
|       0 |    0.559441 | 0.118168  |   0.195122 |       677 |
+---------+-------------+-----------+------------+-----------+
|       1 |    0.818182 | 0.0779896 |   0.142405 |       577 |
+---------+-------------+-----------+------------+-----------+
|       2 |    0.384225 | 0.537295  |   0.448047 |      4089 |
+---------+-------------+-----------+------------+-----------+
|       3 |    0.715152 | 0.818156  |   0.763194 |      7061 |
+---------+-------------+-----------+------------+-----------+
|       4 |    0.886575 | 0.907229  |   0.896783 |     12062 |
+---------+-------------+-----------+------------+-----------+
|       5 |    0.98273  | 0.816786  |   0.892107 |      5016 |
+---------+-------------+-----------+------------+-----------+
|       6 |    0.999774 | 0.984711  |   0.992185 |     31395 |
+---------+-------------+-----------+------------+-----

In [None]:
# Radar chart for full feature classification
# One axis per attack category; the report is keyed by the encoded class
# number as a string, in the same order as the encoder's classes_.
encoded_labels = list(label_encoders['attack_cat'].classes_)
# NOTE: despite its name, this list holds the per-class precision values.
accuracies = [report[str(class_id)]['precision'] for class_id in range(len(encoded_labels))]

radar_trace = go.Scatterpolar(
    r=accuracies,
    theta=encoded_labels,
    fill='toself',
)

fig = go.Figure(data=radar_trace)
fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
)

fig.show()

In [None]:
# Copy the saved chromosome array from the Drive folder into the working directory.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# no mount step is visible in this notebook; confirm before running.
!cp /content/drive/MyDrive/Best_chormo/UNSW_best_chromo.npy UNSW_best_chromo.npy

In [None]:
# Classification with the feature subset stored in the saved chromosome
best_chromo = np.load('UNSW_best_chromo.npy')

# `.iloc` treats a boolean array as a column mask but an integer array as
# column positions. A chromosome saved as a 0/1 integer mask (one entry per
# feature) would therefore select columns 0 and 1 over and over instead of
# the flagged features, so convert that case to a real boolean mask.
# Boolean masks and genuine position arrays are used unchanged.
feature_selector = np.asarray(best_chromo)
if (
    feature_selector.dtype != bool
    and feature_selector.ndim == 1
    and feature_selector.size == X.shape[1]
    and np.isin(feature_selector, (0, 1)).all()
):
    feature_selector = feature_selector.astype(bool)

Xfs = X.iloc[:, feature_selector]

In [None]:
# XGBoost hyperparameters
xgb_params = {
    'max_depth': 5,           # Maximum depth of each decision tree
    'learning_rate': 0.1,     # Learning rate of the model
    'n_estimators': 20,       # Number of trees in the model
    'objective': 'multi:softmax'  # Objective: multi-class classification
}

# Create the XGBoost model with the parameters above
clf = XGBClassifier(**xgb_params)

# Seed the shuffle so the folds, and therefore the reported metrics, are the
# same on every run of the notebook.
kf = KFold(shuffle=True, random_state=42)
Y_test_fs = []
Y_pred_fs = []

# Collect the held-out predictions of every fold so that a single report
# covers each sample exactly once.
for train_index, test_index in kf.split(Xfs):
    X_train, X_test = Xfs.iloc[train_index], Xfs.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test_fs.extend(y_test)
    Y_pred_fs.extend(y_pred)

report_fs = classification_report(Y_test_fs, Y_pred_fs, output_dict=True)

# Only the per-class entries go into the table; the aggregate entries are
# printed separately below.
summary_keys = ("accuracy", "macro avg", "weighted avg")
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
for key, value in report_fs.items():
    if key not in summary_keys:
        table.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])

print(tabulate(table, headers="firstrow", tablefmt="grid"))
print('\nAccuracy:', report_fs['accuracy'], '\tRecall:', report_fs['macro avg']['recall'])

+---------+-------------+-----------+------------+-----------+
|   Class |   Precision |    Recall |   F1-Score |   Support |
|       0 |    0.538462 | 0.0413589 |  0.0768176 |       677 |
+---------+-------------+-----------+------------+-----------+
|       1 |    0.357143 | 0.017331  |  0.0330579 |       577 |
+---------+-------------+-----------+------------+-----------+
|       2 |    0.505872 | 0.136953  |  0.21555   |      4089 |
+---------+-------------+-----------+------------+-----------+
|       3 |    0.670846 | 0.659255  |  0.665     |      7061 |
+---------+-------------+-----------+------------+-----------+
|       4 |    0.617266 | 0.951998  |  0.748932  |     12062 |
+---------+-------------+-----------+------------+-----------+
|       5 |    0.737107 | 0.729466  |  0.733267  |      5016 |
+---------+-------------+-----------+------------+-----------+
|       6 |    0.998901 | 0.984743  |  0.991772  |     31395 |
+---------+-------------+-----------+------------+-----

In [None]:
# Show which entries the feature-selection report contains
report_fs_keys = report_fs.keys()
print("Keys in report_fs:", report_fs_keys)


Keys in report_fs: dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'accuracy', 'macro avg', 'weighted avg'])


In [None]:
# Radar chart for the feature-selected classification
# This cell follows the feature-selection run, so it must read `report_fs`;
# reading `report` would just redraw the full-feature chart.
encoded_labels = list(label_encoders['attack_cat'].classes_)
# NOTE: despite its name, this list holds the per-class precision values.
accuracies = [report_fs[str(x)]['precision'] for x in range(len(encoded_labels))]

fig = go.Figure(data=go.Scatterpolar(
    r=accuracies,
    theta=encoded_labels,
    fill='toself'
))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True
        ),
    ),
    showlegend=False
)

fig.show()

In [None]:
# Persist the classifier to disk
# NOTE(review): `clf` is whatever the most recent `clf.fit(...)` call left
# behind — in this notebook that is the fit from the last cross-validation
# fold, not a fit on the complete data set.
from joblib import dump

model_path = 'nt204_xgboost_UNSW_GA.joblib'
dump(clf, model_path)

['nt204_xgboost_UNSW_GA.joblib']

In [None]:
# Copy the saved model file into the Drive folder.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# no mount step is visible in this notebook; confirm before running.
!cp nt204_xgboost_UNSW_GA.joblib /content/drive/MyDrive/Best_chormo