In [None]:
!pip install scikit-learn-intelex

Collecting scikit-learn-intelex
  Downloading scikit_learn_intelex-2024.4.0-py310-none-manylinux1_x86_64.whl (142 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.2/142.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting daal4py==2024.4.0 (from scikit-learn-intelex)
  Downloading daal4py-2024.4.0-py310-none-manylinux1_x86_64.whl (10.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Collecting daal==2024.4.0 (from daal4py==2024.4.0->scikit-learn-intelex)
  Downloading daal-2024.4.0-py2.py3-none-manylinux1_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: daal, daal4py, scikit-learn-intelex
Successfully installed daal-2024.4.0 daal4py-2024.4.0 scikit-learn-intelex-2024.4.0


In [None]:
# Importing libraries
import matplotlib.pyplot as plt
import numpy as np
import gdown
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, classification_report
import time
from sklearn.ensemble import RandomForestClassifier
import warnings
import zipfile

# Ignoring warnings
warnings.filterwarnings("ignore")

# Installing XGBoost library
!pip install xgboost




In [None]:
# Fetch the dataset archive from Google Drive.
output = 'Bot_iot_0_05.zip'  # local file name the archive is saved under
url = 'https://drive.google.com/uc?id=1ELj3s17Zpy5KmMzGSNh8jDYFWJ909K5N'  # shared-file link

# quiet=False keeps the progress output; the call's return value is the cell's output.
gdown.download(url=url, output=output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1ELj3s17Zpy5KmMzGSNh8jDYFWJ909K5N
From (redirected): https://drive.google.com/uc?id=1ELj3s17Zpy5KmMzGSNh8jDYFWJ909K5N&confirm=t&uuid=62edcd40-1f7f-4e64-9192-864de5c6dfad
To: /content/Bot_iot_0_05.zip
100%|██████████| 57.2M/57.2M [00:00<00:00, 66.0MB/s]


'Bot_iot_0_05.zip'

In [None]:
# Extract every member of the downloaded archive into the current working directory.
with zipfile.ZipFile('Bot_iot_0_05.zip', mode='r') as archive:
    archive.extractall()

In [None]:
## Load dataset for learning
# The data is read from four CSV parts: reduced_data_1.csv ... reduced_data_4.csv
file_paths = ['reduced_data_{}.csv'.format(part) for part in range(1, 5)]

# One DataFrame per CSV part
dfs = list(map(pd.read_csv, file_paths))

# Columns that are removed before any further processing
dropped_columns = [
    'pkSeqID', 'stime', 'ltime', 'flgs_number', 'proto_number',
    'saddr', 'sport', 'daddr', 'pkts', 'bytes', 'state_number',
    'seq', 'dur', 'rate',
]

# Stack the parts into one frame with a fresh 0..n-1 index, then drop the unused columns
data = pd.concat(dfs, ignore_index=True).drop(columns=dropped_columns)

In [None]:
def balance_class(df, cls_col, cls, cls_size, random_state=42):
    """Resample one class of ``df`` so that it has exactly ``cls_size`` rows.

    Rows whose ``cls_col`` value differs from ``cls`` are kept unchanged. Rows of
    the selected class are undersampled (without replacement) when there are more
    than ``cls_size`` of them, oversampled (with replacement) when there are
    fewer, and kept as they are when the count already matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cls_col : str
        Name of the column holding the class label.
    cls : object
        Label of the class to resample.
    cls_size : int
        Number of rows the class should have in the result.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``. Defaults to 42, the value that was
        previously hard-coded, so existing calls give the same result.

    Returns
    -------
    pandas.DataFrame
        The untouched rows followed by the resampled class rows, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``cls`` does not occur in ``df[cls_col]`` while ``cls_size`` is
        positive: there is nothing to draw samples from.
    """
    other_rows = df[df[cls_col] != cls]
    cls_df = df[df[cls_col] == cls]
    current_class_size = len(cls_df)

    # Fail with an explicit message instead of the obscure error that sampling
    # from an empty frame would produce.
    if current_class_size == 0 and cls_size > 0:
        raise ValueError(
            "class {!r} not found in column {!r}; cannot resample it to {} rows".format(
                cls, cls_col, cls_size
            )
        )

    if current_class_size != cls_size:
        # Replacement is only needed when more rows are requested than exist.
        cls_df = cls_df.sample(
            cls_size,
            replace=current_class_size < cls_size,
            random_state=random_state,
        )

    return pd.concat([other_rows, cls_df]).reset_index(drop=True)


# Target row count for each resampled category; categories not listed are left as they are.
# NOTE(review): resampling happens before the cross-validation split further down, so any
# class that ends up oversampled here can have copies of the same row in both the training
# and the test fold - confirm this is intended.
target_sizes = {
    'DDoS': 240000,
    'DoS': 242788,
    'Reconnaissance': 182166,
    'Theft': 160,
}
for category_name, target_size in target_sizes.items():
    data = balance_class(data, 'category', category_name, target_size)

# Number of features (column count of `data`, which still contains the label columns)
n_columns = data.shape[1]
print('Number of features: ', n_columns)

# Records per class
class_counts = data.groupby('category').size()
print('Records per class:\n', class_counts)
print('\nTotal Records:\t\t', data['category'].size)

Number of features:  32
Records per class:
 category
DDoS              240000
DoS               242788
Normal               477
Reconnaissance    182166
Theft                160
dtype: int64

Total Records:		 665591


In [None]:
# Replace every object-typed column with integer codes, keeping one fitted
# encoder per column so the codes can be mapped back to the original values later.
labels = data['category'].unique()
object_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in object_columns:
    encoder = LabelEncoder()
    # Values are cast to str first so the encoder sees a single, uniform type
    data[column] = encoder.fit_transform(data[column].astype(str))
    label_encoders[column] = encoder

In [None]:
# Features: every column except the ones listed below
excluded_columns = ['attack', 'category', 'subcategory']
X = data.drop(columns=excluded_columns)

# Target: the category values, mapped back from their integer codes to the original names
category_encoder = label_encoders['category']
y = category_encoder.inverse_transform(data['category'])

In [None]:
# Column count of the feature matrix X
n_train_features = X.shape[1]
print('Number of train features: ', n_train_features)

Number of train features:  29


In [None]:
## Min-max scaling of the features
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
# Wrap the scaled array back into a DataFrame that keeps the original column names
X = pd.DataFrame(scaled_values, columns=X.columns)

In [None]:
# Fit an encoder on the target, then convert y into integer class ids
label_encoder_y = LabelEncoder().fit(y)
y_encoded = label_encoder_y.transform(y)

# Class names known to the encoder; position i holds the name of encoded class i
encoded_labels = label_encoder_y.classes_

In [None]:
from tabulate import tabulate

# XGBoost hyper-parameters
xgb_params = {
    'max_depth': 5,               # maximum depth of each decision tree
    'learning_rate': 0.1,         # learning rate of the model
    'n_estimators': 20,           # number of trees in the model
    'objective': 'multi:softmax'  # objective for multi-class classification
}

# Create the XGBoost model with the parameters above
clf_xgb = XGBClassifier(**xgb_params)

# Cross-validation on the full feature set X.
# The shuffle is seeded so the fold assignment, and therefore the reported
# metrics, are the same on every run of the notebook.
kf = KFold(shuffle=True, random_state=42)
Y_test_xgb = []
Y_pred_xgb = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]  # Use encoded labels

    # fit() trains from scratch each time, so every fold gets a fresh model
    clf_xgb.fit(X_train, y_train)
    y_pred = clf_xgb.predict(X_test)

    # Pool the held-out labels and predictions of all folds into one report
    Y_test_xgb.extend(y_test)
    Y_pred_xgb.extend(y_pred)

report_xgb = classification_report(Y_test_xgb, Y_pred_xgb, output_dict=True)

# Per-class rows only; the aggregate entries of the report are printed separately below.
# Report keys are the encoded class ids as strings, so they are mapped back to
# class names to make the table readable.
summary_keys = ("accuracy", "macro avg", "weighted avg")
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
for key, value in report_xgb.items():
    if key not in summary_keys:
        table.append([encoded_labels[int(key)], value['precision'], value['recall'],
                      value['f1-score'], value['support']])

print(tabulate(table, headers="firstrow", tablefmt="grid"))
# The "Recall" printed here is the macro-averaged recall over all classes
print('\nAccuracy:', report_xgb['accuracy'], '\tRecall:', report_xgb['macro avg']['recall'])

+---------+-------------+----------+------------+-----------+
|   Class |   Precision |   Recall |   F1-Score |   Support |
|       0 |    0.99796  | 0.978263 |   0.988013 |    240000 |
+---------+-------------+----------+------------+-----------+
|       1 |    0.979075 | 0.997875 |   0.988385 |    242788 |
+---------+-------------+----------+------------+-----------+
|       2 |    0.982869 | 0.962264 |   0.972458 |       477 |
+---------+-------------+----------+------------+-----------+
|       3 |    0.999358 | 0.999896 |   0.999627 |    182166 |
+---------+-------------+----------+------------+-----------+
|       4 |    1        | 0.91875  |   0.957655 |       160 |
+---------+-------------+----------+------------+-----------+

Accuracy: 0.991311481074714 	Recall: 0.9714094078998627


In [None]:
import plotly.graph_objects as go

# Radar chart for full feature classification:
# one axis per class, radius = that class's precision taken from report_xgb
accuracies = [
    report_xgb[class_key]['precision']
    for class_key in map(str, range(len(encoded_labels)))
]

radar_trace = go.Scatterpolar(r=accuracies, theta=encoded_labels, fill='toself')
fig = go.Figure(data=radar_trace)

# Show the radial axis, hide the legend (there is only one trace)
fig.update_layout(polar={'radialaxis': {'visible': True}}, showlegend=False)

fig.show()

In [None]:
# Copy the saved .npy file from Google Drive into the working directory.
# Done in Python rather than with `!cp` so that a missing file stops the notebook
# here instead of only printing a shell error and failing later.
import os
import shutil

chromo_source = '/content/drive/MyDrive/Best_chormo/BOT_IOT_005_best_chromo.npy'
if not os.path.isfile(chromo_source):
    raise FileNotFoundError(
        '{} not found - is Google Drive mounted at /content/drive?'.format(chromo_source)
    )
shutil.copy(chromo_source, 'BOT_IOT_005_best_chromo.npy');

In [None]:
# Load the saved selection array and keep only the columns of X that it selects.
# NOTE(review): presumably this is the feature subset found by an earlier search;
# confirm whether the array holds a boolean mask or integer column positions.
chromo_path = 'BOT_IOT_005_best_chromo.npy'
best_chromo = np.load(chromo_path)
Xfs = X.iloc[:, best_chromo]

In [None]:
from tabulate import tabulate

# XGBoost hyper-parameters
xgb_params = {
    'max_depth': 5,               # maximum depth of each decision tree
    'learning_rate': 0.1,         # learning rate of the model
    'n_estimators': 20,           # number of trees in the model
    'objective': 'multi:softmax'  # objective for multi-class classification
}

# Create the XGBoost model with the parameters above
clf_xgb = XGBClassifier(**xgb_params)

# Cross-validation on the selected feature subset Xfs.
# The shuffle is seeded so the fold assignment, and therefore the reported
# metrics, are the same on every run of the notebook.
kf = KFold(shuffle=True, random_state=42)
Y_test_xgb = []
Y_pred_xgb = []

for train_index, test_index in kf.split(Xfs):
    X_train, X_test = Xfs.iloc[train_index], Xfs.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]  # Use encoded labels

    # fit() trains from scratch each time, so every fold gets a fresh model
    clf_xgb.fit(X_train, y_train)
    y_pred = clf_xgb.predict(X_test)

    # Pool the held-out labels and predictions of all folds into one report
    Y_test_xgb.extend(y_test)
    Y_pred_xgb.extend(y_pred)

report_xgb = classification_report(Y_test_xgb, Y_pred_xgb, output_dict=True)

# Per-class rows only; the aggregate entries of the report are printed separately below.
# Report keys are the encoded class ids as strings, so they are mapped back to
# class names to make the table readable.
summary_keys = ("accuracy", "macro avg", "weighted avg")
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
for key, value in report_xgb.items():
    if key not in summary_keys:
        table.append([encoded_labels[int(key)], value['precision'], value['recall'],
                      value['f1-score'], value['support']])

print(tabulate(table, headers="firstrow", tablefmt="grid"))
# The "Recall" printed here is the macro-averaged recall over all classes
print('\nAccuracy:', report_xgb['accuracy'], '\tRecall:', report_xgb['macro avg']['recall'])

+---------+-------------+----------+------------+-----------+
|   Class |   Precision |   Recall |   F1-Score |   Support |
|       0 |    0.960512 | 0.978842 |   0.96959  |    240000 |
+---------+-------------+----------+------------+-----------+
|       1 |    0.978772 | 0.961139 |   0.969875 |    242788 |
+---------+-------------+----------+------------+-----------+
|       2 |    0.977221 | 0.899371 |   0.936681 |       477 |
+---------+-------------+----------+------------+-----------+
|       3 |    0.999286 | 0.998419 |   0.998852 |    182166 |
+---------+-------------+----------+------------+-----------+
|       4 |    0.993333 | 0.93125  |   0.96129  |       160 |
+---------+-------------+----------+------------+-----------+

Accuracy: 0.9776739769618279 	Recall: 0.953804139256372


In [None]:
import plotly.graph_objects as go

# Radar chart for the classification on the selected feature subset:
# one axis per class, radius = that class's precision taken from report_xgb
accuracies = [
    report_xgb[class_key]['precision']
    for class_key in map(str, range(len(encoded_labels)))
]

radar_trace = go.Scatterpolar(r=accuracies, theta=encoded_labels, fill='toself')
fig = go.Figure(data=radar_trace)

# Show the radial axis, hide the legend (there is only one trace)
fig.update_layout(polar={'radialaxis': {'visible': True}}, showlegend=False)

fig.show()

In [None]:
# Save model
from joblib import dump

# After the cross-validation loop, clf_xgb only holds the fit from the last fold's
# training split. Refit on every row of the selected features so the persisted
# model has seen all of the available data.
clf_xgb.fit(Xfs, y_encoded)

# dump() returns the list of files written, which becomes the cell's output
dump(clf_xgb, 'nt204_Xgboost_BOT_IOT_005.joblib')

['nt204_Xgboost_BOT_IOT_005.joblib']

In [None]:
# Copy the saved model into the Google Drive folder.
# Done in Python rather than with `!cp` so that a missing destination folder raises
# here instead of printing a shell error or creating a plain file with the folder's name.
import os
import shutil

drive_dir = '/content/drive/MyDrive/Best_chormo'
if not os.path.isdir(drive_dir):
    raise FileNotFoundError(
        '{} is not a directory - is Google Drive mounted at /content/drive?'.format(drive_dir)
    )
shutil.copy('nt204_Xgboost_BOT_IOT_005.joblib', drive_dir);