In [None]:
import warnings
import zipfile

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearnex import patch_sklearn
from tabulate import tabulate

# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")

# Enable the Intel(R) Extension for Scikit-learn.
# NOTE(review): the sklearn names above are imported before this call; Intel's
# documentation recommends patching before importing from sklearn - confirm
# whether the already-imported classes are meant to be accelerated.
patch_sklearn()


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [None]:
# Fetch the DoHBrw-2020 archive from Google Drive and unpack it into the
# current working directory.
url = 'https://drive.google.com/uc?id=1Brk0WwZ9ErOdyHTtOEP9-NlXHSI9ieB8'
output = 'DoHBrw-2020.zip'

# Skip the (large) download when a readable archive is already on disk.
# zipfile.is_zipfile() returns False both for a missing file and for a file
# that is not a valid zip, so either case triggers a fresh download.
if not zipfile.is_zipfile(output):
    gdown.download(url, output, quiet=False)

# Unpack the downloaded archive first, then the CSV bundle
# (presumably extracted from it - the path is relative to the working dir).
for archive in (output, 'CSVs/Total_CSVs.zip'):
    with zipfile.ZipFile(archive, 'r') as zip_ref:
        zip_ref.extractall()


Downloading...
From (original): https://drive.google.com/uc?id=1Brk0WwZ9ErOdyHTtOEP9-NlXHSI9ieB8
From (redirected): https://drive.google.com/uc?id=1Brk0WwZ9ErOdyHTtOEP9-NlXHSI9ieB8&confirm=t&uuid=d3b3b12d-c30d-47a0-afdb-cdc4c2200e26
To: /content/DoHBrw-2020.zip
100%|██████████| 808M/808M [00:10<00:00, 78.9MB/s]


In [None]:
# Read the four CSV files, stack them into a single frame, drop exact
# duplicate rows and replace every missing value with 0.
file_paths = ['l1-doh.csv', 'l1-nondoh.csv', 'l2-benign.csv', 'l2-malicious.csv']
dfs = list(map(pd.read_csv, file_paths))
data = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates()
    .fillna(0)
)


In [None]:
# Encode every object-typed column as integer codes, then build the scaled
# feature matrix X and the string label vector y.
#
# The encoding is written into a copy so that `data` keeps its raw values:
# mutating `data` in place made this cell fail with a KeyError on
# `label_encoders['Label']` when re-run, because no object columns were left
# to encode the second time around.
labels = data['Label'].unique()
label_encoders = {}
encoded = data.copy()
for column in encoded.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    # Cast to str first: fillna(0) may have left a mix of strings and ints.
    encoded[column] = encoder.fit_transform(encoded[column].astype(str))
    label_encoders[column] = encoder

X = encoded.drop(columns=['Label'])
# Recover the original label strings from the encoded column.
y = label_encoders['Label'].inverse_transform(encoded['Label'])

# Rescale every feature with MinMaxScaler; the result gets a fresh positional
# index, so rows are addressed with .iloc downstream.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [None]:
# Turn the class labels in y into integer ids.
# Note: this rebinds `label_encoders` to a single LabelEncoder, which is the
# object the later cells call inverse_transform() on.
label_encoders = LabelEncoder().fit(y)
y = label_encoders.transform(y)

In [None]:
from xgboost import XGBClassifier

# Cross-validated XGBoost classification on the full feature set.

# XGBoost hyper-parameters
xgb_params = {
    'max_depth': 5,           # maximum depth of each decision tree
    'learning_rate': 0.1,     # learning rate of the model
    'n_estimators': 20,       # number of trees in the model
    'objective': 'multi:softmax'  # objective: multi-class classification
}

# Build the XGBoost model from the parameters above
clf = XGBClassifier(**xgb_params)

# KFold is imported from sklearn.model_selection in the notebook's import cell.
# A fixed random_state makes the shuffled folds reproducible, so runs (and
# other experiments using the same seed on the same rows) see identical splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
Y_test = []
Y_pred = []

# Collect the held-out truth and predictions of every fold so that a single
# report can be computed over all samples.
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test.extend(y_test)
    Y_pred.extend(y_pred)

report = classification_report(Y_test, Y_pred, output_dict=True)

# One table row per class; the aggregate entries are printed separately below.
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
for key, value in report.items():
    if key not in ("accuracy", "macro avg", "weighted avg"):
        table.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])

print(tabulate(table, headers="firstrow", tablefmt="grid"))
print('\nAccuracy:', report['accuracy'], '\tRecall:', report['macro avg']['recall'])


+---------+-------------+----------+------------+-----------+
|   Class |   Precision |   Recall |   F1-Score |   Support |
|       0 |    0.496807 | 0.628515 |   0.554954 |     19807 |
+---------+-------------+----------+------------+-----------+
|       1 |    0.482747 | 0.38831  |   0.430409 |    269643 |
+---------+-------------+----------+------------+-----------+
|       2 |    0.487673 | 0.567913 |   0.524743 |    249836 |
+---------+-------------+----------+------------+-----------+
|       3 |    0.992685 | 0.999754 |   0.996207 |    897493 |
+---------+-------------+----------+------------+-----------+

Accuracy: 0.8047939175057542 	Recall: 0.6461228021525567


In [None]:
import plotly.graph_objects as go

# Radar chart of per-class precision for the full-feature classification.
# Map the encoded class ids present in y back to their original names.
labels = label_encoders.inverse_transform(np.unique(y))
# Despite the name, these are the per-class *precision* values from `report`,
# looked up by the stringified class id.
accuracies = [report[str(idx)]['precision'] for idx, _ in enumerate(labels)]

radar_trace = go.Scatterpolar(r=accuracies, theta=labels, fill='toself')
fig = go.Figure(data=radar_trace)

fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
)

fig.show()


In [None]:
# Copy the saved .npy array (loaded below as `best_chromo`) from Google Drive
# into the working directory.
# NOTE(review): this needs Drive mounted at /content/drive; no mount call is
# visible in this notebook - confirm it is done before running this cell.
!cp /content/drive/MyDrive/Best_chormo/Doh_best_chromo.npy Doh_best_chromo.npy

In [None]:
# Cross-validated XGBoost classification on the selected features only.
# assumes best_chromo holds integer column positions or a boolean mask that
# .iloc accepts (the original comment says "top 10 features") - TODO confirm
best_chromo = np.load('Doh_best_chromo.npy')
Xfs = X.iloc[:, best_chromo]

# XGBoost hyper-parameters
xgb_params = {
    'max_depth': 5,           # maximum depth of each decision tree
    'learning_rate': 0.1,     # learning rate of the model
    'n_estimators': 20,       # number of trees in the model
    'objective': 'multi:softmax'  # objective: multi-class classification
}

# Build the XGBoost model from the parameters above
clf = XGBClassifier(**xgb_params)

# KFold is imported from sklearn.model_selection in the notebook's import cell.
# A fixed random_state makes the shuffled folds reproducible, so this run can
# be compared against any other run that uses the same seed on the same rows.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
Y_test_fs = []
Y_pred_fs = []

# Collect the held-out truth and predictions of every fold so that a single
# report can be computed over all samples.
for train_index, test_index in kf.split(Xfs):
    X_train, X_test = Xfs.iloc[train_index], Xfs.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test_fs.extend(y_test)
    Y_pred_fs.extend(y_pred)

report_fs = classification_report(Y_test_fs, Y_pred_fs, output_dict=True)

# One table row per class; the aggregate entries are printed separately below.
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
for key, value in report_fs.items():
    if key not in ("accuracy", "macro avg", "weighted avg"):
        table.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])

print(tabulate(table, headers="firstrow", tablefmt="grid"))
print('\nAccuracy:', report_fs['accuracy'], '\tRecall:', report_fs['macro avg']['recall'])


+---------+-------------+-----------+------------+-----------+
|   Class |   Precision |    Recall |   F1-Score |   Support |
|       0 |    0.48826  | 0.0356945 |  0.0665255 |     19807 |
+---------+-------------+-----------+------------+-----------+
|       1 |    0.485726 | 0.47198   |  0.478754  |    269643 |
+---------+-------------+-----------+------------+-----------+
|       2 |    0.484736 | 0.47291   |  0.47875   |    249836 |
+---------+-------------+-----------+------------+-----------+
|       3 |    0.965467 | 0.999982  |  0.982422  |    897493 |
+---------+-------------+-----------+------------+-----------+

Accuracy: 0.7959470454398345 	Recall: 0.49514161765600007


In [None]:
import plotly.graph_objects as go

# Radar chart of per-class precision for the selected-feature classification.
# Map the encoded class ids present in y back to their original names.
labels = label_encoders.inverse_transform(np.unique(y))
# Read from report_fs (the selected-feature run). The previous version read
# from `report`, which simply re-drew the full-feature chart.
# Despite the name, these values are per-class precision.
accuracies = [report_fs[str(x)]['precision'] for x in range(len(labels))]

fig = go.Figure(data=go.Scatterpolar(
    r=accuracies,
    theta=labels,
    fill='toself'
))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True
        ),
    ),
    showlegend=False
)

fig.show()


In [None]:
# Persist the classifier to disk with joblib.
# Note: `clf` is whatever the cross-validation loop fitted last, i.e. the
# selected-feature model trained on the final fold's training split.
import joblib
joblib.dump(clf, 'nt204_xgboost_doh.joblib')

['nt204_xgboost_doh.joblib']

In [None]:
# Copy the saved model file into the Best_chormo folder on Google Drive.
# NOTE(review): this needs Drive mounted at /content/drive; no mount call is
# visible in this notebook - confirm it is done before running this cell.
!cp nt204_xgboost_doh.joblib /content/drive/MyDrive/Best_chormo