In [None]:
# Libraries
# Standard library
import time
import warnings
import zipfile

# Third-party packages other than scikit-learn
import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Ignore all warnings
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Intel(R) Extension for Scikit-learn has to patch scikit-learn BEFORE the
# estimators are imported. A name bound by an earlier
# `from sklearn... import ...` keeps pointing at the stock class, so patching
# afterwards would leave the classifier used in this notebook unaccelerated.
#!pip install scikit-learn-intelex -q --progress-bar off
from sklearnex import patch_sklearn
patch_sklearn()

# scikit-learn (imported only after patch_sklearn() so the patch takes effect)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)




---
Dataset preparation


In [None]:
## Download dataset
import os

# Google Drive location of the dataset archive
url = 'https://drive.google.com/uc?id=1Brk0WwZ9ErOdyHTtOEP9-NlXHSI9ieB8'

# Path where you want to save the downloaded file
output = 'DoHBrw-2020.zip'

# Download the file only when it is not already on disk, so that re-running
# the notebook does not fetch the large (~808 MB) archive again.
if os.path.exists(output):
    downloaded_path = output
else:
    downloaded_path = gdown.download(url, output, quiet=False)

# Show the archive path as the cell result (what gdown.download returns)
downloaded_path

Downloading...
From (original): https://drive.google.com/uc?id=1Brk0WwZ9ErOdyHTtOEP9-NlXHSI9ieB8
From (redirected): https://drive.google.com/uc?id=1Brk0WwZ9ErOdyHTtOEP9-NlXHSI9ieB8&confirm=t&uuid=9869f0a4-77a9-48df-8910-2c374f0ec62e
To: /content/DoHBrw-2020.zip
100%|██████████| 808M/808M [00:03<00:00, 242MB/s]


'DoHBrw-2020.zip'

In [None]:
# Unzip dataset: the downloaded bundle first, then the nested archive with the
# combined CSV files (it only exists once the outer bundle has been extracted).
for archive_name in ('DoHBrw-2020.zip', 'CSVs/Total_CSVs.zip'):
    with zipfile.ZipFile(archive_name, mode='r') as zip_ref:
        zip_ref.extractall()

Archive:  DoHBrw-2020.zip
   creating: CSVs/
  inflating: CSVs/BenignDoH-NonDoH-CSVs.md5  
  inflating: CSVs/BenignDoH-NonDoH-CSVs.zip  
  inflating: CSVs/MaliciousDoH-CSVs.md5  
  inflating: CSVs/MaliciousDoH-CSVs.zip  
  inflating: CSVs/Total_CSVs.md5     
  inflating: CSVs/Total_CSVs.zip     
Archive:  CSVs/Total_CSVs.zip
  inflating: l1-doh.csv              
  inflating: l1-nondoh.csv           
  inflating: l2-benign.csv           
  inflating: l2-malicious.csv        


In [None]:
## Load dataset for learning
file_paths = ['l1-doh.csv', 'l1-nondoh.csv', 'l2-benign.csv', 'l2-malicious.csv']

# Read each CSV file into separate DataFrames
dfs = [pd.read_csv(file) for file in file_paths]

# Concatenate the DataFrames into a single DataFrame, drop rows that are exact
# duplicates, and replace missing values with 0.
# reset_index() restores a gap-free 0..n-1 index after drop_duplicates(), so
# `data` stays row-aligned with frames that are rebuilt from it later.
#
# NOTE(review): the per-class counts printed further down show
# DoH (269,643) == Benign (19,807) + Malicious (249,836). That suggests the
# l2-* files hold the same flows as l1-doh.csv under a different label;
# drop_duplicates() keeps both copies because the Label column differs.
# Confirm that a single four-class problem is really intended.
data = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates()
    .fillna(0)
    .reset_index(drop=True)
)

In [None]:
# Number of columns in the loaded frame (this count still includes 'Label')
n_columns = len(data.columns)
print('Number of features: ', n_columns)

Number of features:  35


In [None]:
# Rows per class label, followed by the overall row count
class_sizes = data.groupby('Label').size()
total_records = data['Label'].size

print('Record per class:\n', class_sizes)
print('\nSum:\t\t', total_records)

Record per class:
 Label
Benign        19807
DoH          269643
Malicious    249836
NonDoH       897493
dtype: int64

Sum:		 1436779





Dataset preparation


---






---

Data preprocessing

In [None]:
# Remember the original class names (in order of first appearance) before the
# 'Label' column is replaced by integer codes.
labels = data['Label'].unique()

# Encode every object-typed column as integers, keeping one fitted encoder per
# column so the codes can be mapped back to the original values later.
label_encoders = {}
object_columns = data.select_dtypes(include=['object']).columns
for column in object_columns:
    encoder = LabelEncoder()
    # Cast to str first so mixed-type columns are encoded consistently
    data[column] = encoder.fit_transform(data[column].astype(str))
    label_encoders[column] = encoder

In [None]:
# Target: decode the integer 'Label' codes back to the original class names
label_encoder = label_encoders['Label']
y = label_encoder.inverse_transform(data['Label'])

# Features: every column except the target
X = data.drop(columns=['Label'])

In [None]:
# Number of feature columns left after removing the target
n_train_features = len(X.columns)
print('Number of train features: ', n_train_features)

Number of train features:  34


In [None]:
## Min-max scaling of every feature column
# NOTE(review): the scaler is fitted on the full data set before the
# cross-validation split, so each test fold influences the scaling parameters.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
# Wrap the scaled array back into a DataFrame with the original column names
X = pd.DataFrame(scaled_values, columns=X.columns)


Data preprocessing


---





---

Classification with the full feature set

In [None]:
## Instantiate the k-nearest-neighbours classifier (it is fitted per fold later)
knn_params = {'n_neighbors': 9, 'n_jobs': -1}
clf = KNeighborsClassifier(**knn_params)

In [None]:
## Cross-validate classifier on the full feature set
# Shuffled K-fold split with KFold's default number of splits. The fixed seed
# makes the fold assignment, and therefore the reported metrics, reproducible
# across runs.
kf = KFold(shuffle=True, random_state=42)

# Ground-truth and predicted labels collected over all held-out folds
Y_test = []
Y_pred = []

# Fit the classifier on each training fold and predict its held-out fold
for train_index, test_index in kf.split(X):
    # X is a DataFrame (positional .iloc); y is a NumPy array (plain indexing)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test.extend(y_test)
    Y_pred.extend(y_pred)

# Per-class and aggregate metrics over the pooled out-of-fold predictions
report = classification_report(Y_test, Y_pred, output_dict=True)

In [None]:
from tabulate import tabulate

# Report entries that hold aggregate scores rather than a single class
summary_keys = ("accuracy", "macro avg", "weighted avg")

# Header row followed by one row per class
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
for key, value in report.items():
    if key in summary_keys:
        continue
    table.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])

print(tabulate(table, headers="firstrow", tablefmt="grid"))

# Overall accuracy and macro-averaged recall
overall_accuracy = report['accuracy']
macro_recall = report['macro avg']['recall']
print('\nAccuracy:', overall_accuracy, '\tRecall:', macro_recall)

+-----------+-------------+----------+------------+-----------+
| Class     |   Precision |   Recall |   F1-Score |   Support |
| Benign    |    0.23786  | 0.216641 |   0.226755 |     19807 |
+-----------+-------------+----------+------------+-----------+
| DoH       |    0.295223 | 0.329421 |   0.311386 |    269643 |
+-----------+-------------+----------+------------+-----------+
| Malicious |    0.240027 | 0.210882 |   0.224513 |    249836 |
+-----------+-------------+----------+------------+-----------+
| NonDoH    |    0.996894 | 0.997858 |   0.997376 |    897493 |
+-----------+-------------+----------+------------+-----------+

Accuracy: 0.7247976202324783 	Recall: 0.43870053804266185


In [None]:
# Per-class precision from the classification report, ordered like `labels`
# (the variable is called `accuracies`, but the values are precision scores)
accuracies = [report[class_name]['precision'] for class_name in labels]

## Export radar chart for report
import plotly.graph_objects as go
import plotly.io
plotly.io.renderers.default = "colab"

# One filled polygon: radius = score, angle = class name
radar_trace = go.Scatterpolar(r=accuracies, theta=labels, fill='toself')
fig = go.Figure(data=radar_trace)

# Show the radial axis and hide the legend (there is a single trace)
fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
)

fig.show()





Classification with the full feature set


---





---

Classification with 10 selected features

In [None]:
# Load the saved feature-selection result.
# NOTE(review): this file is not produced or downloaded anywhere in the visible
# notebook; it must already be present in the working directory.
chromo_path = 'Doh_best_chromo.npy'
best_chromo = np.load(chromo_path)

In [None]:
# Keep only the columns selected by the saved chromosome.
# NOTE(review): the encoding of `best_chromo` is not visible here. `.iloc`
# treats an integer array as column positions, so a 0/1 gene vector with one
# entry per column would silently pick columns 0 and 1 over and over instead
# of acting as a mask. That case is converted to a boolean mask first; boolean
# masks and genuine position arrays pass through unchanged.
feature_selector = np.asarray(best_chromo)
looks_like_binary_mask = (
    feature_selector.dtype != bool
    and feature_selector.ndim == 1
    and feature_selector.size == X.shape[1]
    and bool(np.isin(feature_selector, (0, 1)).all())
)
if looks_like_binary_mask:
    feature_selector = feature_selector.astype(bool)

Xfs = X.iloc[:, feature_selector]

In [None]:
## Instantiate a fresh k-nearest-neighbours classifier for the reduced feature set
knn_params = {'n_neighbors': 9, 'n_jobs': -1}
clf = KNeighborsClassifier(**knn_params)

In [None]:
## Cross-validate classifier on the selected features
# Shuffled K-fold split with KFold's default number of splits. The fixed seed
# makes the fold assignment, and therefore the reported metrics, reproducible
# across runs.
kf = KFold(shuffle=True, random_state=42)

# Ground-truth and predicted labels collected over all held-out folds
Y_test_fs = []
Y_pred_fs = []

# Fit the classifier on each training fold and predict its held-out fold
for train_index, test_index in kf.split(Xfs):
    # Xfs is a DataFrame (positional .iloc); y is a NumPy array (plain indexing)
    X_train, X_test = Xfs.iloc[train_index], Xfs.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    Y_test_fs.extend(y_test)
    Y_pred_fs.extend(y_pred)

# Per-class and aggregate metrics over the pooled out-of-fold predictions
report_fs = classification_report(Y_test_fs, Y_pred_fs, output_dict=True)

In [None]:
from tabulate import tabulate

# Report entries that hold aggregate scores rather than a single class
summary_keys = ("accuracy", "macro avg", "weighted avg")

# Header row followed by one row per class
table = [["Class", "Precision", "Recall", "F1-Score", "Support"]]
for key, value in report_fs.items():
    if key in summary_keys:
        continue
    table.append([key, value['precision'], value['recall'], value['f1-score'], value['support']])

print(tabulate(table, headers="firstrow", tablefmt="grid"))

# Overall accuracy and macro-averaged recall
overall_accuracy = report_fs['accuracy']
macro_recall = report_fs['macro avg']['recall']
print('\nAccuracy:', overall_accuracy, '\tRecall:', macro_recall)

+-----------+-------------+----------+------------+-----------+
| Class     |   Precision |   Recall |   F1-Score |   Support |
| Benign    |    0.231798 | 0.155753 |   0.186315 |     19807 |
+-----------+-------------+----------+------------+-----------+
| DoH       |    0.280049 | 0.285299 |   0.28265  |    269643 |
+-----------+-------------+----------+------------+-----------+
| Malicious |    0.260275 | 0.248471 |   0.254236 |    249836 |
+-----------+-------------+----------+------------+-----------+
| NonDoH    |    0.983856 | 0.99786  |   0.990809 |    897493 |
+-----------+-------------+----------+------------+-----------+

Accuracy: 0.7222147595420033 	Recall: 0.4218457646037898


In [None]:
# Per-class precision from the feature-selected report, ordered like `labels`
# (the variable is called `accuracies`, but the values are precision scores)
accuracies = [report_fs[class_name]['precision'] for class_name in labels]

## Export radar chart for report
import plotly.graph_objects as go
import plotly.io
plotly.io.renderers.default = "colab"

# One filled polygon: radius = score, angle = class name
radar_trace = go.Scatterpolar(r=accuracies, theta=labels, fill='toself')
fig = go.Figure(data=radar_trace)

# Show the radial axis and hide the legend (there is a single trace)
fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
)

fig.show()





Classification with 10 selected features


---

