# Desempenho de algoritmos de classificação na detecção de intrusão em redes de dispositivos IoT

## Importações

In [1]:
import glob
import os
import time
from dataclasses import dataclass

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Variáveis globais

In [2]:
DATASET_DIRECTORY = "./CICIoT2023/"
GENERATE_DATASETS = False
EIGHT_CLASSES = True

## DTOs

In [3]:
@dataclass
class ModelDataset:
    x_train: np.ndarray
    y_train: np.ndarray
    x_test: pl.DataFrame
    y_test: np.ndarray

## Funções auxiliares

In [4]:
def sample_rows(
    group_by: pl.dataframe.group_by.GroupBy, percentage: float
) -> pl.DataFrame:
    dfs = []
    for _, df in group_by:
        if len(df) * percentage <= 1:
            dfs.append(df.sample(n=1))
            continue
        dfs.append(df.sample(fraction=percentage, seed=42))

    return pl.concat(dfs)


def generate_dataframe(file_list: list, percentage: float) -> pl.DataFrame:
    dfs = []
    for file in file_list:
        df = pl.read_csv(file)
        dfs.append(
            sample_rows(
                group_by=df.group_by(["label"], maintain_order=True),
                percentage=percentage,
            )
        )
    return pl.concat(dfs)

# Criando um dataset menor

In [5]:
file_dir = "./data/"
if not os.path.exists(file_dir):
    os.makedirs(file_dir)

df_sets = sorted(glob.glob(os.path.join(DATASET_DIRECTORY, "*.csv")))

In [6]:
if GENERATE_DATASETS:
    generate_dataframe(file_list=df_sets, percentage=0.005).write_csv(
        file="./data/dataset_dev.csv"
    )

In [7]:
if GENERATE_DATASETS:
    generate_dataframe(file_list=df_sets, percentage=0.01).write_csv(
        file="./data/dataset_one_percent.csv"
    )

# IDS

## Dataset

In [8]:
df = pl.read_csv("./data/dataset_dev.csv")
training_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

### Separando Features e Label

In [9]:
X_columns = df.columns[:-1]
y_column = "label"

In [10]:
del df

### 8 Classes

In [11]:
if EIGHT_CLASSES:
    eight_classes = {}

    eight_classes["BenignTraffic"] = "Benign"

    eight_classes["DDoS-RSTFINFlood"] = "DDoS"
    eight_classes["DDoS-PSHACK_Flood"] = "DDoS"
    eight_classes["DDoS-SYN_Flood"] = "DDoS"
    eight_classes["DDoS-UDP_Flood"] = "DDoS"
    eight_classes["DDoS-TCP_Flood"] = "DDoS"
    eight_classes["DDoS-ICMP_Flood"] = "DDoS"
    eight_classes["DDoS-SynonymousIP_Flood"] = "DDoS"
    eight_classes["DDoS-ACK_Fragmentation"] = "DDoS"
    eight_classes["DDoS-UDP_Fragmentation"] = "DDoS"
    eight_classes["DDoS-ICMP_Fragmentation"] = "DDoS"
    eight_classes["DDoS-SlowLoris"] = "DDoS"
    eight_classes["DDoS-HTTP_Flood"] = "DDoS"

    eight_classes["DoS-UDP_Flood"] = "DoS"
    eight_classes["DoS-SYN_Flood"] = "DoS"
    eight_classes["DoS-TCP_Flood"] = "DoS"
    eight_classes["DoS-HTTP_Flood"] = "DoS"

    eight_classes["Mirai-greeth_flood"] = "Mirai"
    eight_classes["Mirai-greip_flood"] = "Mirai"
    eight_classes["Mirai-udpplain"] = "Mirai"

    eight_classes["Recon-PingSweep"] = "Recon"
    eight_classes["Recon-OSScan"] = "Recon"
    eight_classes["Recon-PortScan"] = "Recon"
    eight_classes["VulnerabilityScan"] = "Recon"
    eight_classes["Recon-HostDiscovery"] = "Recon"

    eight_classes["DNS_Spoofing"] = "Spoofing"
    eight_classes["MITM-ArpSpoofing"] = "Spoofing"

    eight_classes["BrowserHijacking"] = "Web"
    eight_classes["Backdoor_Malware"] = "Web"
    eight_classes["XSS"] = "Web"
    eight_classes["Uploading_Attack"] = "Web"
    eight_classes["SqlInjection"] = "Web"
    eight_classes["CommandInjection"] = "Web"

    eight_classes["DictionaryBruteForce"] = "BruteForce"

    training_data = training_data.with_columns(
        pl.col(y_column).replace(eight_classes, default=-1)
    )
    test_data = test_data.with_columns(
        pl.col(y_column).replace(eight_classes, default=-1)
    )

## Classes

### Separando as colunas

In [12]:
X_train = training_data.select(X_columns)
y_train = training_data.select(y_column)
X_test = test_data.select(X_columns)
y_test = test_data.select(y_column).to_numpy()

### Balanceando o dataset

In [13]:
training_data.group_by("label").len()

label,len
str,u32
"""Benign""",4379
"""Mirai""",10310
"""Web""",809
"""DDoS""",135193
"""BruteForce""",127
"""Recon""",1362
"""DoS""",32159
"""Spoofing""",1814


In [14]:
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)

pd_X_resampled, pd_y_resampled = ros.fit_resample(
    X_train.to_pandas(), y_train.to_pandas()
)
X_resampled, y_resampled = pl.from_pandas(pd_X_resampled), pl.from_pandas(
    pd_y_resampled
)

### Normalizando o dataset

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Testando os algoritmos

## Escopo original
Aqui serão executados os algoritmos de classificação Logistic Regression, Perceptron, Adaboost e Random Forest, presentes no trabalho de Neto, E. C. P. *et al.*

In [16]:
data = ModelDataset(
    x_train=X_train_scaled,
    y_train=y_resampled.to_numpy(),
    x_test=X_test_scaled,
    y_test=y_test,
)

### Logistic Regression

In [17]:
logistic_regression = LogisticRegression(n_jobs=-1, max_iter=10000)
logistic_regression.fit(data.x_train, data.y_train)
y_pred = logistic_regression.predict(data.x_test)

print(classification_report(data.y_test, y_pred, target_names=np.unique(y_pred)))

accuracy = accuracy_score(data.y_test, y_pred)
precision = precision_score(data.y_test, y_pred, average="macro")
recall = recall_score(data.y_test, y_pred, average="macro")
f1 = f1_score(data.y_test, y_pred, average="macro")

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

      Benign       0.83      0.69      0.75      1027
  BruteForce       0.05      0.48      0.10        42
        DDoS       0.97      0.61      0.75     33962
         DoS       0.36      0.89      0.51      7935
       Mirai       1.00      0.99      1.00      2604
       Recon       0.22      0.38      0.27       319
    Spoofing       0.57      0.57      0.57       445
         Web       0.28      0.53      0.36       205

    accuracy                           0.68     46539
   macro avg       0.53      0.64      0.54     46539
weighted avg       0.85      0.68      0.71     46539

Accuracy: 0.6797739530286426, Precision: 0.5329021608217444, Recall: 0.6422790912421958, F1: 0.5386325485127118


### Perceptron

In [18]:
perceptron = Perceptron()
perceptron.fit(data.x_train, data.y_train)
y_pred = perceptron.predict(data.x_test)

print(classification_report(data.y_test, y_pred, target_names=np.unique(y_pred)))

accuracy = accuracy_score(data.y_test, y_pred)
precision = precision_score(data.y_test, y_pred, average="macro")
recall = recall_score(data.y_test, y_pred, average="macro")
f1 = f1_score(data.y_test, y_pred, average="macro")

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

      Benign       0.82      0.34      0.48      1027
  BruteForce       0.05      0.36      0.08        42
        DDoS       0.93      0.62      0.74     33962
         DoS       0.34      0.80      0.48      7935
       Mirai       0.95      0.98      0.97      2604
       Recon       0.17      0.38      0.23       319
    Spoofing       0.38      0.53      0.44       445
         Web       0.14      0.39      0.20       205

    accuracy                           0.66     46539
   macro avg       0.47      0.55      0.45     46539
weighted avg       0.81      0.66      0.69     46539

Accuracy: 0.6585229592384881, Precision: 0.4700889519102317, Recall: 0.5483566276220029, F1: 0.4517923833407661


### AdaBoost

In [19]:
ada_boost = AdaBoostClassifier()
ada_boost.fit(data.x_train, data.y_train)
y_pred = ada_boost.predict(data.x_test)

print(classification_report(data.y_test, y_pred, target_names=np.unique(y_pred)))

accuracy = accuracy_score(data.y_test, y_pred)
precision = precision_score(data.y_test, y_pred, average="macro")
recall = recall_score(data.y_test, y_pred, average="macro")
f1 = f1_score(data.y_test, y_pred, average="macro")

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

      Benign       0.91      0.70      0.80      1027
  BruteForce       1.00      0.43      0.60        42
        DDoS       0.93      0.87      0.90     33962
         DoS       0.57      0.69      0.62      7935
       Mirai       0.99      1.00      1.00      2604
       Recon       0.33      0.54      0.41       319
    Spoofing       0.38      0.24      0.29       445
         Web       0.24      0.89      0.38       205

    accuracy                           0.83     46539
   macro avg       0.67      0.67      0.62     46539
weighted avg       0.86      0.83      0.84     46539

Accuracy: 0.834633318292185, Precision: 0.6686454409898148, Recall: 0.6694657334516874, F1: 0.6236227412042501


### Random Forest

In [20]:
random_forest = RandomForestClassifier()
random_forest.fit(data.x_train, data.y_train)
y_pred = random_forest.predict(data.x_test)

print(classification_report(data.y_test, y_pred, target_names=np.unique(y_pred)))

accuracy = accuracy_score(data.y_test, y_pred)
precision = precision_score(data.y_test, y_pred, average="macro")
recall = recall_score(data.y_test, y_pred, average="macro")
f1 = f1_score(data.y_test, y_pred, average="macro")

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

  return fit_method(estimator, *args, **kwargs)


              precision    recall  f1-score   support

      Benign       0.82      0.97      0.89      1027
  BruteForce       0.91      0.24      0.38        42
        DDoS       1.00      1.00      1.00     33962
         DoS       1.00      1.00      1.00      7935
       Mirai       1.00      1.00      1.00      2604
       Recon       0.80      0.68      0.74       319
    Spoofing       0.87      0.75      0.81       445
         Web       0.69      0.57      0.62       205

    accuracy                           0.99     46539
   macro avg       0.88      0.78      0.80     46539
weighted avg       0.99      0.99      0.99     46539

Accuracy: 0.9910827478029179, Precision: 0.8848609386238638, Recall: 0.7751117113417976, F1: 0.8028439075494131
