# Desempenho de algoritmos de classificação na detecção de intrusão em redes de dispositivos IoT

## Importações

In [1]:
import glob
import time
from dataclasses import dataclass
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

## Variáveis globais

In [9]:
# Directory holding the raw CICIoT2023 CSV files (globbed when building samples).
DATASET_DIRECTORY = "../CICIoT2023/"
# When True, the sampling cells regenerate the reduced datasets on disk.
GENERATE_DATASETS = False
# Label granularity of the experiment; validated right after to be 2, 8 or 34.
NO_CLASSES = 34
# Sampled dataset that is loaded for the train/test split.
# NOTE(review): the sampling cells write to "./data/..." while this path reads
# from "../data/..." — confirm both point at the same directory.
DATASET_FILE = "../data/dataset_five_percent.csv"
# CSV file that accumulates one row of metrics per evaluated model.
RESULTS_FILE = "../results_five_percent.csv"
# Whether highly correlated features are dropped before training; the flag is
# also stored with every result row.
FEATURE_SELECTION = False

In [3]:
# Only the 2-, 8- and 34-class label groupings are handled by this notebook, so
# fail fast on anything else. An explicit raise is used instead of `assert`
# because assertions are skipped when Python runs with -O.
if NO_CLASSES not in (2, 8, 34):
    raise ValueError(f"NO_CLASSES must be 2, 8 or 34, got {NO_CLASSES!r}")

## DTOs

In [4]:
@dataclass
class ModelDataset:
    """Train/test arrays handed to ``evaluate_model``.

    All four members are NumPy arrays: the feature matrices are the outputs of
    the scaler and the labels are converted with ``to_numpy()`` before this
    object is built.
    """

    # Training feature matrix, passed to ``model.fit``.
    x_train: np.ndarray
    # Training labels; flattened with ``ravel()`` before fitting.
    y_train: np.ndarray
    # Test feature matrix, passed to ``model.predict``.
    x_test: np.ndarray
    # Ground-truth test labels the predictions are scored against.
    y_test: np.ndarray

In [5]:
@dataclass
class Results:
    """Evaluation record for one trained model.

    Instances are created by ``evaluate_model`` and written to the results CSV
    by ``save_results``, which builds the row from ``vars()`` of the instance,
    so the field names double as the CSV column names.
    """

    # Class name of the estimator (``type(model).__name__``).
    model: str
    # Accuracy on the test split.
    accuracy_score: float
    # Macro-averaged precision on the test split.
    precision_score: float
    # Macro-averaged recall on the test split.
    recall_score: float
    # Macro-averaged F1 on the test split.
    f1_score: float
    # Number of rows in the training matrix.
    train_num_rows: int
    # Number of rows in the test matrix.
    test_num_rows: int
    # NOTE(review): despite the name, ``evaluate_model`` stores the number of
    # distinct training labels here, and the analysis cells group by this
    # column to obtain the class count.
    num_features: int
    # Time spent inside ``model.fit``, in seconds, with 4 decimal places.
    duration_training: float
    # Value of the FEATURE_SELECTION flag when the model was evaluated.
    feature_selection: bool

## Funções auxiliares

In [6]:
def sample_rows(
    group_by: pl.dataframe.group_by.GroupBy, percentage: float
) -> pl.DataFrame:
    """Draw a seeded sample of ``percentage`` of the rows from every group.

    Args:
        group_by: Grouped frame; iterating it yields ``(key, frame)`` pairs.
        percentage: Fraction of each group to keep (e.g. ``0.05`` for 5%).

    Returns:
        The per-group samples concatenated into a single frame.
    """
    dfs = []
    for _, df in group_by:
        if len(df) * percentage <= 1:
            # Tiny groups are still represented by exactly one row. The draw
            # is seeded like the fractional branch so that regenerating the
            # dataset yields the same rows every time.
            dfs.append(df.sample(n=1, seed=42))
        else:
            dfs.append(df.sample(fraction=percentage, seed=42))

    return pl.concat(dfs)


def generate_dataframe(file_list: list, percentage: float) -> pl.DataFrame:
    """Build a reduced dataset by sampling every CSV file per label.

    Args:
        file_list: Paths of the CSV files to read.
        percentage: Fraction of each label group to keep from every file.

    Returns:
        The samples of all files concatenated into one frame.
    """
    sampled_frames = [
        sample_rows(
            # Group by label so each class is sampled on its own within a file.
            group_by=pl.read_csv(csv_path).group_by(
                ["label"], maintain_order=True
            ),
            percentage=percentage,
        )
        for csv_path in file_list
    ]
    return pl.concat(sampled_frames)

In [None]:
def correlation(dataset: pl.DataFrame, threshold: float) -> list:
    """List the columns that are strongly correlated with an earlier column.

    A column is reported when the absolute correlation between it and any
    column positioned before it in ``dataset`` is greater than ``threshold``.

    Args:
        dataset: Frame whose pairwise column correlations are computed.
        threshold: Absolute correlation above which a column is flagged.

    Returns:
        Names of the flagged columns (in no guaranteed order).
    """
    corr_matrix = dataset.corr()
    flagged = {
        name
        for row, name in enumerate(corr_matrix.columns)
        # Only the lower triangle is inspected: compare against earlier columns.
        if any(abs(corr_matrix[row, col]) > threshold for col in range(row))
    }
    return list(flagged)

In [None]:
def evaluate_model(model, data: ModelDataset) -> Results:
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        data: Training/test features and labels.

    Returns:
        A ``Results`` record with accuracy, macro-averaged precision, recall
        and F1, the row counts, the training time in seconds and the current
        FEATURE_SELECTION flag.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way a time.time() difference can.
    start_training = time.perf_counter()
    model.fit(data.x_train, data.y_train.ravel())
    duration_training = time.perf_counter() - start_training

    y_pred = model.predict(data.x_test)

    return Results(
        model=type(model).__name__,
        accuracy_score=accuracy_score(data.y_test, y_pred),
        precision_score=precision_score(data.y_test, y_pred, average="macro"),
        recall_score=recall_score(data.y_test, y_pred, average="macro"),
        f1_score=f1_score(data.y_test, y_pred, average="macro"),
        train_num_rows=len(data.x_train),
        test_num_rows=len(data.x_test),
        # NOTE(review): this is the number of distinct training labels, not
        # the number of feature columns. It is left as-is because the analysis
        # cells group the results file by this column as the class count.
        num_features=len(np.unique(data.y_train)),
        duration_training=round(duration_training, 4),
        feature_selection=FEATURE_SELECTION,
    )

In [None]:
def save_results(results: Results):
    """Append one result record to the CSV at RESULTS_FILE.

    The file is created when it does not exist yet; otherwise its current
    content is read, the new row is added at the end and the file is
    rewritten.
    """
    new_row = pl.DataFrame(vars(results))
    target = Path(RESULTS_FILE)

    if not target.exists():
        new_row.write_csv(target)
        return

    history = pl.read_csv(target)
    pl.concat([history, new_row]).write_csv(target)

# Criando um dataset menor

In [7]:
# Make sure the output folder for the sampled datasets exists.
data_dir = Path("./data/")
data_dir.mkdir(parents=True, exist_ok=True)

# Raw CSV files of the dataset, in a stable (sorted) order.
df_sets = glob.glob(f"{DATASET_DIRECTORY}/*.csv")
df_sets.sort()

In [None]:
if GENERATE_DATASETS:
    # 0.5% per-label sample, written as the development dataset.
    dev_sample = generate_dataframe(file_list=df_sets, percentage=0.005)
    dev_sample.write_csv(file="./data/dataset_dev.csv")

In [None]:
if GENERATE_DATASETS:
    # 1% per-label sample.
    one_percent_sample = generate_dataframe(file_list=df_sets, percentage=0.01)
    one_percent_sample.write_csv(file="./data/dataset_one_percent.csv")

In [None]:
if GENERATE_DATASETS:
    # 5% per-label sample.
    five_percent_sample = generate_dataframe(file_list=df_sets, percentage=0.05)
    five_percent_sample.write_csv(file="./data/dataset_five_percent.csv")

# IDS

## Dataset

In [10]:
# Load the sampled dataset and hold out 20% of the rows for testing; the fixed
# random_state keeps the split identical between runs.
df = pl.read_csv(DATASET_FILE)
training_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

### Separando Features e Label

In [None]:
# Target column.
y_column = "label"
# Feature columns: everything except the last column of the frame.
# NOTE(review): assumes the label is the last column of the CSV — confirm.
X_columns = df.columns[:-1]

In [None]:
# Drop the reference to the full frame; the following cells work on the
# train/test splits only.
del df

### 8 Classes

In [None]:
if NO_CLASSES == 8:
    # Original labels listed under the coarse class each one is collapsed into.
    labels_by_class = {
        "Benign": ["BenignTraffic"],
        "DDoS": [
            "DDoS-RSTFINFlood",
            "DDoS-PSHACK_Flood",
            "DDoS-SYN_Flood",
            "DDoS-UDP_Flood",
            "DDoS-TCP_Flood",
            "DDoS-ICMP_Flood",
            "DDoS-SynonymousIP_Flood",
            "DDoS-ACK_Fragmentation",
            "DDoS-UDP_Fragmentation",
            "DDoS-ICMP_Fragmentation",
            "DDoS-SlowLoris",
            "DDoS-HTTP_Flood",
        ],
        "DoS": [
            "DoS-UDP_Flood",
            "DoS-SYN_Flood",
            "DoS-TCP_Flood",
            "DoS-HTTP_Flood",
        ],
        "Mirai": [
            "Mirai-greeth_flood",
            "Mirai-greip_flood",
            "Mirai-udpplain",
        ],
        "Recon": [
            "Recon-PingSweep",
            "Recon-OSScan",
            "Recon-PortScan",
            "VulnerabilityScan",
            "Recon-HostDiscovery",
        ],
        "Spoofing": [
            "DNS_Spoofing",
            "MITM-ArpSpoofing",
        ],
        "Web": [
            "BrowserHijacking",
            "Backdoor_Malware",
            "XSS",
            "Uploading_Attack",
            "SqlInjection",
            "CommandInjection",
        ],
        "BruteForce": ["DictionaryBruteForce"],
    }

    # Flatten into the {original label: coarse class} lookup.
    eight_classes = {
        original_label: coarse_class
        for coarse_class, original_labels in labels_by_class.items()
        for original_label in original_labels
    }

    # NOTE(review): labels missing from the mapping fall back to the integer
    # -1 while mapped values are strings — confirm the resulting column dtype.
    relabel = pl.col(y_column).replace(eight_classes, default=-1)
    training_data = training_data.with_columns(relabel)
    test_data = test_data.with_columns(relabel)

### 2 Classes

In [None]:
if NO_CLASSES == 2:
    # Every original label except the benign one is collapsed into "Attack".
    attack_labels = [
        "DDoS-RSTFINFlood",
        "DDoS-PSHACK_Flood",
        "DDoS-SYN_Flood",
        "DDoS-UDP_Flood",
        "DDoS-TCP_Flood",
        "DDoS-ICMP_Flood",
        "DDoS-SynonymousIP_Flood",
        "DDoS-ACK_Fragmentation",
        "DDoS-UDP_Fragmentation",
        "DDoS-ICMP_Fragmentation",
        "DDoS-SlowLoris",
        "DDoS-HTTP_Flood",
        "DoS-UDP_Flood",
        "DoS-SYN_Flood",
        "DoS-TCP_Flood",
        "DoS-HTTP_Flood",
        "Mirai-greeth_flood",
        "Mirai-greip_flood",
        "Mirai-udpplain",
        "Recon-PingSweep",
        "Recon-OSScan",
        "Recon-PortScan",
        "VulnerabilityScan",
        "Recon-HostDiscovery",
        "DNS_Spoofing",
        "MITM-ArpSpoofing",
        "BrowserHijacking",
        "Backdoor_Malware",
        "XSS",
        "Uploading_Attack",
        "SqlInjection",
        "CommandInjection",
        "DictionaryBruteForce",
    ]

    two_classes = {"BenignTraffic": "Benign"}
    two_classes.update(dict.fromkeys(attack_labels, "Attack"))

    # NOTE(review): labels missing from the mapping fall back to the integer
    # -1 while mapped values are strings — confirm the resulting column dtype.
    relabel = pl.col(y_column).replace(two_classes, default=-1)
    training_data = training_data.with_columns(relabel)
    test_data = test_data.with_columns(relabel)

## Classes

### Separando as colunas

In [None]:
# Feature frames.
X_train = training_data.select(X_columns)
X_test = test_data.select(X_columns)

# Label column: the training labels stay a polars frame (used by the
# distribution cells and the resampling step); the test labels go straight
# to NumPy for scoring.
y_train = training_data.select(y_column)
y_test = test_data.select(y_column).to_numpy()

### Desvio Padrão

In [None]:
# Keep only the features whose standard deviation on the training split is
# positive; the same column selection is applied to the test split.
std_devs = X_train.std()
cols_to_keep = list(
    filter(lambda name: std_devs[name][0] > 0.0, X_train.columns)
)
X_train_filtered, X_test_filtered = (
    X_train.select(cols_to_keep),
    X_test.select(cols_to_keep),
)

### Correlação

In [None]:
sns.set_theme(rc={"figure.figsize": (75, 25)})

# Pairwise correlation of the remaining training features.
corr = X_train_filtered.corr()
feature_names = corr.columns
# Non-zero entries of the upper triangle (diagonal included) are masked so
# each pair is annotated only once.
upper_triangle = np.triu(corr)

sns.heatmap(
    data=corr,
    mask=upper_triangle,
    annot=True,
    fmt=".3f",
    cmap="coolwarm",
    cbar=False,
    xticklabels=feature_names,
    yticklabels=feature_names,
)

In [None]:
if FEATURE_SELECTION:
    # Drop, from the variance-filtered frames, every feature whose absolute
    # correlation with an earlier feature is above 0.9.
    correlated_columns = correlation(dataset=X_train_filtered, threshold=0.9)

    X_train, X_test = (
        X_train_filtered.drop(correlated_columns),
        X_test_filtered.drop(correlated_columns),
    )

### Balanceando o dataset

#### Distribuição antes do balanceamento

In [None]:
# Number of training rows per label before resampling (shown as cell output).
y_train.group_by("label").len()

In [None]:
# Bar chart of the per-label row counts before resampling.
counts_before = y_train.group_by("label").len()
sns.barplot(data=counts_before, x="label", y="len", hue="label")

#### SMOTE

In [None]:
# Oversample the training split with SMOTE (seeded for reproducibility). The
# resampler is fed pandas objects, so convert on the way in and back to polars
# on the way out.
sm = SMOTE(random_state=42)

pd_X_train = X_train.to_pandas()
pd_y_train = y_train.to_pandas()
pd_X_resampled, pd_y_resampled = sm.fit_resample(pd_X_train, pd_y_train)

X_resampled = pl.from_pandas(pd_X_resampled)
y_resampled = pl.from_pandas(pd_y_resampled)

#### Distribuição após o balanceamento

In [None]:
# Number of training rows per label after resampling (shown as cell output).
y_resampled.group_by("label").len()

In [None]:
# Bar chart of the per-label row counts after resampling.
counts_after = y_resampled.group_by("label").len()
sns.barplot(data=counts_after, x="label", y="len", hue="label")

### Normalizando o dataset

In [None]:
# Learn the scaling statistics from the resampled training data only, then
# apply that same transformation to both splits.
scaler = StandardScaler().fit(X_resampled)
X_train_scaled = scaler.transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Testando os algoritmos

Aqui serão executados os algoritmos de classificação Logistic Regression, Perceptron, AdaBoost, Deep Neural Network (DNN) e Random Forest, presentes no trabalho de Neto, E. C. P. *et al.*

In [None]:
# Bundle the scaled features with their labels for the evaluation helper.
data = ModelDataset(
    x_train=X_train_scaled,
    x_test=X_test_scaled,
    y_train=y_resampled.to_numpy(),
    y_test=y_test,
)

### Logistic Regression

In [None]:
# Logistic regression using all cores and a raised iteration cap.
logistic_regression = LogisticRegression(n_jobs=-1, max_iter=10000)
logistic_regression_results = evaluate_model(
    model=logistic_regression, data=data
)
save_results(logistic_regression_results)

### Perceptron

In [None]:
# Perceptron with the library's default hyperparameters.
perceptron = Perceptron()
perceptron_results = evaluate_model(model=perceptron, data=data)
save_results(perceptron_results)

### AdaBoost

In [None]:
# AdaBoost with default hyperparameters. The estimator is seeded so that the
# reported metrics are reproducible, in line with the seeded split, SMOTE and
# MLP used elsewhere in this notebook.
adaboost_results = evaluate_model(
    model=AdaBoostClassifier(random_state=42), data=data
)
save_results(adaboost_results)

### DNN

In [None]:
# Multi-layer perceptron with one hidden layer of 100 ReLU units, trained with
# Adam and a fixed seed.
dnn = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation="relu",
    solver="adam",
    random_state=42,
)
dnn_results = evaluate_model(model=dnn, data=data)
save_results(dnn_results)

### Random Forest

In [None]:
# Random forest with default hyperparameters. The estimator is seeded so that
# the reported metrics are reproducible, in line with the seeded split, SMOTE
# and MLP used elsewhere in this notebook.
random_forest_results = evaluate_model(
    model=RandomForestClassifier(random_state=42), data=data
)
save_results(random_forest_results)

# Análise dos Resultados

In [None]:
# Display names for the estimator class names stored in the results file.
model_display_names = {
    "LogisticRegression": "Regressão Logística",
    "Perceptron": "Perceptron",
    "AdaBoostClassifier": "AdaBoost",
    "MLPClassifier": "DNN",
    "RandomForestClassifier": "Random Forest",
}
# Display names for the metric columns used in the charts.
column_display_names = {
    "accuracy_score": "Acurácia",
    "precision_score": "Precisão",
    "recall_score": "Recall",
    "f1_score": "F1",
    "duration_training": "Tempo de Treinamento",
}

df = (
    pd.read_csv(RESULTS_FILE)
    .drop_duplicates()
    .assign(
        model=lambda frame: frame["model"].replace(model_display_names),
        # Stored in seconds; the charts report minutes.
        duration_training=lambda frame: frame["duration_training"] / 60,
    )
    .rename(columns=column_display_names)
    .set_index("model")
)

metrics = ["Acurácia", "Precisão", "Recall", "F1"]
# One group per distinct value of the `num_features` column; the plotting
# cells use the group key as the class count in the chart titles.
features_groupby = df.groupby("num_features")

In [None]:
# Grid of charts: one row per metric, one column per class configuration.
# squeeze=False keeps `axes` two-dimensional even when the results file holds a
# single class configuration; without it `axes[i, j]` raises in that case.
fig, axes = plt.subplots(
    len(metrics),
    len(features_groupby),
    figsize=(len(features_groupby) * 5, len(metrics) * 4),
    squeeze=False,
)

for i, metric in enumerate(metrics):
    for j, (num_classes, df_feature) in enumerate(features_groupby):
        # Split the runs by whether feature selection was enabled.
        df_false = df_feature[~df_feature["feature_selection"]]
        df_true = df_feature[df_feature["feature_selection"]]

        ax = axes[i, j]

        # Bars: runs without feature selection.
        sns.barplot(
            data=df_false,
            x="model",
            y=df_false[metric],
            hue="model",
            ax=ax,
            alpha=0.8,
        )
        # Line overlay on a twin axes (shared y-axis): runs with feature
        # selection.
        new_ax = ax.twiny()
        sns.lineplot(
            data=df_true,
            x="model",
            y=df_true[metric],
            marker="o",
            markeredgewidth=0.5,
            color="slategray",
            ax=new_ax,
        )
        ax.grid(False)
        new_ax.grid(False)

        ax.set_xlabel("")
        ax.tick_params(axis="both", which="major", labelsize=7)
        ax.tick_params(axis="both", which="minor", labelsize=5)

        # The twin's x-axis would only duplicate the model names.
        new_ax.get_xaxis().set_visible(False)

        # Column titles on the first row only.
        if i == 0:
            ax.set_title(f"{num_classes} Classes")

        # Metric name on the first column only.
        if j == 0:
            ax.set_ylabel(metric, size="large")
        else:
            ax.set_ylabel("")

plt.suptitle("Comparação das métricas dos modelos")
plt.tight_layout(pad=1.5)
plt.show()

In [None]:
# One training-time chart per class configuration, laid out in a single row.
# squeeze=False makes `axes` a 2-D array even for a single chart; without it
# `plt.subplots(1, 1)` returns a bare Axes object and indexing it raises.
fig, axes = plt.subplots(
    1,
    len(features_groupby),
    figsize=(len(features_groupby) * 5, 4),
    squeeze=False,
)

for i, (num_classes, df_feature) in enumerate(features_groupby):
    # Split the runs by whether feature selection was enabled.
    df_false = df_feature[~df_feature["feature_selection"]]
    df_true = df_feature[df_feature["feature_selection"]]
    ax = axes[0, i]

    # Bars: runs without feature selection.
    sns.barplot(
        data=df_false,
        x="model",
        y=df_false["Tempo de Treinamento"],
        hue="model",
        ax=ax,
        alpha=0.8,
        errorbar=None,
    )
    # Line overlay on a twin axes (shared y-axis): runs with feature selection.
    new_ax = ax.twiny()
    sns.lineplot(
        data=df_true,
        x="model",
        y=df_true["Tempo de Treinamento"],
        marker="o",
        markeredgewidth=0.5,
        color="slategray",
        ax=new_ax,
        errorbar=None,
    )
    ax.grid(False)
    new_ax.grid(False)

    ax.set_xlabel("")
    ax.tick_params(axis="both", which="major", labelsize=8)
    ax.tick_params(axis="both", which="minor", labelsize=6)

    # The twin's x-axis would only duplicate the model names.
    new_ax.get_xaxis().set_visible(False)

    ax.set_title(f"{num_classes} Classes")

    # Axis label on the first chart only.
    if i == 0:
        ax.set_ylabel("Tempo de Treinamento (min)", size="large")
    else:
        ax.set_ylabel("")

plt.suptitle("Comparação do tempo de treinamento dos modelos")
plt.tight_layout(pad=1.5)
plt.show()