# Desempenho de algoritmos de classificação na detecção de intrusão em redes de dispositivos IoT

## Importações

In [None]:
import glob
import time
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import polars as pl
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Variáveis globais

In [None]:
# Directory that holds the raw dataset CSV files; it is globbed for "*.csv" further down.
DATASET_DIRECTORY = "./CICIoT2023/"
# When True, the sampled subset CSVs under ./data/ are generated from the raw files.
GENERATE_DATASETS = False
# When True, the fine-grained labels are collapsed into coarse categories before training.
EIGHT_CLASSES = True

## DTOs

In [None]:
@dataclass
class ModelDataset:
    """Train/test arrays handed to ``evaluate_model``.

    ``x_train`` and ``x_test`` are the feature matrices, ``y_train`` and
    ``y_test`` the matching labels. ``y_train`` may be a column vector:
    ``evaluate_model`` flattens it with ``ravel`` before fitting.
    """

    x_train: np.ndarray
    y_train: np.ndarray
    # Annotated as an array (not a polars frame): the notebook passes the
    # output of ``StandardScaler.transform`` here, just like for ``x_train``.
    x_test: np.ndarray
    y_test: np.ndarray

In [None]:
@dataclass
class Results:
    """Metrics and bookkeeping for one evaluated model.

    Instances are built by ``evaluate_model`` and appended to ``results.csv``
    by ``save_results``, which converts the instance with ``vars`` — so the
    field order below is also the column order of that file.
    """

    model: str  # class name of the estimator
    accuracy_score: float
    precision_score: float  # macro-averaged
    recall_score: float  # macro-averaged
    f1_score: float  # macro-averaged
    train_num_rows: int
    test_num_rows: int
    # NOTE(review): evaluate_model fills this with the number of distinct
    # training labels, not with the number of feature columns.
    num_features: int
    duration_training: float  # seconds, rounded to 4 decimals
    duration_testing: float  # seconds, rounded to 4 decimals

## Funções auxiliares

In [None]:
def sample_rows(
    group_by: pl.dataframe.group_by.GroupBy, percentage: float
) -> pl.DataFrame:
    """Draw a seeded sample of ``percentage`` of the rows from every group.

    Args:
        group_by: Grouped frame; iterating it yields ``(key, frame)`` pairs.
        percentage: Fraction of each group to keep (e.g. ``0.01`` for 1 %).

    Returns:
        The per-group samples concatenated into a single frame.
    """
    sampled = []
    for _, group in group_by:
        if len(group) * percentage <= 1:
            # Tiny groups would round down to nothing, so keep exactly one row.
            # Seeded like the fraction branch so the subset is reproducible.
            sampled.append(group.sample(n=1, seed=42))
        else:
            sampled.append(group.sample(fraction=percentage, seed=42))

    return pl.concat(sampled)


def generate_dataframe(file_list: list, percentage: float) -> pl.DataFrame:
    """Build one frame from per-label samples of every CSV in ``file_list``.

    Each file is read, grouped by its ``label`` column (group order preserved)
    and reduced with ``sample_rows``; the reduced frames are then concatenated
    in the order of ``file_list``.
    """
    return pl.concat(
        [
            sample_rows(
                group_by=pl.read_csv(path).group_by(
                    ["label"], maintain_order=True
                ),
                percentage=percentage,
            )
            for path in file_list
        ]
    )

In [None]:
def evaluate_model(model, data: ModelDataset) -> Results:
    """Fit ``model`` on the training split, predict the test split and score it.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        data: Train/test features and labels.

    Returns:
        A ``Results`` record with accuracy, macro-averaged precision/recall/F1,
        row counts and the fit/predict durations in seconds (4 decimals).
    """
    # perf_counter is the clock intended for measuring elapsed time;
    # time.time() follows the wall clock and can jump when it is adjusted.
    start_training = time.perf_counter()
    model.fit(data.x_train, data.y_train.ravel())
    duration_training = time.perf_counter() - start_training

    start_testing = time.perf_counter()
    y_pred = model.predict(data.x_test)
    duration_testing = time.perf_counter() - start_testing

    return Results(
        model=type(model).__name__,
        accuracy_score=accuracy_score(data.y_test, y_pred),
        precision_score=precision_score(data.y_test, y_pred, average="macro"),
        recall_score=recall_score(data.y_test, y_pred, average="macro"),
        f1_score=f1_score(data.y_test, y_pred, average="macro"),
        train_num_rows=len(data.x_train),
        test_num_rows=len(data.x_test),
        # NOTE(review): this is the number of distinct training labels, stored
        # under the name ``num_features``. Kept as is so new rows stay
        # comparable with rows already in results.csv — confirm which is meant.
        num_features=len(np.unique(data.y_train)),
        duration_training=round(duration_training, 4),
        duration_testing=round(duration_testing, 4),
    )

In [None]:
def save_results(results: Results):
    """Append ``results`` as one row to ``./results.csv``.

    The file is created on the first call; on later calls the existing rows
    are read back, the new row is added at the end and the file is rewritten.
    """
    output_path = Path("./results.csv")
    new_row = pl.DataFrame(vars(results))

    if not output_path.exists():
        new_row.write_csv(output_path)
        return

    previous_rows = pl.read_csv(output_path)
    pl.concat([previous_rows, new_row]).write_csv(output_path)

# Criando um dataset menor

In [None]:
# Make sure the output folder for the sampled subsets exists.
data_dir = Path("./data/")
data_dir.mkdir(exist_ok=True, parents=True)

# Raw CSV files, sorted so they are always processed in the same order.
csv_file_paths = glob.glob(f"{DATASET_DIRECTORY}/*.csv")
csv_file_paths.sort()

In [None]:
if GENERATE_DATASETS:
    # 0.5 % per-label sample of every raw file, written as the dev dataset.
    # The file list is `csv_file_paths`; the previously used `df_sets` is
    # never defined and raised NameError as soon as the flag was enabled.
    generate_dataframe(file_list=csv_file_paths, percentage=0.005).write_csv(
        file="./data/dataset_dev.csv"
    )

In [None]:
if GENERATE_DATASETS:
    # 1 % per-label sample of every raw file.
    # The file list is `csv_file_paths`; the previously used `df_sets` is
    # never defined and raised NameError as soon as the flag was enabled.
    generate_dataframe(file_list=csv_file_paths, percentage=0.01).write_csv(
        file="./data/dataset_one_percent.csv"
    )

# IDS

## Dataset

In [None]:
# Load the sampled dev dataset and hold out 20 % of the rows for testing.
# The split is seeded so the same rows land in each half on every run.
df = pl.read_csv("./data/dataset_dev.csv")
training_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

### Separando Features e Label

In [None]:
y_column = "label"
# Pick the feature columns by name rather than by position, so the label can
# never end up among the features if it is not the last column of the file.
X_columns = [column for column in df.columns if column != y_column]

In [None]:
# Drop the reference to the full frame; the cells below only use the
# training_data / test_data splits.
del df

### 8 Classes

In [None]:
if EIGHT_CLASSES:
    # Coarse category -> fine-grained labels that are folded into it.
    label_groups = {
        "Benign": ["BenignTraffic"],
        "DDoS": [
            "DDoS-RSTFINFlood",
            "DDoS-PSHACK_Flood",
            "DDoS-SYN_Flood",
            "DDoS-UDP_Flood",
            "DDoS-TCP_Flood",
            "DDoS-ICMP_Flood",
            "DDoS-SynonymousIP_Flood",
            "DDoS-ACK_Fragmentation",
            "DDoS-UDP_Fragmentation",
            "DDoS-ICMP_Fragmentation",
            "DDoS-SlowLoris",
            "DDoS-HTTP_Flood",
        ],
        "DoS": [
            "DoS-UDP_Flood",
            "DoS-SYN_Flood",
            "DoS-TCP_Flood",
            "DoS-HTTP_Flood",
        ],
        "Mirai": [
            "Mirai-greeth_flood",
            "Mirai-greip_flood",
            "Mirai-udpplain",
        ],
        "Recon": [
            "Recon-PingSweep",
            "Recon-OSScan",
            "Recon-PortScan",
            "VulnerabilityScan",
            "Recon-HostDiscovery",
        ],
        "Spoofing": [
            "DNS_Spoofing",
            "MITM-ArpSpoofing",
        ],
        "Web": [
            "BrowserHijacking",
            "Backdoor_Malware",
            "XSS",
            "Uploading_Attack",
            "SqlInjection",
            "CommandInjection",
        ],
        "BruteForce": ["DictionaryBruteForce"],
    }

    # Invert the table into the fine label -> category lookup used below.
    eight_classes = {
        fine_label: category
        for category, fine_labels in label_groups.items()
        for fine_label in fine_labels
    }

    # Labels missing from the lookup fall back to the default.
    # NOTE(review): the fallback is the integer -1 while every mapped value is
    # a string — confirm the resulting column dtype is the intended one.
    relabel = pl.col(y_column).replace(eight_classes, default=-1)

    training_data = training_data.with_columns(relabel)
    test_data = test_data.with_columns(relabel)

## Classes

### Separando as colunas

In [None]:
# Split both halves into feature columns and the label column.
X_train = training_data.select(X_columns)
# Kept as a polars frame: it is converted to pandas for resampling below.
y_train = training_data.select(y_column)
X_test = test_data.select(X_columns)
# Converted to numpy right away; the test labels are only used for scoring.
y_test = test_data.select(y_column).to_numpy()

### Balanceando o dataset

In [None]:
# Show the number of training rows per label (displayed as the cell output).
training_data.group_by("label").len()

In [None]:
# Seeded samplers. Only the over-sampler is applied below.
ros = RandomOverSampler(random_state=42)
# NOTE(review): `rus` is created but not used in the visible cells.
rus = RandomUnderSampler(random_state=42)

# The resampling step runs on pandas copies of the training data.
pd_X_resampled, pd_y_resampled = ros.fit_resample(
    X_train.to_pandas(),
    y_train.to_pandas(),
)

# Convert the resampled data back to polars for the following cells.
X_resampled = pl.from_pandas(pd_X_resampled)
y_resampled = pl.from_pandas(pd_y_resampled)

### Normalizando o dataset

In [None]:
# Fit the scaler on the resampled training features only, then apply the same
# fitted transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_resampled)
X_train_scaled = scaler.transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Testando os algoritmos

## Escopo original
Aqui serão executados os algoritmos de classificação Logistic Regression, Perceptron, AdaBoost e Random Forest, presentes no trabalho de Neto, E. C. P. *et al.*

In [None]:
# Bundle the scaled features and the labels shared by every model run below.
data = ModelDataset(
    x_train=X_train_scaled,
    # polars frame -> numpy; evaluate_model flattens it before fitting
    y_train=y_resampled.to_numpy(),
    x_test=X_test_scaled,
    y_test=y_test,
)

### Logistic Regression

In [None]:
# Fit and score logistic regression, then append its metrics to results.csv.
logistic_regression_results = evaluate_model(
    model=LogisticRegression(n_jobs=-1, max_iter=10000), data=data
)
save_results(logistic_regression_results)

### Perceptron

In [None]:
# Fit and score a perceptron with default settings, then append its metrics
# to results.csv.
perceptron_results = evaluate_model(model=Perceptron(), data=data)
save_results(perceptron_results)

### AdaBoost

In [None]:
# Fit and score AdaBoost, then append its metrics to results.csv.
# Seeded with the same value as the sampling, split and resampling steps so
# that repeated runs produce the same scores.
adaboost_results = evaluate_model(
    model=AdaBoostClassifier(random_state=42), data=data
)
save_results(adaboost_results)

### Random Forest

In [None]:
# Fit and score a random forest, then append its metrics to results.csv.
# Seeded with the same value as the sampling, split and resampling steps so
# that repeated runs produce the same scores.
random_forest_results = evaluate_model(
    model=RandomForestClassifier(random_state=42), data=data
)
save_results(random_forest_results)