# Multiclass classification on ToN http dataset with encoder clustering

### Imports

In [None]:
import torch
import logging
import os, sys
import joblib

project_root = os.path.abspath(os.path.join(os.getcwd(), '..','..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.utilities.config_manager import ConfigManager
from src.utilities.io_handler import load_data
from src.utilities.dataset_utils import *
from pytorch_tabnet.tab_model import TabNetClassifier
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score



### Configuration

In [2]:
# Load the dataset-specific JSON configuration before reading any section.
DATASET_CONFIG_PATH = '../../config/ton_config.json'
ConfigManager.load_config(DATASET_CONFIG_PATH)

# Configuration sections used by this notebook.
paths_config = ConfigManager.get_section("paths")
data_cols_config = ConfigManager.get_section("data_columns")

# Seed shared by every randomized step so runs are comparable.
RANDOM_STATE = 42

# Input CSV (hard-coded here) and the output directory taken from the config.
DATA_PATH = '../../resources/dataset/http_ton.csv'
OUTPUT_DIR = paths_config.get("output_dir")

# Column roles: the label column plus numerical and categorical feature lists.
TARGET_COL = data_cols_config.get("target_category_column")
NUMERICAL_COLS = data_cols_config.get("numerical_cols")
CATEGORICAL_COLS = data_cols_config.get("categorical_cols")

### Dataset loading and splitting

In [3]:
# Load the raw table and keep only the configured feature columns and target.
df = load_data(DATA_PATH)
keep_cols = [*CATEGORICAL_COLS, *NUMERICAL_COLS, TARGET_COL]
df = df[keep_cols].copy()

# NOTE(review): train_test_split is not imported explicitly in this notebook;
# presumably it is re-exported by `from src.utilities.dataset_utils import *`.
# Confirm, or import it directly from sklearn.model_selection.

# First split: 70% train / 30% held out, stratified on the target.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=df[TARGET_COL],
)

# Second split: the held-out 30% is halved into validation and test sets
# (15% of the full data each), again stratified on the target.
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=temp_df[TARGET_COL],
)


### Preprocessing
- StandardScaler per le features numeriche; nonostante TabNet accetti features numeriche raw, normalizzare i dati aumenta le performance del modello

- LabelEncoder per le features categoriche; sarebbe meglio usare OrdinalEncoder, ma questo è un esperimento. Inoltre, le categorie non presenti nel train set vengono mappate su '_UNK'.

In [4]:
# --- Numerical features ------------------------------------------------------
# Fit the scaler on the training split only (no leakage from validation/test),
# then apply the same transformation to all three splits.
scaler = StandardScaler()
scaler.fit(train_df[NUMERICAL_COLS])
for _df in (train_df, valid_df, test_df):
    _df[NUMERICAL_COLS] = scaler.transform(_df[NUMERICAL_COLS])


# --- Categorical features ----------------------------------------------------
# One LabelEncoder per column, fitted on the training split and extended with
# an extra "_UNK" class that absorbs categories unseen during training.
# categorical_dims[col] is the number of classes including "_UNK";
# encoders[col] keeps the fitted encoder for later reuse.
categorical_dims, encoders = {}, {}
for col in CATEGORICAL_COLS:
    # Cast to str first: if a categorical column is stored with a non-string
    # dtype (e.g. numeric codes), appending "_UNK" turns `classes_` into
    # strings and the `isin` check below would match nothing, silently sending
    # every validation/test value to "_UNK".
    # NOTE(review): column dtypes are not visible here — confirm which
    # categorical columns are actually numeric in the CSV.
    train_values = train_df[col].astype(str)
    le = LabelEncoder().fit(train_values)
    le.classes_ = np.append(le.classes_, "_UNK")
    train_df[col] = le.transform(train_values)

    # Validation/test: replace anything not seen in training with "_UNK"
    # before encoding, so transform() never hits an unknown label.
    for _df in (valid_df, test_df):
        split_values = _df[col].astype(str)
        _df[col] = le.transform(
            split_values.where(split_values.isin(le.classes_), "_UNK")
        )

    categorical_dims[col] = len(le.classes_)
    encoders[col] = le

# --- Target ------------------------------------------------------------------
# Encode the class labels as integers using the classes found in training.
y_le = LabelEncoder().fit(train_df[TARGET_COL])
for _df in (train_df, valid_df, test_df):
    _df[TARGET_COL] = y_le.transform(_df[TARGET_COL])

### Some parameters

In [5]:
# Columns configured as model inputs (numerical first, then categorical).
_model_input_cols = NUMERICAL_COLS + CATEGORICAL_COLS

# Every dataframe column that is not a configured input.
unused_feat = [name for name in df.columns if name not in _model_input_cols]

# Model inputs in dataframe column order, with the target excluded.
features = [
    name
    for name in df.columns
    if name not in unused_feat and name != TARGET_COL
]

# Positions of the categorical columns inside `features`, and the matching
# number of classes per column (same order as cat_idxs).
cat_idxs = [pos for pos, name in enumerate(features) if name in CATEGORICAL_COLS]
cat_dims = [categorical_dims[features[pos]] for pos in cat_idxs]

# Use the GPU when torch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Used features: {features}")
print(f"Unused features: {unused_feat}")

Used features: ['proto', 'conn_state', 'http_status_code', 'http_user_agent', 'http_method', 'duration', 'dst_bytes', 'missed_bytes', 'src_bytes', 'src_ip_bytes', 'src_pkts', 'dst_pkts', 'dst_ip_bytes', 'http_request_body_len', 'http_response_body_len']
Unused features: ['type']


In [6]:
# Extract the feature matrix and label vector of each split as arrays.
X_train, y_train = train_df[features].values, train_df[TARGET_COL].values
X_valid, y_valid = valid_df[features].values, valid_df[TARGET_COL].values
X_test, y_test = test_df[features].values, test_df[TARGET_COL].values

### Model instance

In [None]:
# Random forest baseline evaluated on the validation split.
# class_weight='balanced' reweights classes to compensate for class imbalance.
# random_state reuses the notebook-wide seed: without it, bootstrap sampling
# and feature selection differ on every run, so the printed score is not
# reproducible even though both data splits are seeded.
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=RANDOM_STATE,
)
rf.fit(X_train, y_train)

# Balanced accuracy on the validation set.
preds = rf.predict(X_valid)
print(balanced_accuracy_score(y_valid, preds))

