# Multiclass classification on ToN http dataset using ft-transformer

### Imports

In [2]:
import os, sys
import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
project_root = os.path.abspath(os.path.join(os.getcwd(), '..','..'))
if project_root not in sys.path:
    sys.path.append(project_root)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from src.utilities.config_manager import ConfigManager
from src.utilities.io_handler import load_data


from pytorch_tabular import TabularModel #perché 2 diversi aiuto
#from pytorch_tabular.tabular_model import TabularModel

from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models import FTTransformerConfig
from pytorch_tabular.feature_extractor import DeepFeatureExtractor

### Config

In [4]:
DATASET_CONFIG_PATH = '../../config/ton_config.json'

ConfigManager.load_config(DATASET_CONFIG_PATH)
paths_config = ConfigManager.get_section("paths")
data_cols_config = ConfigManager.get_section("data_columns")

DATA_PATH = '../../resources/dataset/http_ton.csv'
TARGET_COL = data_cols_config.get("target_category_column")
NUMERICAL_COLS = data_cols_config.get("numerical_cols")
CATEGORICAL_COLS = data_cols_config.get("categorical_cols")
RANDOM_STATE = 42    

### Dataset loading and splitting

In [6]:
df = load_data(DATA_PATH)

keep_cols = CATEGORICAL_COLS + NUMERICAL_COLS + [TARGET_COL]
df = df[keep_cols].copy() 

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=RANDOM_STATE, stratify=df[TARGET_COL])
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=RANDOM_STATE, stratify=temp_df[TARGET_COL])   


### Encoding target and class weights

In [4]:
y_le = LabelEncoder().fit(train_df[TARGET_COL])
for _df in (train_df, valid_df, test_df):
    _df[TARGET_COL] = y_le.transform(_df[TARGET_COL])

classes = np.unique(train_df[TARGET_COL].values)
cw = compute_class_weight('balanced', classes=classes, y=train_df[TARGET_COL])
cw = cw / cw.mean()  
cw = torch.tensor(cw, dtype=torch.float).to(
        "cuda" if torch.cuda.is_available() else "cpu"
    )

### Configuration blocks
Thanks pythorch_tabular 

In [5]:
data_config = DataConfig(
        target=[TARGET_COL],
        continuous_cols=NUMERICAL_COLS,
        categorical_cols=CATEGORICAL_COLS,
        num_workers=3
)

In [6]:
model_config = FTTransformerConfig(
    task="classification",
    input_embed_dim=32,
    num_heads=8,
    num_attn_blocks=6,
    learning_rate=1e-3,
    metrics_prob_input=[False, False],
    seed=RANDOM_STATE,
)

In [7]:
optimizer_config = OptimizerConfig(
    optimizer="Adam",
)

In [8]:
trainer_config = TrainerConfig(
    batch_size=2048,
    max_epochs=100,
    early_stopping="valid_loss",
    early_stopping_patience=10,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    deterministic=True,
    precision=32,
    load_best=False
)

### Model instance

In [9]:
ft_transformer_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

### Training

In [10]:
ft_transformer_model.fit(train=train_df, validation=valid_df)
ft_transformer_model.save_model('saved_models/ftt_run1')

Seed set to 42


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


/home/mat/tesi_triennale/neural-tabular-intrusion-detection-system/venv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/mat/tesi_triennale/neural-tabular-intrusion-detection-system/notebooks/ToN_http/saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

### Checkpoint

In [7]:
# Nel caso ci sia qualche errore recuperare dalla cartella il modello salvato

checkpoint_path = 'saved_models/ftt_run1/'
ft_transformer_model = TabularModel.load_model(checkpoint_path)

Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


### Evaluation

In [14]:
preds = ft_transformer_model.predict(test_df)
y_true = test_df[TARGET_COL].values
print(preds.head())
y_pred = preds['type_prediction']

acc = accuracy_score(y_true, y_pred)
bal_acc = balanced_accuracy_score(y_true, y_pred)

print("Test Accuracy: %.4f", acc)
print("Test Balanced Accuracy: %.4f", bal_acc)
print("Classification Report:\n%s", classification_report(y_true, y_pred))
print("Confusion Matrix:\n%s", confusion_matrix(y_true, y_pred))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

         type_0_probability  type_1_probability  type_2_probability  \
811803             0.000612            0.000017            0.010550   
2522179            0.000010            0.000001            0.000011   
704202             0.095018            0.000072            0.060333   
1625758            0.000600            0.000030            0.086837   
3361691            0.000011            0.000001            0.000011   

         type_3_probability  type_4_probability  type_5_probability  \
811803         5.999756e-06            0.097096            0.887180   
2522179        9.433523e-07            0.000007            0.000007   
704202         4.036905e-05            0.008197            0.113510   
1625758        9.725329e-06            0.056355            0.851587   
3361691        1.135149e-06            0.000006            0.000006   

         type_6_probability  type_7_probability  type_prediction  
811803             0.000101            0.004438                5  
2522179     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
%s               precision    recall  f1-score   support

           0       0.93      0.14      0.24     50614
           1       0.00      0.00      0.00        26
           2       0.59      0.68      0.63     50968
           3       0.00      0.00      0.00         7
           4       0.84      0.36      0.51      9197
           5       0.73      0.93      0.82    189474
           6       0.99      0.90      0.94      4686
           7       0.81      0.77      0.79    211140

    accuracy                           0.75    516112
   macro avg       0.61      0.47      0.49    516112
weighted avg       0.77      0.75      0.73    516112

Confusion Matrix:
%s [[  6936      0   2095      0     36  12297     19  29231]
 [     1      0      1      0      0      2      0     22]
 [    26      0  34833      0    270  14197      0   1642]
 [     1      0      1      0      0      0      0      5]
 [     8      0    934      0   3332   4670     15    238]
 [    2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Penultimate layer extraction 

In [None]:
dfe = DeepFeatureExtractor(ft_transformer_model, extract_keys=["backbone_features"])
emb_df = dfe.transform(test_df)
feature_cols = [c for c in emb_df.columns if "backbone_features" in c]
Z = emb_df[feature_cols].values

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

Output()

### Dimensionality reduction + clustering

In [5]:
Z_2d = PCA(n_components=2, random_state=RANDOM_STATE).fit_transform(Z)
n_clusters = len(np.unique(y_true))
labels = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit_predict(Z_2d)


NameError: name 'Z' is not defined

### Plots

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(Z_2d[:, 0], Z_2d[:, 1], c=labels, alpha=0.7, cmap="viridis")
plt.title("FT‑Transformer backbone features (2‑D PCA)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()