# Multiclass classification on ToN http dataset using ft-transformer

### Imports

In [9]:
import os, sys
import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
project_root = os.path.abspath(os.path.join(os.getcwd(), '..','..'))
if project_root not in sys.path:
    sys.path.append(project_root)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from src.utilities.config_manager import ConfigManager
from src.utilities.io_handler import load_data


from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models import FTTransformerConfig
from pytorch_tabular.feature_extractor import DeepFeatureExtractor

### Config

In [10]:
DATASET_CONFIG_PATH = '../../config/ton_config.json'

ConfigManager.load_config(DATASET_CONFIG_PATH)
paths_config = ConfigManager.get_section("paths")
data_cols_config = ConfigManager.get_section("data_columns")

DATA_PATH = '../../resources/dataset/http_ton.csv'
TARGET_COL = data_cols_config.get("target_category_column")
NUMERICAL_COLS = data_cols_config.get("numerical_cols")
CATEGORICAL_COLS = data_cols_config.get("categorical_cols")
RANDOM_STATE = 42    

### Dataset loading and splitting

In [11]:
df = load_data(DATA_PATH)

keep_cols = CATEGORICAL_COLS + NUMERICAL_COLS + [TARGET_COL]
df = df[keep_cols].copy() 

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=RANDOM_STATE, stratify=df[TARGET_COL])
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=RANDOM_STATE, stratify=temp_df[TARGET_COL])   


### Encoding target and class weights

In [12]:
y_le = LabelEncoder().fit(train_df[TARGET_COL])
for _df in (train_df, valid_df, test_df):
    _df[TARGET_COL] = y_le.transform(_df[TARGET_COL])

classes = np.unique(train_df[TARGET_COL].values)
cw = compute_class_weight('balanced', classes=classes, y=train_df[TARGET_COL])
cw = cw / cw.mean()  
cw = torch.tensor(cw, dtype=torch.float).to(
        "cuda" if torch.cuda.is_available() else "cpu"
    )

### Configuration blocks
Thanks pythorch_tabular 

In [14]:
data_config = DataConfig(
        target=[TARGET_COL],
        continuous_cols=NUMERICAL_COLS,
        categorical_cols=CATEGORICAL_COLS,
)

In [None]:
model_config = FTTransformerConfig(
    task="classification",
    input_embed_dim=32,
    num_heads=8,
    num_attn_blocks=6,
    learning_rate=1e-3,
    metrics_prob_input=[False, False],
    seed=RANDOM_STATE,
)

In [17]:
optimizer_config = OptimizerConfig(
    optimizer="Adam",
)

In [19]:
trainer_config = TrainerConfig(
    batch_size=2048,
    max_epochs=100,
    early_stopping="valid_loss",
    early_stopping_patience=20,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    deterministic=True,
    precision=32,
)

### Model instance

In [20]:
ft_transformer_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

### Training

In [21]:
ft_transformer_model.fit(train=train_df, validation=valid_df)

Seed set to 42


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

AttributeError: module 'torchmetrics.functional' has no attribute 'balanced_accuracy'

### Evaluation

In [None]:
preds = ft_transformer_model.predict(test_df)
y_true = test_df[TARGET_COL].values
y_pred = preds["prediction"].values.astype(int)

acc = accuracy_score(y_true, y_pred)
bal_acc = balanced_accuracy_score(y_true, y_pred)

print("Test Accuracy: %.4f", acc)
print("Test Balanced Accuracy: %.4f", bal_acc)
print("Classification Report:\n%s", classification_report(y_true, y_pred))
print("Confusion Matrix:\n%s", confusion_matrix(y_true, y_pred))

### Penultimate layer extraction 

In [None]:
dfe = DeepFeatureExtractor(ft_transformer_model, extract_keys=["backbone_features"])
emb_df = dfe.transform(test_df)
feature_cols = [c for c in emb_df.columns if "backbone_features" in c]
Z = emb_df[feature_cols].values

### Dimensionality reduction + clustering

In [None]:
Z_2d = PCA(n_components=2, random_state=RANDOM_STATE).fit_transform(Z)
n_clusters = len(np.unique(y_true))
labels = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit_predict(Z_2d)


### Plots

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(Z_2d[:, 0], Z_2d[:, 1], c=labels, alpha=0.7, cmap="viridis")
plt.title("FT‑Transformer backbone features (2‑D PCA)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()