In [None]:
from utils import load_and_prepare_data, make_stratified_subset_dataloaders
from neural_network import NeuralNetwork
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# CSV file handed to load_and_prepare_data as `file_path`.
dataset_path = "datasets/http_ton.csv"

# Name of the label column, and the label values handed to the loader as
# `rows_to_remove` (presumably rows with these labels are dropped — confirm
# against load_and_prepare_data).
target_col = "type"
values_to_remove = {"type": ["mitm", "dos"]}

# Feature columns passed to the loader as numerical inputs.
numerical_cols = [
    "duration",
    "dst_bytes",
    "missed_bytes",
    "src_bytes",
    "src_ip_bytes",
    "src_pkts",
    "dst_pkts",
    "dst_ip_bytes",
    "http_request_body_len",
    "http_response_body_len",
]

# Feature columns passed to the loader as categorical inputs.
categorical_cols = [
    "proto",
    "conn_state",
    "http_status_code",
    "http_method",
    "http_orig_mime_types",
    "http_resp_mime_types",
]

# Training schedule and the training-set fractions to compare.
epochs = 15
fractions = [1.00, 0.75, 0.50, 0.25, 0.10, 0.05]

# Accumulator for (fraction, weighted F1) pairs filled in by the training loop.
f1_scores = []

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
# Load the dataset once and unpack everything the later cells rely on:
# the three data loaders, the categorical cardinalities used to size the
# embeddings, `cw` (later passed to model.fit as `weights`), the target
# names, and the training dataset/labels used to build stratified subsets.
(
    train_dataloader,
    valid_dataloader,
    test_dataloader,
    cat_cardinalities,
    cw,
    target_names,
    train_dataset,
    y_train,
) = load_and_prepare_data(
    file_path=dataset_path,
    target_col=target_col,
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
    rows_to_remove=values_to_remove,
    batch_size=4096,
)

In [None]:

# One training loader per entry in `fractions`; the loop below iterates this
# mapping as (fraction, loader) pairs. A fixed random_state is passed to the
# helper (presumably so the sampled subsets are reproducible — confirm).
dl_map = make_stratified_subset_dataloaders(
    train_dataset,
    y_train,
    fractions,
    batch_size=512,
    random_state=42,
)

In [None]:
# Train one fresh model per training-set fraction and record its weighted F1
# on the (fixed) test set.

# Start from an empty result list so that re-running this cell does not
# append duplicate (fraction, F1) entries to the ones from a previous run.
f1_scores = []

# The ground-truth test labels do not depend on the model, so collect them
# once instead of re-iterating the test loader for every fraction.
# Assumes the test loader yields 3-tuples with the labels last and iterates in
# the same order model.predict sees — TODO confirm it is not shuffled.
y_true = torch.cat([y for _, _, y in test_dataloader]).numpy()

for frac, sub_loader in dl_map.items():

    model = NeuralNetwork(
        hidden_layers_sizes=[256, 256, 256],
        cat_cardinalities=cat_cardinalities,
        # Embedding width per categorical feature: half the cardinality
        # (rounded up), capped at 50.
        embedding_dims=[min(50, (card + 1) // 2) for card in cat_cardinalities],
        num_numerical_features=len(numerical_cols),
        # NOTE(review): hard-coded class count; presumably this should match
        # the number of entries in `target_names` — confirm.
        num_target_classes=6,
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
    # mode='max': the scheduler lowers the LR when the monitored value stops
    # increasing (assumes fit() steps it with a score to maximize — confirm).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.85, patience=5)

    model.fit(
        # Train on the subset loader for this fraction. The original passed
        # the full `train_dataloader` here, so every model saw 100% of the
        # training data and the fraction had no effect on the result.
        train_dataloader=sub_loader,
        valid_dataloader=valid_dataloader,
        device=device,
        optimizer=optimizer,
        lr_scheduler=scheduler,
        epochs=epochs,
        weights=cw,
    )

    y_pred = model.predict(test_dataloader, device)
    f1_scores.append((frac, f1_score(y_true, y_pred, average="weighted")))


In [None]:
# Plot the weighted F1 score obtained for each training-set fraction.

# Use a dedicated name for the x values: rebinding `fractions` here would
# overwrite the configuration list that the subset-loader cell takes as input,
# so re-running cells out of order would silently change the experiment.
plotted_fractions = [f for f, _ in f1_scores]
f1_vals = [f1 for _, f1 in f1_scores]

plt.figure(figsize=(6,4))
# round() instead of int(): int() truncates, so a fraction such as 0.29
# (0.29 * 100 == 28.999999999999996) would be labelled 28 instead of 29.
plt.plot([round(f * 100) for f in plotted_fractions], f1_vals, marker="o")
plt.xlabel("Percentuale di training samples (%)")
plt.ylabel("F1 score")
plt.title("Andamento F1 al variare della frazione di training")
plt.grid(True)
plt.show()