In [1]:
BASE_FILENAME = '2025-06-13/Multiclass/NIDS_NF-BoT-IoT_Multiclass'
TARGET_COL = 'label'
TEST_SIZE = 0.2
TIME_LIMIT = 60
EVAL_METRIC = 'f1_weighted'
PRESET = 'medium_quality'
N_FOLDS = 5
RANDOM_STATE = 42

In [2]:
import json
import pandas as pd

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Step 1: Load parquet and metadata
df = pd.read_parquet(f'{BASE_FILENAME}.parquet')#.dropna()

with open(f'{BASE_FILENAME}.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Step 2: Apply dtypes from metadata to df
for col, dtype in metadata["dtypes"].items():
    df[col] = df[col].astype(dtype)

# Step 3: Split the data into 80% train (for CV) and 20% test (holdout)
from autogluon.tabular import TabularDataset
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df[TARGET_COL])

train_data, test_data = TabularDataset(train_df), TabularDataset(test_df)

# Step 4: Print dtypes
print(f'{"column":<40}', f'{"metadata":<20}', f'{"train_data":<20}', f'{"test_data":<20}')
for col in df.columns:
    print(f'{col:<40}', f'{str(metadata["dtypes"][col]):<20}', f'{str(train_data[col].dtype):<20}', f'{str(test_data[col].dtype):<20}')

column                                   metadata             train_data           test_data           
L4_SRC_PORT                              int32                int32                int32               
L4_DST_PORT                              int32                int32                int32               
PROTOCOL                                 category             category             category            
L7_PROTO                                 float32              float32              float32             
IN_BYTES                                 int32                int32                int32               
OUT_BYTES                                int32                int32                int32               
IN_PKTS                                  int32                int32                int32               
OUT_PKTS                                 int32                int32                int32               
TCP_FLAGS                                int32                in

In [3]:
from autogluon.tabular import TabularPredictor

problem_type = 'binary' if df['label'].nunique() == 2 else 'multiclass'

predictor = TabularPredictor(
    label=TARGET_COL,
    problem_type=problem_type,
    eval_metric=EVAL_METRIC
)

predictor.fit(
    train_data=train_data,
    time_limit=TIME_LIMIT,
    num_bag_folds=N_FOLDS,   # K in K-Fold Cross-Validation
    num_bag_sets=1,          # how many full sets of K models to train
    num_stack_levels=1,      # optional: for stacking
    presets=PRESET
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250616_192950"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Mon Apr 21 17:08:54 UTC 2025
CPU Count:          24
Memory Avail:       91.44 GB / 94.29 GB (97.0%)
Disk Space Avail:   576.96 GB / 1006.85 GB (57.3%)
Presets specified: ['medium_quality']
  import pkg_resources
Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "/home/automl/git/iot-threat-classifier/AutogluonModels/ag-20250616_192950"
Train Data Rows:    476300
Train Data Columns: 10
Label Column:       label
Problem Type:       multiclass
Preprocessing data ...
Train Data Class Count: 5
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    93605.65 MB
	Train Data (Original)  Memory Usage: 16.81 MB (0.0% of available mem

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x707b09caf590>

In [20]:
leaderboard = predictor.leaderboard()

display(leaderboard)

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI_BAG_L2,0.955417,f1_weighted,21.038926,29.197422,2.565855,10.310002,2,True,6
1,WeightedEnsemble_L3,0.955417,f1_weighted,21.097259,36.594769,0.058332,7.397347,3,True,8
2,WeightedEnsemble_L2,0.836685,f1_weighted,3.02776,22.245192,0.059624,5.02608,2,True,5
3,NeuralNetFastAI_BAG_L1,0.836639,f1_weighted,2.236587,13.816767,2.236587,13.816767,1,True,3
4,LightGBMXT_BAG_L1,0.815011,f1_weighted,0.73155,3.402345,0.73155,3.402345,1,True,4
5,KNeighborsUnif_BAG_L1,0.796114,f1_weighted,7.322782,0.839726,7.322782,0.839726,1,True,1
6,KNeighborsDist_BAG_L1,0.774135,f1_weighted,8.182153,0.828582,8.182153,0.828582,1,True,2
7,LightGBMXT_BAG_L2,0.694075,f1_weighted,18.569054,20.476804,0.095982,1.589384,2,True,7


In [13]:
base_eval = predictor.evaluate(data=test_data, display=True, detailed_report=True)

Evaluation: f1_weighted on test data: 0.9558348750976805
Evaluations on test data:
{
    "f1_weighted": 0.9558348750976805,
    "accuracy": 0.9556585709966744,
    "balanced_accuracy": 0.8832581967422815,
    "mcc": 0.8794498776616505
}
Detailed (per-class) classification report:
{
    "Benign": {
        "precision": 0.9459804338579328,
        "recall": 0.8057971014492754,
        "f1-score": 0.8702797886910585,
        "support": 2760.0
    },
    "DDoS": {
        "precision": 0.8668276374442794,
        "recall": 0.8295414148595806,
        "f1-score": 0.8477747502270663,
        "support": 11252.0
    },
    "DoS": {
        "precision": 0.8151529713278872,
        "recall": 0.9047111111111111,
        "f1-score": 0.8576002696326256,
        "support": 11250.0
    },
    "Reconnaissance": {
        "precision": 0.9848824849415377,
        "recall": 0.9816467616968452,
        "f1-score": 0.9832619612929505,
        "support": 93444.0
    },
    "Theft": {
        "precision": 0.9

In [6]:
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score,
    average_precision_score, accuracy_score, matthews_corrcoef, log_loss,
    confusion_matrix
)
import numpy as np

# ------------------------------
# Setup
# ------------------------------
model_names = predictor.model_names()
y_true = test_data[TARGET_COL]

# Determine valid averaging strategies and ROC modes
if problem_type == 'binary':
    averages = ['binary', 'micro', 'macro', 'weighted']
    multi_class_args = [None]
    use_pos_label = 'Malign' if 'Malign' in y_true.unique() else None
else:
    averages = ['micro', 'macro', 'weighted']
    multi_class_args = ['ovo', 'ovr']
    use_pos_label = None  # not applicable in multiclass

# Fix key names: use 'binary' instead of 'None' for binary mode
roc_auc_keys = [
    (f'{mode}_{avg}' if mode else f'binary_{avg}')
    for mode in multi_class_args
    for avg in averages
]

# Initialize metric containers
metrics_scores = {
    'f1': {avg: [] for avg in averages},
    'precision': {avg: [] for avg in averages},
    'recall': {avg: [] for avg in averages},
    'pr_auc': {avg: [] for avg in averages},
    'roc_auc': {key: [] for key in roc_auc_keys},
    'log_loss': [],
    'accuracy': [],
    'mcc': [],
}

# ------------------------------
# Evaluation loop
# ------------------------------
conf_matrix_counts = []

for model in model_names:
    print(f"Evaluating model: {model}")
    y_pred = predictor.predict(test_data, model=model)
    y_proba = predictor.predict_proba(test_data, model=model)
    assert np.allclose(y_proba.sum(axis=1), 1.0, atol=1e-6)

    # --------------------------
    # Confusion matrix + derived counts
    # --------------------------
    classes = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    conf_row = {'confusion_matrix': cm.tolist()}  # Add raw matrix

    if problem_type == 'binary':
        try:
            pos_idx = list(classes).index("Malign")
            neg_idx = 1 - pos_idx
        except ValueError:
            pos_idx, neg_idx = 1, 0  # fallback

        tn = cm[neg_idx, neg_idx]
        fp = cm[neg_idx, pos_idx]
        fn = cm[pos_idx, neg_idx]
        tp = cm[pos_idx, pos_idx]

        conf_row.update({
            'global_tp': tp,
            'global_fp': fp,
            'global_tn': tn,
            'global_fn': fn
        })

    else:  # multiclass
        for i, label in enumerate(classes):
            tp = cm[i, i]
            fn = cm[i, :].sum() - tp
            fp = cm[:, i].sum() - tp
            tn = cm.sum() - (tp + fp + fn)

            conf_row.update({
                f'tp_{label}': tp,
                f'fp_{label}': fp,
                f'tn_{label}': tn,
                f'fn_{label}': fn
            })

    conf_matrix_counts.append(conf_row)

    # --------------------------
    # Multi-avg metrics: f1, precision, recall
    # --------------------------
    for avg in averages:
        for name, func in zip(['f1', 'precision', 'recall'],
                              [f1_score, precision_score, recall_score]):
            kwargs = {'average': avg, 'zero_division': 0}
            if avg == 'binary' and use_pos_label:
                kwargs['pos_label'] = use_pos_label
            try:
                score = func(y_true, y_pred, **kwargs)
            except ValueError:
                score = np.nan
            metrics_scores[name][avg].append(score)

    # --------------------------
    # ROC AUC: ovo and ovr (for multiclass)
    # --------------------------
    for mode in multi_class_args:
        for avg in averages:
            key = f'{mode}_{avg}' if mode else f'binary_{avg}'
            try:
                if problem_type == 'binary':
                    # Use only the Malign column for ROC AUC
                    pos_col = 'Malign' if 'Malign' in y_proba.columns else y_proba.columns[-1]
                    score = roc_auc_score(y_true, y_proba[pos_col], average=avg)
                else:
                    score = roc_auc_score(y_true, y_proba, average=avg, multi_class=mode)
            except ValueError:
                score = np.nan
            metrics_scores['roc_auc'][key].append(score)

    # --------------------------
    # PR AUC (Average Precision)
    # --------------------------
    for avg in averages:
        kwargs = {'average': avg}
        if use_pos_label and avg == 'binary':
            kwargs['pos_label'] = use_pos_label
        try:
            score = average_precision_score(y_true, y_proba, **kwargs)
        except ValueError:
            score = np.nan
        metrics_scores['pr_auc'][avg].append(score)

    # --------------------------
    # Flat metrics: log loss, accuracy, MCC
    # --------------------------
    try:
        metrics_scores['log_loss'].append(log_loss(y_true, y_proba))
    except ValueError:
        metrics_scores['log_loss'].append(np.nan)

    try:
        metrics_scores['accuracy'].append(accuracy_score(y_true, y_pred))
    except ValueError:
        metrics_scores['accuracy'].append(np.nan)

    try:
        metrics_scores['mcc'].append(matthews_corrcoef(y_true, y_pred))
    except ValueError:
        metrics_scores['mcc'].append(np.nan)

Evaluating model: KNeighborsUnif_BAG_L1
Evaluating model: KNeighborsDist_BAG_L1
Evaluating model: NeuralNetFastAI_BAG_L1
Evaluating model: LightGBMXT_BAG_L1
Evaluating model: WeightedEnsemble_L2
Evaluating model: NeuralNetFastAI_BAG_L2
Evaluating model: LightGBMXT_BAG_L2
Evaluating model: WeightedEnsemble_L3


In [7]:
# ------------------------------
# Save raw per-model scores
# ------------------------------
raw_data = {'model': model_names}

# Add multi-avg metric scores
for metric_name in ['f1', 'precision', 'recall', 'pr_auc']:
    for avg in metrics_scores[metric_name]:
        key = f'{metric_name}_{avg}'
        raw_data[key] = metrics_scores[metric_name][avg]

# Add ROC AUC (mode + average)
for key in metrics_scores['roc_auc']:
    raw_data[f'roc_auc_{key}'] = metrics_scores['roc_auc'][key]

# Add flat metrics
for metric_name in ['log_loss', 'accuracy', 'mcc']:
    raw_data[metric_name] = metrics_scores[metric_name]

# Add confusion matrix–derived counts (TP/FP/TN/FN)
for key in conf_matrix_counts[0].keys():
    raw_data[key] = [row.get(key, np.nan) for row in conf_matrix_counts]

# Create and display the DataFrame
df_raw = pd.DataFrame(raw_data)
display(df_raw)

Unnamed: 0,model,f1_micro,f1_macro,f1_weighted,precision_micro,precision_macro,precision_weighted,recall_micro,recall_macro,recall_weighted,pr_auc_micro,pr_auc_macro,pr_auc_weighted,roc_auc_ovo_micro,roc_auc_ovo_macro,roc_auc_ovo_weighted,roc_auc_ovr_micro,roc_auc_ovr_macro,roc_auc_ovr_weighted,log_loss,accuracy,mcc,confusion_matrix,tp_Benign,fp_Benign,tn_Benign,fn_Benign,tp_DDoS,fp_DDoS,tn_DDoS,fn_DDoS,tp_DoS,fp_DoS,tn_DoS,fn_DoS,tp_Reconnaissance,fp_Reconnaissance,tn_Reconnaissance,fn_Reconnaissance,tp_Theft,fp_Theft,tn_Theft,fn_Theft
0,KNeighborsUnif_BAG_L1,0.781392,0.584761,0.795679,0.781392,0.609998,0.812084,0.781392,0.565891,0.781392,0.914129,0.655794,0.852472,,0.879205,0.917835,0.971226,0.917845,0.942228,0.601456,0.781392,0.439584,"[[1890, 16, 9, 845, 0], [9, 2395, 7929, 917, 2], [5, 8645, 1679, 919, 2], [334, 4834, 1493, 86765, 18], [2, 33, 10, 9, 316]]",1890,350,115966,870,2395,13528,94296,8857,1679,9441,98385,9571,86765,2690,22942,6679,316,22,118684,54
1,KNeighborsDist_BAG_L1,0.757592,0.549085,0.773895,0.757592,0.573368,0.791274,0.757592,0.527882,0.757592,0.704454,0.547869,0.789873,,0.717916,0.770267,0.862634,0.742244,0.823862,3.628245,0.757592,0.378698,"[[2070, 14, 7, 669, 0], [4, 611, 9591, 1045, 1], [3, 9625, 576, 1045, 1], [420, 5317, 1065, 86637, 5], [0, 42, 6, 5, 317]]",2070,427,115889,690,611,14998,92826,10641,576,10669,97157,10674,86637,2764,22868,6807,317,7,118699,53
2,NeuralNetFastAI_BAG_L1,0.835131,0.468028,0.82958,0.835131,0.74607,0.836774,0.835131,0.440402,0.835131,0.953297,0.631809,0.883739,,0.932969,0.948352,0.986176,0.962531,0.966008,0.293863,0.835131,0.537093,"[[1180, 1, 0, 1579, 0], [5, 6143, 2996, 2108, 0], [0, 6126, 3035, 2089, 0], [11, 2839, 1510, 89084, 0], [0, 47, 0, 321, 2]]",1180,16,116300,1580,6143,9013,98811,5109,3035,4506,103320,8215,89084,6097,19535,4360,2,0,118706,368
3,LightGBMXT_BAG_L1,0.836147,0.523959,0.812853,0.836147,0.740551,0.805096,0.836147,0.449919,0.836147,0.954673,0.717653,0.887027,,0.939131,0.955495,0.986585,0.965679,0.968338,0.313131,0.836147,0.486159,"[[1309, 0, 0, 1451, 0], [10, 2278, 4228, 4735, 1], [5, 2421, 4019, 4805, 0], [23, 1352, 196, 91873, 0], [1, 0, 0, 283, 86]]",1309,39,116277,1451,2278,3773,104051,8974,4019,4424,103402,7231,91873,11274,14358,1571,86,1,118705,284
4,WeightedEnsemble_L2,0.835718,0.464794,0.826938,0.835718,0.746579,0.832756,0.835718,0.435789,0.835718,0.953653,0.643574,0.884476,,0.934096,0.949545,0.986297,0.96366,0.96662,0.293668,0.835718,0.531251,"[[1197, 1, 0, 1562, 0], [9, 6102, 2689, 2452, 0], [0, 6096, 2695, 2459, 0], [11, 2790, 1125, 89518, 0], [0, 47, 0, 321, 2]]",1197,20,116296,1563,6102,8934,98890,5150,2695,3814,104012,8555,89518,6794,18838,3926,2,0,118706,368
5,NeuralNetFastAI_BAG_L2,0.955659,0.899852,0.955835,0.955659,0.920772,0.956809,0.955659,0.883258,0.955659,0.995811,0.964428,0.990105,,0.994828,0.996018,0.99889,0.996948,0.997033,0.080445,0.955659,0.87945,"[[2224, 0, 0, 535, 1], [9, 9334, 1441, 468, 0], [6, 680, 10178, 386, 0], [112, 745, 856, 91729, 2], [0, 9, 11, 19, 331]]",2224,127,116189,536,9334,1434,106390,1918,10178,2308,105518,1072,91729,1408,24224,1715,331,3,118703,39
6,LightGBMXT_BAG_L2,0.787573,0.3669,0.694152,0.787573,0.355648,0.62066,0.787573,0.383777,0.787573,0.95379,0.957134,0.988724,,0.987732,0.993458,0.985055,0.993943,0.996005,0.599865,0.787573,0.110859,"[[0, 0, 0, 2760, 0], [0, 0, 0, 11252, 0], [0, 0, 0, 11250, 0], [0, 0, 0, 93441, 3], [0, 0, 0, 30, 340]]",0,0,116316,2760,0,0,107824,11252,0,0,107826,11250,93441,25292,340,3,340,3,118703,30
7,WeightedEnsemble_L3,0.955659,0.899852,0.955835,0.955659,0.920772,0.956809,0.955659,0.883258,0.955659,0.995811,0.964428,0.990105,,0.994828,0.996018,0.99889,0.996948,0.997033,0.080445,0.955659,0.87945,"[[2224, 0, 0, 535, 1], [9, 9334, 1441, 468, 0], [6, 680, 10178, 386, 0], [112, 745, 856, 91729, 2], [0, 9, 11, 19, 331]]",2224,127,116189,536,9334,1434,106390,1918,10178,2308,105518,1072,91729,1408,24224,1715,331,3,118703,39


In [8]:
# Identify columns to exclude
exclude_cols = df_raw.filter(regex=r'^[tf][pn]_').columns.tolist()
exclude_cols.append('confusion_matrix')

# Drop them
df_metrics = df_raw.drop(columns=exclude_cols)
display(df_metrics)

Unnamed: 0,model,f1_micro,f1_macro,f1_weighted,precision_micro,precision_macro,precision_weighted,recall_micro,recall_macro,recall_weighted,pr_auc_micro,pr_auc_macro,pr_auc_weighted,roc_auc_ovo_micro,roc_auc_ovo_macro,roc_auc_ovo_weighted,roc_auc_ovr_micro,roc_auc_ovr_macro,roc_auc_ovr_weighted,log_loss,accuracy,mcc
0,KNeighborsUnif_BAG_L1,0.781392,0.584761,0.795679,0.781392,0.609998,0.812084,0.781392,0.565891,0.781392,0.914129,0.655794,0.852472,,0.879205,0.917835,0.971226,0.917845,0.942228,0.601456,0.781392,0.439584
1,KNeighborsDist_BAG_L1,0.757592,0.549085,0.773895,0.757592,0.573368,0.791274,0.757592,0.527882,0.757592,0.704454,0.547869,0.789873,,0.717916,0.770267,0.862634,0.742244,0.823862,3.628245,0.757592,0.378698
2,NeuralNetFastAI_BAG_L1,0.835131,0.468028,0.82958,0.835131,0.74607,0.836774,0.835131,0.440402,0.835131,0.953297,0.631809,0.883739,,0.932969,0.948352,0.986176,0.962531,0.966008,0.293863,0.835131,0.537093
3,LightGBMXT_BAG_L1,0.836147,0.523959,0.812853,0.836147,0.740551,0.805096,0.836147,0.449919,0.836147,0.954673,0.717653,0.887027,,0.939131,0.955495,0.986585,0.965679,0.968338,0.313131,0.836147,0.486159
4,WeightedEnsemble_L2,0.835718,0.464794,0.826938,0.835718,0.746579,0.832756,0.835718,0.435789,0.835718,0.953653,0.643574,0.884476,,0.934096,0.949545,0.986297,0.96366,0.96662,0.293668,0.835718,0.531251
5,NeuralNetFastAI_BAG_L2,0.955659,0.899852,0.955835,0.955659,0.920772,0.956809,0.955659,0.883258,0.955659,0.995811,0.964428,0.990105,,0.994828,0.996018,0.99889,0.996948,0.997033,0.080445,0.955659,0.87945
6,LightGBMXT_BAG_L2,0.787573,0.3669,0.694152,0.787573,0.355648,0.62066,0.787573,0.383777,0.787573,0.95379,0.957134,0.988724,,0.987732,0.993458,0.985055,0.993943,0.996005,0.599865,0.787573,0.110859
7,WeightedEnsemble_L3,0.955659,0.899852,0.955835,0.955659,0.920772,0.956809,0.955659,0.883258,0.955659,0.995811,0.964428,0.990105,,0.994828,0.996018,0.99889,0.996948,0.997033,0.080445,0.955659,0.87945


In [9]:
df_conf_counts = df_raw[['model'] + df_raw.filter(regex='(_[tf][pn])|([tf][pn]_)').columns.tolist()]
display(df_conf_counts)

Unnamed: 0,model,tp_Benign,fp_Benign,tn_Benign,fn_Benign,tp_DDoS,fp_DDoS,tn_DDoS,fn_DDoS,tp_DoS,fp_DoS,tn_DoS,fn_DoS,tp_Reconnaissance,fp_Reconnaissance,tn_Reconnaissance,fn_Reconnaissance,tp_Theft,fp_Theft,tn_Theft,fn_Theft
0,KNeighborsUnif_BAG_L1,1890,350,115966,870,2395,13528,94296,8857,1679,9441,98385,9571,86765,2690,22942,6679,316,22,118684,54
1,KNeighborsDist_BAG_L1,2070,427,115889,690,611,14998,92826,10641,576,10669,97157,10674,86637,2764,22868,6807,317,7,118699,53
2,NeuralNetFastAI_BAG_L1,1180,16,116300,1580,6143,9013,98811,5109,3035,4506,103320,8215,89084,6097,19535,4360,2,0,118706,368
3,LightGBMXT_BAG_L1,1309,39,116277,1451,2278,3773,104051,8974,4019,4424,103402,7231,91873,11274,14358,1571,86,1,118705,284
4,WeightedEnsemble_L2,1197,20,116296,1563,6102,8934,98890,5150,2695,3814,104012,8555,89518,6794,18838,3926,2,0,118706,368
5,NeuralNetFastAI_BAG_L2,2224,127,116189,536,9334,1434,106390,1918,10178,2308,105518,1072,91729,1408,24224,1715,331,3,118703,39
6,LightGBMXT_BAG_L2,0,0,116316,2760,0,0,107824,11252,0,0,107826,11250,93441,25292,340,3,340,3,118703,30
7,WeightedEnsemble_L3,2224,127,116189,536,9334,1434,106390,1918,10178,2308,105518,1072,91729,1408,24224,1715,331,3,118703,39


In [10]:
df_conf_matrix = df_raw[['model', 'confusion_matrix']]
display(df_conf_matrix)

Unnamed: 0,model,confusion_matrix
0,KNeighborsUnif_BAG_L1,"[[1890, 16, 9, 845, 0], [9, 2395, 7929, 917, 2], [5, 8645, 1679, 919, 2], [334, 4834, 1493, 86765, 18], [2, 33, 10, 9, 316]]"
1,KNeighborsDist_BAG_L1,"[[2070, 14, 7, 669, 0], [4, 611, 9591, 1045, 1], [3, 9625, 576, 1045, 1], [420, 5317, 1065, 86637, 5], [0, 42, 6, 5, 317]]"
2,NeuralNetFastAI_BAG_L1,"[[1180, 1, 0, 1579, 0], [5, 6143, 2996, 2108, 0], [0, 6126, 3035, 2089, 0], [11, 2839, 1510, 89084, 0], [0, 47, 0, 321, 2]]"
3,LightGBMXT_BAG_L1,"[[1309, 0, 0, 1451, 0], [10, 2278, 4228, 4735, 1], [5, 2421, 4019, 4805, 0], [23, 1352, 196, 91873, 0], [1, 0, 0, 283, 86]]"
4,WeightedEnsemble_L2,"[[1197, 1, 0, 1562, 0], [9, 6102, 2689, 2452, 0], [0, 6096, 2695, 2459, 0], [11, 2790, 1125, 89518, 0], [0, 47, 0, 321, 2]]"
5,NeuralNetFastAI_BAG_L2,"[[2224, 0, 0, 535, 1], [9, 9334, 1441, 468, 0], [6, 680, 10178, 386, 0], [112, 745, 856, 91729, 2], [0, 9, 11, 19, 331]]"
6,LightGBMXT_BAG_L2,"[[0, 0, 0, 2760, 0], [0, 0, 0, 11252, 0], [0, 0, 0, 11250, 0], [0, 0, 0, 93441, 3], [0, 0, 0, 30, 340]]"
7,WeightedEnsemble_L3,"[[2224, 0, 0, 535, 1], [9, 9334, 1441, 468, 0], [6, 680, 10178, 386, 0], [112, 745, 856, 91729, 2], [0, 9, 11, 19, 331]]"


In [11]:
# ------------------------------
# Summary generation
# ------------------------------
def summarize_scores(metric_name, avg_label, scores):
    arr = np.array(scores)
    return {
        'metric': metric_name,
        'average': avg_label,
        'min': np.nanmin(arr),
        'max': np.nanmax(arr),
        'mean': np.nanmean(arr),
        'median': np.nanmedian(arr),
        'std': np.nanstd(arr)
    }

summary = []

# Summarize metrics with averaging
for metric_name in ['f1', 'precision', 'recall', 'pr_auc']:
    for avg, scores in metrics_scores[metric_name].items():
        summary.append(summarize_scores(metric_name, avg, scores))

# Summarize ROC AUC (with mode + average)
for key, scores in metrics_scores['roc_auc'].items():
    summary.append(summarize_scores('roc_auc', key, scores))

# Summarize non-averaged metrics
for metric_name in ['log_loss', 'accuracy', 'mcc']:
    summary.append(summarize_scores(metric_name, 'none', metrics_scores[metric_name]))

# Convert to DataFrame and print
df_summary = pd.DataFrame(summary)
print(df_summary.to_string(index=False))

   metric      average      min      max     mean   median      std
       f1        micro 0.757592 0.955659 0.843109 0.835424 0.070452
       f1        macro 0.366900 0.899852 0.594654 0.536522 0.186492
       f1     weighted 0.694152 0.955835 0.830596 0.819896 0.082727
precision        micro 0.757592 0.955659 0.843109 0.835424 0.070452
precision        macro 0.355648 0.920772 0.701720 0.743311 0.175395
precision     weighted 0.620660 0.956809 0.826533 0.822420 0.098984
   recall        micro 0.757592 0.955659 0.843109 0.835424 0.070452
   recall        macro 0.383777 0.883258 0.571272 0.488901 0.187732
   recall     weighted 0.757592 0.955659 0.843109 0.835424 0.070452
   pr_auc        micro 0.704454 0.995811 0.928202 0.953722 0.088060
   pr_auc        macro 0.547869 0.964428 0.760336 0.686723 0.162050
   pr_auc     weighted 0.789873 0.990105 0.908315 0.885751 0.069478
  roc_auc    ovo_micro      NaN      NaN      NaN      NaN      NaN
  roc_auc    ovo_macro 0.717916 0.994828 0.92258

  'min': np.nanmin(arr),
  'max': np.nanmax(arr),
  'mean': np.nanmean(arr),
  'median': np.nanmedian(arr),
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


In [17]:
base_eval

{'f1_weighted': 0.9558348750976805,
 'accuracy': 0.9556585709966744,
 'balanced_accuracy': np.float64(0.8832581967422815),
 'mcc': np.float64(0.8794498776616505),
 'confusion_matrix':                 Benign  DDoS    DoS  Reconnaissance  Theft
 Benign            2224     0      0             535      1
 DDoS                 9  9334   1441             468      0
 DoS                  6   680  10178             386      0
 Reconnaissance     112   745    856           91729      2
 Theft                0     9     11              19    331,
 'classification_report': {'Benign': {'precision': 0.9459804338579328,
   'recall': 0.8057971014492754,
   'f1-score': 0.8702797886910585,
   'support': 2760.0},
  'DDoS': {'precision': 0.8668276374442794,
   'recall': 0.8295414148595806,
   'f1-score': 0.8477747502270663,
   'support': 11252.0},
  'DoS': {'precision': 0.8151529713278872,
   'recall': 0.9047111111111111,
   'f1-score': 0.8576002696326256,
   'support': 11250.0},
  'Reconnaissance': {'p

In [18]:
def make_json_serializable(obj):
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_json_serializable(v) for v in obj]
    elif isinstance(obj, (np.integer, np.floating)):
        return obj.item()
    elif isinstance(obj, (np.ndarray, pd.Series)):
        return obj.tolist()
    elif isinstance(obj, pd.DataFrame):
        return obj.to_dict(orient='records')
    else:
        return obj

In [23]:
import json

# Convert all DataFrames to dicts
all_metrics_dict = {
    "leaderboard": leaderboard.to_dict(orient="records"),
    "base_eval": make_json_serializable(base_eval),
    "model_names": model_names,
    "metrics": df_metrics.to_dict(orient="records"),
    "confusion_counts": df_conf_counts.to_dict(orient="records"),
    "confusion_matrix": df_conf_matrix.to_dict(orient="records"),
    "summary": df_summary.to_dict(orient="records")
}

# Save to a single JSON file
with open(f"{BASE_FILENAME}_results.json", "w") as f:
    json.dump(all_metrics_dict, f, indent=2)