In [1]:
N_THREADS = 20
MAX_MEM_SIZE = "96G"
BASE_FILENAME = '2025-06-13/Multiclass/NIDS_NF-BoT-IoT_Multiclass'
TARGET_COL = 'label'
TRAIN_TEST_SPLIT = 0.8
MAX_RUNTIME_SECS = 60
BALANCE_CLASSES = True
N_FOLDS = 5
SEED = 42

In [2]:
import h2o

# Start the H2O cluster (locally)
h2o.init(nthreads=N_THREADS, max_mem_size=MAX_MEM_SIZE)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.27" 2025-04-15; OpenJDK Runtime Environment (build 11.0.27+6-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.27+6-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /home/automl/miniforge3/envs/h2o/lib/python3.12/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp6ddw0bc1
  JVM stdout: /tmp/tmp6ddw0bc1/h2o_automl_started_from_python.out
  JVM stderr: /tmp/tmp6ddw0bc1/h2o_automl_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,2 months and 19 days
H2O_cluster_name:,H2O_from_python_automl_ryaxbn
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,96 Gb
H2O_cluster_total_cores:,24
H2O_cluster_allowed_cores:,24


In [3]:
import json
import pandas as pd

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Step 1: Load parquet and metadata
df = pd.read_parquet(f'{BASE_FILENAME}.parquet')
with open(f'{BASE_FILENAME}.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Step 2: Apply dtypes from metadata to df
for col, dtype in metadata["dtypes"].items():
    df[col] = df[col].astype(dtype)

# Step 3: Create H2OFrame (string columns will be converted)
hf = h2o.H2OFrame(df)

# Step 4: Apply dtypes from metadata to hf
for col, dtype in metadata["dtypes"].items():
    if dtype == "category":
        hf[col] = hf[col].asfactor()
        assert hf[col].isfactor()[0], f"{col} not converted to factor"

# Step 5: Print result
print(f'{"column":<40}', f'{"metadata":<20}', f'{"df":<20}', f'{"hf":<20}')
for col in hf.columns:
    print(f'{col:<40}', f'{str(metadata["dtypes"][col]):<20}', f'{str(df[col].dtype):<20}', f'{str(hf[col].dtype):<20}')

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
column                                   metadata             df                   hf                  
L4_SRC_PORT                              int32                int32                int32               
L4_DST_PORT                              int32                int32                int32               
PROTOCOL                                 category             category             <U0                 
L7_PROTO                                 float32              float32              float64             
IN_BYTES                                 int32                int32                int32               
OUT_BYTES                                int32                int32                int32               
IN_PKTS                                  int32                int32                int32               
OUT_PKTS                                 int32                int32      

In [4]:
# Step 1: Split the data into 80% train (for CV) and 20% test (holdout)
train, test = hf.split_frame(ratios=[TRAIN_TEST_SPLIT], seed=SEED)

# Step 2: Define features
features = [col for col in hf.columns if col != TARGET_COL]

# Step 3: Run AutoML on the 80% training split
from h2o.automl import H2OAutoML

aml = H2OAutoML(
    max_runtime_secs=MAX_RUNTIME_SECS,
    balance_classes=BALANCE_CLASSES,
    keep_cross_validation_predictions=True,
    keep_cross_validation_fold_assignment=True,
    nfolds=N_FOLDS,
    seed=SEED
)

aml.train(x=features, y=TARGET_COL, training_frame=train)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),3/3
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# GLM base models (used / total),1/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,
Custom metalearner hyperparameters,

Benign,DDoS,DoS,Reconnaissance,Theft,Error,Rate
167.0,0.0,0.0,59.0,0.0,0.2610619,59 / 226
2.0,921.0,0.0,24.0,0.0,0.0274551,26 / 947
1.0,908.0,0.0,17.0,0.0,1.0,926 / 926
31.0,504.0,0.0,7188.0,1.0,0.0693941,"536 / 7,724"
1.0,1.0,0.0,3.0,21.0,0.1923077,5 / 26
202.0,2334.0,0.0,7291.0,22.0,0.1575794,"1,552 / 9,849"

k,hit_ratio
1,0.8424206
2,0.9455783
3,0.9991878
4,0.9998985
5,1.0

Benign,DDoS,DoS,Reconnaissance,Theft,Error,Rate
8166.0,7.0,0.0,2836.0,0.0,0.2582433,"2,843 / 11,009"
27.0,44317.0,0.0,712.0,0.0,0.0164018,"739 / 45,056"
19.0,43723.0,8.0,1178.0,10.0,0.999822,"44,930 / 44,938"
1006.0,24691.0,2.0,348401.0,9.0,0.0687179,"25,708 / 374,109"
26.0,127.0,0.0,94.0,1238.0,0.16633,"247 / 1,485"
9244.0,112865.0,10.0,353221.0,1257.0,0.1562473,"74,467 / 476,597"

k,hit_ratio
1,0.8437527
2,0.9456102
3,0.9991565
4,0.9999077
5,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.843752,0.0014131,0.8425459,0.8422112,0.8447483,0.8437234,0.8455312
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.156248,0.0014131,0.1574541,0.1577888,0.1552518,0.1562766,0.1544688
err_count,14893.4,127.84679,15009.0,15045.0,14809.0,14853.0,14751.0
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logloss,0.3538177,0.0022927,0.3554872,0.3568,0.3533642,0.3521231,0.3513141
max_per_class_error,0.9998224,0.0001264,0.999781,0.9996669,1.0,0.9998883,0.9997759
mean_per_class_accuracy,0.6980732,0.0040439,0.6993122,0.692386,0.6977174,0.6973236,0.7036269
mean_per_class_error,0.3019268,0.0040439,0.3006878,0.307614,0.3022826,0.3026764,0.2963731


In [5]:
algo_type = aml.leader.algo
assert algo_type is not None, "Wrong algorithm type (should be != None)"

problem_type = aml.leader._model_json['output']['model_category']
if BASE_FILENAME.endswith('Binary'):
    assert problem_type == "Binomial", "Wrong problem type for Binary classification"
elif BASE_FILENAME.endswith('Multiclass'):
    assert problem_type == "Multinomial", "Wrong problem type for Multiclass classification"

In [6]:
leaderboard_df = aml.leaderboard.as_data_frame(use_multi_thread=True)
leaderboard_df.to_json(f'{BASE_FILENAME}_leaderboard.json', index=False, orient="records", indent=2)
display(leaderboard_df)

Export File progress: |██████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,model_id,mean_per_class_error,logloss,rmse,mse
0,StackedEnsemble_BestOfFamily_1_AutoML_1_202506...,0.301903,0.353818,0.342139,0.117059
1,XGBoost_1_AutoML_1_20250616_15814,0.367809,0.712113,0.504607,0.254628
2,GLM_1_AutoML_1_20250616_15814,0.8,0.706035,0.457564,0.209365
3,GBM_1_AutoML_1_20250616_15814,0.8,0.636751,0.438544,0.192321


In [7]:
cv_metrics_df = aml.leader.cross_validation_metrics_summary().as_data_frame()
cv_metrics_df = cv_metrics_df.rename(columns={"": "metric"})
cv_metrics_df.to_json(f'{BASE_FILENAME}_results_cv.json', index=False, orient="records", indent=2)
display(cv_metrics_df)

Unnamed: 0,metric,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.843752,0.001413,0.842546,0.842211,0.844748,0.843723,0.845531
1,aic,,0.0,,,,,
2,auc,,0.0,,,,,
3,err,0.156248,0.001413,0.157454,0.157789,0.155252,0.156277,0.154469
4,err_count,14893.4,127.84679,15009.0,15045.0,14809.0,14853.0,14751.0
5,loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,logloss,0.353818,0.002293,0.355487,0.3568,0.353364,0.352123,0.351314
7,max_per_class_error,0.999822,0.000126,0.999781,0.999667,1.0,0.999888,0.999776
8,mean_per_class_accuracy,0.698073,0.004044,0.699312,0.692386,0.697717,0.697324,0.703627
9,mean_per_class_error,0.301927,0.004044,0.300688,0.307614,0.302283,0.302676,0.296373


In [8]:
import warnings

class ProbabilitySumToOneWarningSuppressor:
    def __enter__(self):
        self._catch = warnings.catch_warnings()
        self._catch.__enter__()
        warnings.filterwarnings(
            "ignore",
            message="The y_prob values do not sum to one. Make sure to pass probabilities.",
            category=UserWarning,
            module="sklearn.metrics._classification"
        )
    def __exit__(self, exc_type, exc_val, exc_tb):
        self._catch.__exit__(exc_type, exc_val, exc_tb)

In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, log_loss, roc_auc_score, average_precision_score,
    matthews_corrcoef, confusion_matrix, classification_report
)
from sklearn.preprocessing import label_binarize

def compute_oos_metrics(y_true, y_pred, y_prob=None):
    results = {}

    # Global metrics (non-probabilistic)
    metric_funcs = {
        "accuracy": lambda: accuracy_score(y_true, y_pred),
        "precision_micro": lambda: precision_score(y_true, y_pred, average="micro", zero_division=0),
        "precision_macro": lambda: precision_score(y_true, y_pred, average="macro", zero_division=0),
        "precision_weighted": lambda: precision_score(y_true, y_pred, average="weighted", zero_division=0),
        "recall_micro": lambda: recall_score(y_true, y_pred, average="micro", zero_division=0),
        "recall_macro": lambda: recall_score(y_true, y_pred, average="macro", zero_division=0),
        "recall_weighted": lambda: recall_score(y_true, y_pred, average="weighted", zero_division=0),
        "f1_micro": lambda: f1_score(y_true, y_pred, average="micro", zero_division=0),
        "f1_macro": lambda: f1_score(y_true, y_pred, average="macro", zero_division=0),
        "f1_weighted": lambda: f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "mcc": lambda: matthews_corrcoef(y_true, y_pred) if len(np.unique(y_true)) > 1 else None,
    }

    for name, func in metric_funcs.items():
        try:
            results[name] = func()
        except Exception as e:
            results[name] = f"error: {str(e)}"

    # Probabilistic metrics
    if y_prob is not None and len(np.unique(y_true)) > 1:
        classes = np.unique(y_true)

        if len(classes) == 2:
            # Binary case
            assert set(classes) == {"Benign", "Malign"}, f"Unexpected class labels: {classes}"
            try:
                pos_index = np.where(classes == 'Malign')[0][0]
                y_score = y_prob[:, pos_index]
            except Exception as e:
                y_score = None
                results["roc_auc"] = results["pr_auc"] = f"error: {str(e)}"

            prob_metrics = {
                "roc_auc": lambda: roc_auc_score(y_true, y_score),
                "pr_auc": lambda: average_precision_score(y_true, y_score, pos_label='Malign'),
                "log_loss": lambda: log_loss(y_true, y_prob)
            }
        else:
            # Multiclass case
            y_true_bin = label_binarize(y_true, classes=classes)
            prob_metrics = {
                "roc_auc_ovr_micro": lambda: roc_auc_score(y_true_bin, y_prob, average="micro", multi_class="ovr"),
                "roc_auc_ovr_macro": lambda: roc_auc_score(y_true_bin, y_prob, average="macro", multi_class="ovr"),
                "roc_auc_ovr_weighted": lambda: roc_auc_score(y_true_bin, y_prob, average="weighted", multi_class="ovr"),
                "pr_auc_micro": lambda: average_precision_score(y_true_bin, y_prob, average="micro"),
                "pr_auc_macro": lambda: average_precision_score(y_true_bin, y_prob, average="macro"),
                "pr_auc_weighted": lambda: average_precision_score(y_true_bin, y_prob, average="weighted"),
                "log_loss": lambda: log_loss(y_true, y_prob)
            }

        for name, func in prob_metrics.items():
            try:
                results[name] = func()
            except Exception as e:
                results[name] = f"error: {str(e)}"
    else:
        # If no probabilities or degenerate case
        for name in [
            "roc_auc", "pr_auc", "log_loss",
            "roc_auc_ovr_micro", "roc_auc_ovr_macro", "roc_auc_ovr_weighted",
            "pr_auc_micro", "pr_auc_macro", "pr_auc_weighted"
        ]:
            results[name] = None

    # Convert to DataFrame
    global_df = pd.DataFrame([results])

    # Per-class metrics
    report_dict = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    per_class_df = (
        pd.DataFrame(report_dict).T
        .reset_index()
        .rename(columns={"index": "label"})
    )
    per_class_df = per_class_df[per_class_df["label"] != "accuracy"]
    per_class_df["label"] = per_class_df["label"].astype(str)

    # Confusion matrix
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm, index=[f"true_{l}" for l in labels], columns=[f"pred_{l}" for l in labels])

    return {
        "global_metrics": global_df,
        "per_class_metrics": per_class_df,
        "confusion_matrix": cm_df
    }

In [10]:
preds_df = aml.leader.predict(test).as_data_frame(use_multi_thread=True)
y_true = test['label'].as_data_frame(use_multi_thread=True).values.flatten()
y_pred = preds_df["predict"].values
y_prob = preds_df.drop(columns=["predict"]).values

assert np.allclose(y_prob.sum(axis=1), 1.0, atol=1e-6)

with ProbabilitySumToOneWarningSuppressor():
    oos_metrics = compute_oos_metrics(y_true, y_pred, y_prob)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%


In [11]:
display(oos_metrics["global_metrics"])

Unnamed: 0,accuracy,precision_micro,precision_macro,precision_weighted,recall_micro,recall_macro,recall_weighted,f1_micro,f1_macro,f1_weighted,mcc,roc_auc_ovr_micro,roc_auc_ovr_macro,roc_auc_ovr_weighted,pr_auc_micro,pr_auc_macro,pr_auc_weighted,log_loss
0,0.842337,0.842337,0.65045,0.833001,0.842337,0.696697,0.842337,0.842337,0.645187,0.824857,0.627631,0.986526,0.968332,0.970216,0.95464,0.766892,0.898757,0.356408


In [12]:
display(oos_metrics["per_class_metrics"])

Unnamed: 0,label,precision,recall,f1-score,support
0,Benign,0.882581,0.734431,0.801719,2794.0
1,DDoS,0.390926,0.982863,0.559368,11204.0
2,DoS,0.0,0.0,0.0,11311.0
3,Reconnaissance,0.985281,0.931025,0.957385,93106.0
4,Theft,0.993464,0.835165,0.907463,364.0
6,macro avg,0.65045,0.696697,0.645187,118779.0
7,weighted avg,0.833001,0.842337,0.824857,118779.0


In [13]:
display(oos_metrics["confusion_matrix"])

Unnamed: 0,pred_Benign,pred_DDoS,pred_DoS,pred_Reconnaissance,pred_Theft
true_Benign,2052,3,0,739,0
true_DDoS,11,11012,0,181,0
true_DoS,6,10954,0,351,0
true_Reconnaissance,252,6168,0,86684,2
true_Theft,4,32,0,24,304


In [14]:
import json

# Convert all DataFrames to dicts
all_metrics_dict = {
    "global_metrics": oos_metrics["global_metrics"].to_dict(orient="records"),
    "per_class_metrics": oos_metrics["per_class_metrics"].to_dict(orient="records"),
    "confusion_matrix": {
        "index": oos_metrics["confusion_matrix"].index.tolist(),
        "columns": oos_metrics["confusion_matrix"].columns.tolist(),
        "data": oos_metrics["confusion_matrix"].values.tolist()
    }
}

# Save to a single JSON file
with open(f"{BASE_FILENAME}_results_oos.json", "w") as f:
    json.dump(all_metrics_dict, f, indent=2)

In [15]:
h2o.shutdown(prompt=False)

H2O session _sid_9a27 closed.


  h2o.shutdown(prompt=False)
