In [None]:
# --- Run configuration ---
# Trailing "# value" comments record the alternative (larger) settings left by the author.

# Input parquet file; a JSON file with the same stem is read later for dtype metadata.
dataset_path = '../../2025-07-05/Input_Multiclass/KDD_Cup_1999_Multiclass.parquet'
# Folder that receives metrics, confusion matrices, models, plots and CSV exports.
output_folder = "../../2025-07-05/Output_Multiclass_Refitter_Temp/KDD_Cup_1999_Multiclass/hpo_pareto_refit"
# Name of the class-label column.
target_column = 'label'
# How object-dtype feature columns are treated: 'keep', 'drop' or 'encode'.
handle_object_cols = 'keep'
# Fraction of the whole dataset to keep before any processing; a falsy value disables it.
sampling_rate_global = None # 0.10
# Per-(subset, class) sampling fraction read by sample_group().
sampling_rate_sets = 0.10
# Subsets that get down-sampled; the others are passed through untouched.
sample_sets = ['train']
# Minimum rows per class; incremented by the split loop when stratified splitting fails.
min_samples_per_class = 1
# Cumulative gain share that the selected features must reach.
feature_selection_threshold = 0.99
# Per-class quantile of the row mean used as the row-filtering cut-off.
sample_filtering_quantile = 0.10
# Not referenced in the cells shown here; presumably consumed by the HPO step - TODO confirm.
hpo_n_trials = 10 # 1000
hpo_timeout = 60 # 3600
# Boosting rounds and early-stopping patience passed to xgb.train / xgb.cv.
num_boost_round = 100 # 500
early_stopping_rounds = 10 # 50
# Not referenced in the cells shown here.
n_jobs = -1
# Seed for random, numpy, cupy, sampling and splitting.
random_state = 42
# Not referenced in the cells shown here.
plot_param_importances = False

In [None]:
from optuna.exceptions import ExperimentalWarning
import warnings

# Silence four whole warning categories for the rest of the kernel session.
# NOTE(review): the UserWarning filter is very broad and can hide genuine
# library diagnostics - confirm this is intended.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ExperimentalWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Step 1: Data Preprocessing

In [None]:
import cudf
import cupy as cp
import numpy as np
import pandas as pd
import random
import time

# Seed the Python, CuPy and NumPy global RNGs with the configured seed.
random.seed(random_state)
cp.random.seed(random_state)
np.random.seed(random_state)

# Wall-clock timestamp taken before any data is loaded.
global_start = time.time()

# Step 1: read dataset from parquet file
df_full = cudf.read_parquet(dataset_path)

# One (initially empty) dict per column; later cells add nunique / dtype /
# memory snapshots to it under the suffixes _1, _2 and _3.
col_info = {str(col): {} for col in df_full.columns}

df_full.shape

In [None]:
# Display the freshly loaded frame.
df_full

In [None]:
# Step 2: sample dataset if specified
# Skipped when sampling_rate_global is None or 0 (any falsy value).
if sampling_rate_global:
    df_full = df_full.sample(frac=sampling_rate_global, random_state=random_state)

# Snapshot 1: per-column cardinality, dtype and memory (bytes / 1024**2) after optional sampling.
for col in df_full.columns:
    col_info[str(col)].update({'nunique_1': df_full[col].nunique(), 'dtype_1': df_full[col].dtype, 'mem_1': df_full[col].memory_usage() / 1024**2})

df_full.shape

In [None]:
# Step 3: drop or encode object columns if specified
# 'keep'   -> leave object columns untouched
# 'drop'   -> remove every object-dtype feature column
# 'encode' -> low-cardinality columns become 'category'; the rest are hashed into 1024 buckets
if handle_object_cols != 'keep':
    # The target is excluded so that a string label column is never dropped or encoded here.
    object_cols = df_full.drop(columns=[target_column]).select_dtypes(include='object').columns
    if handle_object_cols == 'drop' and len(object_cols) > 0:
        df_full = df_full.drop(columns=object_cols)
    elif handle_object_cols == 'encode' and len(object_cols) > 0:
        for col in object_cols:
            n_unique = df_full[col].nunique()
            if n_unique <= 1000:
                df_full[col] = df_full[col].astype('category')
            else:
                # Hash high-cardinality strings. The seed comes from `random_state`:
                # the previous code used an undefined name `seed` and raised NameError.
                col_str = df_full[col].astype(str)
                hashed = col_str.hash_values(seed=random_state).astype('int64') + random_state
                df_full[col] = (hashed % 1024).astype('int16')

# Snapshot 2: per-column cardinality, dtype and memory (bytes / 1024**2) after object handling.
for col in df_full.columns:
    col_info[str(col)].update({'nunique_2': df_full[col].nunique(), 'dtype_2': df_full[col].dtype, 'mem_2': df_full[col].memory_usage() / 1024**2})

df_full.shape

In [None]:
import json

# The metadata file sits next to the parquet file and shares its stem.
with open(dataset_path.replace('.parquet', '.json'), 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Step 4: restore dtypes for other columns from metadata
# metadata["dtypes"] is read as a mapping of column name -> dtype; names that
# are no longer present in df_full are skipped.
for col, dtype in metadata["dtypes"].items():
    if col in df_full.columns:# and col not in object_cols:
        df_full[col] = df_full[col].astype(dtype)

# Snapshot 3: per-column cardinality, dtype and memory (bytes / 1024**2) after dtype restoration.
for col in df_full.columns:
    col_info[str(col)].update({'nunique_3': df_full[col].nunique(), 'dtype_3': df_full[col].dtype, 'mem_3': df_full[col].memory_usage() / 1024**2})

In [None]:
# One row per column, one column per recorded snapshot field.
pd.DataFrame.from_dict(col_info).T

In [None]:
# First ten rows after dtype restoration.
df_full.head(10)

In [None]:
# Class distribution of the raw (not yet factorized) target.
df_full[target_column].value_counts()

In [None]:
# Factorize target column: replace each label by its integer code and keep both lookup directions.
df_full[target_column], unique_values = df_full[target_column].factorize()
assert df_full[target_column].nunique() >= 2, "Classification requires two or more classes."
label_to_index = {value: i for i, value in enumerate(unique_values.to_pandas())}
index_to_label = {v: k for k, v in label_to_index.items()}
labels = list(label_to_index.keys())

# Store the codes in the narrowest signed integer type that can hold the largest code.
# An unconditional int8 cast would overflow for targets with more than 127 classes.
target_dtype = next(
    dt for dt in ('int8', 'int16', 'int32', 'int64')
    if len(unique_values) - 1 <= np.iinfo(dt).max
)
df_full[target_column] = df_full[target_column].astype(target_dtype)

# Record which feature columns are numeric and which are categorical (target excluded).
numeric_columns = df_full.drop(columns=[target_column]).select_dtypes(include=['number']).columns.tolist()
categorical_cols = df_full.drop(columns=[target_column]).select_dtypes(include=['category']).columns.tolist()

df_full[target_column].value_counts()

In [None]:
from cuml.model_selection import train_test_split

def ensure_min_samples_per_class(df, stratify_col, min_samples_per_class, random_state):
    """Oversample rare classes so each has at least `min_samples_per_class` rows.

    For every class whose row count is below the minimum, `min_samples_per_class`
    additional rows are drawn with replacement from that class and appended to
    the frame. The returned frame always carries a fresh 0..n-1 index.
    """
    counts_per_class = df[stratify_col].value_counts().to_pandas()
    rare_classes = counts_per_class[counts_per_class < min_samples_per_class].index

    # One extra chunk of resampled rows per under-represented class.
    extra_chunks = []
    for class_value in rare_classes:
        class_rows = df[df[stratify_col] == class_value]
        extra_chunks.append(
            class_rows.sample(n=min_samples_per_class, replace=True, random_state=random_state)
        )

    if not extra_chunks:
        return df.reset_index(drop=True)

    combined = cudf.concat([df] + extra_chunks, ignore_index=True)
    return combined.reset_index(drop=True)

def restore_dtypes(df, dtypes):
    """Cast the columns of `df`, in place, to the dtypes recorded in `dtypes`.

    `dtypes` must expose `to_dict()` returning column name -> dtype (for
    example the `.dtypes` attribute of a frame). Nothing is returned.
    """
    dtype_by_column = dtypes.to_dict()
    for column_name in dtype_by_column:
        df[column_name] = df[column_name].astype(dtype_by_column[column_name])

def assign_subsets(df, stratify_col, train_frac, val_frac, test_frac, random_state):
    """Split `df` into mutually exclusive, stratified train/val/test parts.

    The frame is first split into train versus the rest, then the rest is split
    into validation and test. Each part is tagged in a new "subset" column
    ("train", "val" or "test") and the parts are concatenated back together.

    Args:
        df: frame to split.
        stratify_col: column whose class proportions are preserved in each split.
        train_frac, val_frac, test_frac: fractions that must sum to 1.
        random_state: seed forwarded to both `train_test_split` calls.

    Returns:
        The concatenated frame with a fresh 0..n-1 index and a "subset" column.

    Raises:
        AssertionError: if the three fractions do not sum to 1.
    """
    import math

    # Tolerant comparison: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0 in
    # binary floating point and must not be rejected.
    assert math.isclose(train_frac + val_frac + test_frac, 1.0, rel_tol=0.0, abs_tol=1e-9), "Fractions must sum to 1"

    # Assign subset labels
    df_train, df_temp = train_test_split(df, test_size=(1 - train_frac), stratify=df[stratify_col], random_state=random_state)
    # The second split works on the remainder, so the test share is rescaled to it.
    df_val, df_test = train_test_split(df_temp, test_size=(test_frac / (val_frac + test_frac)), stratify=df_temp[stratify_col], random_state=random_state)

    df_train["subset"] = "train"
    df_val["subset"] = "val"
    df_test["subset"] = "test"

    return cudf.concat([df_train, df_val, df_test]).reset_index(drop=True)

def sample_group(x):
    """Down-sample one (subset, class) group using the notebook-level settings.

    The target size is `sampling_rate_sets` times the group size (truncated),
    but never below `min_samples_per_class`. Rows are drawn with replacement
    only when the group is smaller than the target size.
    """
    group_size = len(x)
    target_size = int(group_size * sampling_rate_sets)
    if target_size < min_samples_per_class:
        target_size = min_samples_per_class
    needs_replacement = group_size < target_size
    return x.sample(n=target_size, replace=needs_replacement, random_state=random_state)

def stratified_sample(df, stratify_col, sample_sets, sampling_rate_sets, min_samples_per_class, random_state):
    """Down-sample selected subsets per class while keeping every class present.

    Rows are grouped by ("subset", `stratify_col`). Groups whose subset is in
    `sample_sets` are reduced to `sampling_rate_sets` times their size (at least
    `min_samples_per_class` rows, drawn with replacement only when the group is
    smaller than that); all other groups are returned unchanged.

    The sampling rate, minimum and seed are taken from this function's
    arguments. Previously they were ignored in favour of notebook globals.

    Args:
        df: frame containing a "subset" column and `stratify_col`.
        stratify_col: class-label column.
        sample_sets: subset names to down-sample.
        sampling_rate_sets: fraction of each selected group to keep.
        min_samples_per_class: lower bound on rows kept per selected group.
        random_state: seed for the per-group sampling.

    Returns:
        The sampled frame with a fresh 0..n-1 index.
    """
    def _sample_one_group(group):
        # Same rule as sample_group(), but bound to the arguments of this call.
        n_samples = max(min_samples_per_class, int(len(group) * sampling_rate_sets))
        return group.sample(n=n_samples, replace=len(group) < n_samples, random_state=random_state)

    # Apply stratified sampling only for the requested subsets
    df_sampled = df.groupby(["subset", stratify_col], group_keys=False).apply(
        lambda x: _sample_one_group(x) if x["subset"].iloc[0] in sample_sets else x
    )

    return df_sampled.reset_index(drop=True)

In [None]:
# Dtypes before splitting; compared against the result at the end of the cell.
df_dtypes_before = df_full.dtypes.copy(deep=True).sort_index()

# Ensure enough samples before splitting.
# Each pass oversamples rare classes, replaces categoricals by integer codes
# for the splitter, attempts the stratified split, and then restores the
# original dtypes. A ValueError from the split raises the per-class minimum
# by one and retries.
split_ok = False
while not split_ok:
    df_full = ensure_min_samples_per_class(df_full, target_column, min_samples_per_class, random_state)
    df_full = df_full.reset_index(drop=True)  # Reset index before splitting
    df_dtypes_backup = df_full.dtypes.copy(deep=True)

    # Factorize categorical columns for stratification, remembering each column's
    # value list so that the codes can be mapped back afterwards.
    category_mappings = {}

    for col, dtype in df_full.dtypes.to_dict().items():
        if dtype == 'category':
            codes, uniques = df_full[col].factorize()
            df_full[col] = codes.astype('int32')
            category_mappings[col] = uniques.to_pandas().tolist()

    try:
        # Assign mutually exclusive train/val/test subsets. The configured target
        # column is used here instead of a hard-coded "label".
        df_full = assign_subsets(df_full, target_column, train_frac=0.6, val_frac=0.2, test_frac=0.2, random_state=random_state)
        split_ok = True  # If it succeeds, exit loop
    except ValueError as e:
        # Show the reason so that failures unrelated to class counts are visible.
        print(f"Resampling due to insufficient class representation (min_samples_per_class={min_samples_per_class}): {e}")
        min_samples_per_class += 1  # Increment dynamically and retry

    # Runs after success and after failure, so df_full never keeps the temporary codes.
    for col, dtype in df_dtypes_backup.items():
        if col in category_mappings:
            # Restore category values from factorized codes
            mapping = dict(enumerate(category_mappings[col]))
            df_full[col] = df_full[col].map(mapping).astype('category')
        else:
            # Restore other dtypes (numeric, bool, etc.)
            df_full[col] = df_full[col].astype(dtype)

print(f"Minimal oversampling completed successfully (min_samples_per_class={min_samples_per_class}).")

# 'subset' was added by assign_subsets and is excluded from the comparison.
df_dtypes_after = df_full.dtypes.copy(deep=True).drop('subset').sort_index()

assert df_dtypes_before.equals(df_dtypes_after), "Dtype mismatch after assigning subsets"

In [None]:
# Fill missing categorical values
def fill_categorical_nas(dfs):
    """Currently a no-op: returns None immediately and leaves `dfs` untouched.

    The statements after the early return are unreachable. They describe the
    disabled behaviour: replacing missing values in categorical columns by the
    category "missing".
    """
    return None
    for frame in dfs:
        categorical_columns = frame.select_dtypes(include=['category']).columns
        for column_name in categorical_columns:
            host_values = frame[column_name].to_pandas()
            if host_values.isna().sum() > 0:
                frame[column_name] = cudf.Series(host_values.astype(str).replace("nan", "missing").astype("category"))

# Extract final datasets: one frame per subset, without the helper 'subset' column.
df_train_full = df_full[df_full["subset"] == "train"].drop(columns=["subset"])
df_val_full = df_full[df_full["subset"] == "val"].drop(columns=["subset"])
df_test_full = df_full[df_full["subset"] == "test"].drop(columns=["subset"])

# Convert back to X, y format
X_train_full, y_train_full = df_train_full.drop(columns=[target_column]), df_train_full[target_column]
X_val_full, y_val_full = df_val_full.drop(columns=[target_column]), df_val_full[target_column]
X_test_full, y_test_full = df_test_full.drop(columns=[target_column]), df_test_full[target_column]

fill_categorical_nas([X_train_full, X_val_full, X_test_full])

# Ensure disjoint splits (the three frames keep the row index of df_full)
assert set(X_train_full.to_pandas().index).isdisjoint(set(X_val_full.to_pandas().index)), "Train and Validation sets are not disjoint!"
assert set(X_train_full.to_pandas().index).isdisjoint(set(X_test_full.to_pandas().index)), "Train and Test sets are not disjoint!"
assert set(X_val_full.to_pandas().index).isdisjoint(set(X_test_full.to_pandas().index)), "Validation and Test sets are not disjoint!"

# Ensure all classes are present in each split
assert y_train_full.nunique() == df_full[target_column].nunique(), "Some classes are missing in Train!"
assert y_val_full.nunique() == df_full[target_column].nunique(), "Some classes are missing in Validation!"
assert y_test_full.nunique() == df_full[target_column].nunique(), "Some classes are missing in Test!"

# Print final distributions. Each line lists the classes of its own split;
# the Validation and Test lines previously listed the Train classes.
print(f"Train      : {len(df_train_full)} samples ({(100.0 * len(df_train_full) / len(df_full)):.2f}%), {y_train_full.nunique()} unique classes ({sorted(y_train_full.to_pandas().unique().tolist())})")
print(f"Validation : {len(df_val_full)} samples ({(100.0 * len(df_val_full) / len(df_full)):.2f}%), {y_val_full.nunique()} unique classes ({sorted(y_val_full.to_pandas().unique().tolist())})")
print(f"Test       : {len(df_test_full)} samples ({(100.0 * len(df_test_full) / len(df_full)):.2f}%), {y_test_full.nunique()} unique classes ({sorted(y_test_full.to_pandas().unique().tolist())})")

# Step 2: Model and Metrics

In [None]:
import os
import tempfile
import xgboost as xgb

from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    precision_score, recall_score, f1_score,
    matthews_corrcoef, cohen_kappa_score,
    log_loss, roc_auc_score, confusion_matrix
)

# Number of distinct target codes; it selects the binary or multi-class objective and metric below.
num_classes = df_full[target_column].nunique()
loss_function = 'binary:logistic' if num_classes == 2 else 'multi:softprob'
eval_metric = 'logloss' if num_classes == 2 else 'mlogloss'
# Cross-validation settings, fed into default_cv_params as nfold / shuffle / stratified.
n_folds, shuffle, stratify = 5, True, True

def evaluate_multiclass_metrics(
    y_true,
    y_pred,
    y_prob=None,
    labels=None,
    label_names=None
):
    """Compute classification metrics and a labelled confusion matrix.

    Args:
        y_true: true class indices.
        y_pred: predicted class indices.
        y_prob: optional predicted probabilities; enables log loss and ROC AUC.
        labels: optional list of class indices; required for the ROC AUC
            metrics and used to order the confusion matrix. Defaults to the
            unique values of `y_true` for the confusion matrix.
        label_names: optional display names for the confusion-matrix axes, in
            the same order as `labels`.

    Returns:
        (metrics, conf_df): a dict of metric name -> value, and a DataFrame
        with "True_<label>" rows and "Pred_<label>" columns.

    Probability-based metrics that cannot be computed are stored as NaN instead
    of raising. Each ROC AUC variant is attempted on its own, so an unsupported
    combination no longer wipes out a value that was computed successfully.
    """
    metrics = {}

    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    metrics["mcc"] = matthews_corrcoef(y_true, y_pred)
    metrics["cohen_kappa"] = cohen_kappa_score(y_true, y_pred)

    for avg in ["micro", "macro", "weighted"]:
        metrics[f"precision_{avg}"] = precision_score(y_true, y_pred, average=avg, zero_division=0)
        metrics[f"recall_{avg}"] = recall_score(y_true, y_pred, average=avg, zero_division=0)
        metrics[f"f1_{avg}"] = f1_score(y_true, y_pred, average=avg, zero_division=0)

    if y_prob is not None:
        try:
            metrics["log_loss"] = log_loss(y_true, y_prob, labels=labels)
        except Exception:
            # Best effort: keep the remaining metrics (KeyboardInterrupt still propagates).
            metrics["log_loss"] = np.nan

    if y_prob is not None and labels is not None and len(np.unique(y_true)) > 1:
        for avg in ["micro", "macro", "weighted"]:
            for scheme in ("ovr", "ovo"):
                key = f"roc_auc_{scheme}_{avg}"
                try:
                    metrics[key] = roc_auc_score(y_true, y_prob, multi_class=scheme, average=avg, labels=labels)
                except Exception:
                    # Only this variant becomes NaN; its sibling keeps its own result.
                    metrics[key] = np.nan

    if labels is None:
        labels = np.unique(y_true)
    conf = confusion_matrix(y_true, y_pred, labels=labels)
    display_labels = label_names if label_names is not None else labels
    conf_df = pd.DataFrame(conf, index=[f"True_{l}" for l in display_labels],
                                 columns=[f"Pred_{l}" for l in display_labels])

    return metrics, conf_df

def f1_weighted_eval(preds, dtrain):
    """Custom evaluation callback returning ("f1_weighted", score).

    `preds` are turned into class labels (argmax over the per-row scores when
    there are more than two classes, a 0.5 cut-off otherwise) and scored
    against the labels stored in `dtrain`. Sample weights from `dtrain` are
    applied when any are present.
    """
    y_true = dtrain.get_label()
    weights = dtrain.get_weight()

    if num_classes <= 2:
        y_pred = (preds > 0.5).astype(int)
    else:
        per_row_scores = preds.reshape(y_true.shape[0], -1)
        y_pred = np.argmax(per_row_scores, axis=1)

    has_weights = weights is not None and len(weights) > 0
    if has_weights:
        return "f1_weighted", f1_score(y_true, y_pred, average="weighted", sample_weight=weights)
    return "f1_weighted", f1_score(y_true, y_pred, average="weighted")

def get_model_size(model):
    """Return the serialized size of `model` in MiB, rounded to two decimals.

    The model is written to a temporary ".ubj" file through
    `model.save_model(path)`, the file is measured, and it is always removed
    afterwards, even if saving or measuring raises.
    """
    # Only the file name is needed; the handle is closed before the model writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".ubj") as temp_model:
        model_path = temp_model.name
    try:
        model.save_model(model_path)
        model_size_mb = os.path.getsize(model_path) / (1024 * 1024)
    finally:
        # delete=False means cleanup is on us; do it on every exit path.
        if os.path.exists(model_path):
            os.remove(model_path)
    return round(model_size_mb, 2)

# Default XGB Booster parameters
# NOTE(review): "early_stopping_rounds" and "num_boost_round" are also passed
# separately to xgb.train / xgb.cv through the dicts below; they look like
# training-loop arguments rather than booster parameters - confirm they are
# needed in this dict.
default_booster_params = {
    "objective": loss_function,                     # 'binary:logistic' or 'multi:softprob'
    "early_stopping_rounds": early_stopping_rounds, # Number of rounds before early stopping
    "num_boost_round": num_boost_round,             # Number of boosting iterations
    "eval_metric": eval_metric,                     # 'logloss' or 'mlogloss'
    "device": 'cuda'                                # GPU (CUDA)
}
if num_classes > 2:
    default_booster_params["num_class"] = num_classes # Only for multi-class classification

# Default XGB CV parameters (keyword arguments for xgb.cv)
default_cv_params = {
    "params": default_booster_params,               # Booster parameters
    "early_stopping_rounds": early_stopping_rounds, # Number of rounds before early stopping
    "num_boost_round": num_boost_round,             # Number of iterations
    "nfold": n_folds,                               # Number of CV folds
    "shuffle": shuffle,                             # Shuffle samples before creating folds
    "stratified": stratify,                         # Stratify classes on CV split
    "metrics": eval_metric,                         # Log loss
    "feval": f1_weighted_eval,                      # Monitor weighted F1 score
    "seed": random_state                            # Seed for the fold assignment
}

# Default XGB training parameters (keyword arguments for xgb.train)
default_train_params = {
    "early_stopping_rounds": early_stopping_rounds, # Number of rounds before early stopping
    "num_boost_round": num_boost_round,             # Number of iterations
}

# Function to Train, Validate, and Test XGBoost Model
def _ensure_parent_dir(path):
    """Create the parent directory of `path`; a bare filename needs none."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def train_xgb(
    X_train, X_val, X_test, y_train, y_val, y_test,
    sample_weights=None,
    cv=False,
    custom_booster_params=None,
    metrics_filename: str = None,
    cm_filename: str = None,
    model_filename: str = None,
    verbose=1
):
    """Train an XGBoost model, either cross-validated or with a hold-out test.

    Args:
        X_train, y_train: training features and labels.
        X_val, y_val: validation set used for early stopping (required when
            `cv` is False, may be None when `cv` is True).
        X_test, y_test: test set used for the reported metrics (required when
            `cv` is False, may be None when `cv` is True).
        sample_weights: optional per-row training weights, same length as `y_train`.
        cv: True runs `xgb.cv` on the training data only; False trains one
            model and evaluates it on the test set.
        custom_booster_params: booster parameters that override `default_booster_params`.
        metrics_filename: where to write the CV results (CSV) or metrics (JSON).
        cm_filename: where to write the confusion matrix CSV (non-CV only).
        model_filename: where to save the trained model (non-CV only).
        verbose: forwarded as `verbose_eval` to `xgb.cv` and `xgb.train`.

    Returns:
        (model, training_time, latency, f1_score_ans, model_size).
        With `cv=True`: model and model_size are None, training_time is the
        total time divided by the number of folds, latency is that value per
        training row, and the F1 value is the mean of 'test-f1_weighted-mean'
        over all rounds. With `cv=False`: latency is prediction time per test
        row and the F1 value is the weighted F1 on the test set.

    Raises:
        ValueError: if `cv` is False and the validation or test set is missing.
    """
    # Warmup verification
    assert sample_weights is None or len(sample_weights) == len(y_train), "Sample weights must align with training labels"
    if not cv and (X_val is None or X_test is None):
        raise ValueError("Validation and test sets are required when cv=False")

    # Create DMatrix with feature names
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True, feature_names=X_train.columns.tolist(), weight=sample_weights)
    dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True, feature_names=X_val.columns.tolist()) if X_val is not None else None
    dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True, feature_names=X_test.columns.tolist()) if X_test is not None else None

    # Merge default booster params with custom ones
    booster_params = {**default_booster_params, **(custom_booster_params or {})}

    if cv:
        cv_params = {
            **default_cv_params,
            "params": booster_params,
            "verbose_eval": verbose
        }
        training_start = time.time()
        results = xgb.cv(dtrain=dtrain, **cv_params)
        training_end = time.time()
        # Average time per fold, then per training row.
        training_time = (training_end - training_start) / n_folds
        latency = training_time / len(y_train)
        f1_score_ans = float(results['test-f1_weighted-mean'].mean())
        model, model_size = None, None

        # Persist the per-round CV results
        if metrics_filename:
            _ensure_parent_dir(metrics_filename)
            results.to_csv(metrics_filename, index=True)

    else:
        training_start = time.time()
        model = xgb.train(
            params=booster_params,
            dtrain=dtrain,
            evals=[(dtrain, "Train"), (dval, "Validation")],
            verbose_eval=verbose,  # honour `verbose` here as well (it was ignored before)
            **default_train_params
        )
        training_end = time.time()
        training_time = training_end - training_start

        # Predict
        test_start = time.time()
        y_pred_probs = model.predict(dtest)
        y_pred = np.argmax(y_pred_probs, axis=1) if num_classes > 2 else (y_pred_probs > 0.5).astype(int)
        test_end = time.time()
        test_time = test_end - test_start

        # Prediction time per test row.
        latency = test_time / len(y_test)

        model_size = get_model_size(model)

        metrics_dict, confusion_df = evaluate_multiclass_metrics(
            y_true=y_test.to_numpy(),
            y_pred=y_pred,
            y_prob=y_pred_probs,
            labels=list(index_to_label.keys()),
            label_names=[index_to_label[i] for i in index_to_label]
        )
        metrics_dict.update({
            'training_time': training_time, 'test_time': test_time, 'latency': latency, 'model_size': model_size
        })

        f1_score_ans = metrics_dict['f1_weighted']

        # Persist the results
        if metrics_filename:
            _ensure_parent_dir(metrics_filename)
            with open(metrics_filename, "w", encoding="utf-8") as f:
                json.dump(metrics_dict, f, indent=2)
        if cm_filename:
            _ensure_parent_dir(cm_filename)
            confusion_df.to_csv(cm_filename, index=True)
        if model_filename:
            _ensure_parent_dir(model_filename)
            model.save_model(model_filename)

    return model, training_time, latency, f1_score_ans, model_size

In [None]:
# train_xgb(X_train_full, None, None, y_train_full, None, None, cv=True)

In [None]:
# train_xgb(X_train_full, X_val_full, X_test_full, y_train_full, y_val_full, y_test_full, cv=False)

# Step 3: Baseline Evaluation (full dataset)

In [None]:
# Train and evaluate using train_xgb function
# Baseline: the full train/val/test splits, default booster parameters.
# Metrics, confusion matrix and model are written under output_folder.
model_full, train_time_full, latency_full, f1_weighted_full, model_size_full = train_xgb(
    X_train_full, X_val_full, X_test_full,
    y_train_full, y_val_full, y_test_full,
    cv=False,  # Cross-validation only used during HPO
    metrics_filename=f"{output_folder}/xgb_full_metrics.json",
    cm_filename=f"{output_folder}/xgb_full_cm.csv",
    model_filename=f"{output_folder}/xgb_full_model.json"
)

# Print results (latency is prediction time per test row)
print(f"Training Time (full): {train_time_full:.3f} seconds")
print(f"Latency (full): {latency_full:.2e} seconds")
print(f"Weighted F1-Score (full): {f1_weighted_full:.6f}")
print(f"Model Size: {model_size_full} MB")

# Step 4: Updated Evaluation (with Sampling)

In [None]:
# Apply stratified sampling only to the selected subsets
df_sampled = stratified_sample(df_full, target_column, sample_sets, sampling_rate_sets, min_samples_per_class, random_state)

# Extract final datasets: one frame per subset, without the helper 'subset' column.
df_train_sampled = df_sampled[df_sampled["subset"] == "train"].drop(columns=["subset"])
df_val_sampled = df_sampled[df_sampled["subset"] == "val"].drop(columns=["subset"])
df_test_sampled = df_sampled[df_sampled["subset"] == "test"].drop(columns=["subset"])

# Convert back to X, y format
X_train_sampled, y_train_sampled = df_train_sampled.drop(columns=[target_column]), df_train_sampled[target_column]
X_val_sampled, y_val_sampled = df_val_sampled.drop(columns=[target_column]), df_val_sampled[target_column]
X_test_sampled, y_test_sampled = df_test_sampled.drop(columns=[target_column]), df_test_sampled[target_column]

fill_categorical_nas([X_train_sampled, X_val_sampled, X_test_sampled])

# Ensure disjoint splits (the three frames keep the row index of df_sampled)
assert set(X_train_sampled.to_pandas().index).isdisjoint(set(X_val_sampled.to_pandas().index)), "Train and Validation sets are not disjoint!"
assert set(X_train_sampled.to_pandas().index).isdisjoint(set(X_test_sampled.to_pandas().index)), "Train and Test sets are not disjoint!"
assert set(X_val_sampled.to_pandas().index).isdisjoint(set(X_test_sampled.to_pandas().index)), "Validation and Test sets are not disjoint!"

# Ensure all classes are present in each split
assert y_train_sampled.nunique() == df_sampled[target_column].nunique(), "Some classes are missing in Train!"
assert y_val_sampled.nunique() == df_sampled[target_column].nunique(), "Some classes are missing in Validation!"
assert y_test_sampled.nunique() == df_sampled[target_column].nunique(), "Some classes are missing in Test!"

# Print final distributions. Percentages are relative to the unsampled df_full.
# Each line lists the classes of its own split; the Validation and Test lines
# previously listed the Train classes.
print(f"Train      : {len(df_train_sampled)} samples ({(100.0 * len(df_train_sampled) / len(df_full)):.2f}%), {y_train_sampled.nunique()} unique classes ({sorted(y_train_sampled.to_pandas().unique().tolist())})")
print(f"Validation : {len(df_val_sampled)} samples ({(100.0 * len(df_val_sampled) / len(df_full)):.2f}%), {y_val_sampled.nunique()} unique classes ({sorted(y_val_sampled.to_pandas().unique().tolist())})")
print(f"Test       : {len(df_test_sampled)} samples ({(100.0 * len(df_test_sampled) / len(df_full)):.2f}%), {y_test_sampled.nunique()} unique classes ({sorted(y_test_sampled.to_pandas().unique().tolist())})")

In [None]:
# Train and evaluate using train_xgb function
# Same setup as the baseline, but on the stratified-sampled splits.
# model_sampled is reused by the feature-importance cells that follow.
model_sampled, train_time_sampled, latency_sampled, f1_weighted_sampled, model_size_sampled = train_xgb(
    X_train_sampled, X_val_sampled, X_test_sampled,
    y_train_sampled, y_val_sampled, y_test_sampled,
    cv=False,  # Cross-validation only used during HPO
    metrics_filename=f"{output_folder}/xgb_sampled_metrics.json",
    cm_filename=f"{output_folder}/xgb_sampled_cm.csv",
    model_filename=f"{output_folder}/xgb_sampled_model.json"
)

# Print results (latency is prediction time per test row)
print(f"Training Time (sampled): {train_time_sampled:.3f} seconds")
print(f"Latency (sampled): {latency_sampled:.2e} seconds")
print(f"Weighted F1-Score (sampled): {f1_weighted_sampled:.6f}")
print(f"Model Size: {model_size_sampled} MB")

# Step 5: Updated Evaluation (with Sampling and Feature Selection)

In [None]:
import matplotlib.pyplot as plt

# Calculate and sort features by importance
# get_score returns a feature -> gain mapping for the sampled model; it is
# sorted here from highest to lowest gain.
feature_importance = model_sampled.get_score(importance_type="gain")
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
# NOTE(review): zip(*...) raises ValueError if the model reports no feature at all.
features, importance = zip(*sorted_features)

# Create the plot
plt.figure(figsize=(10, 8))
bars = plt.barh(features, importance, color='skyblue')

# Annotate each bar with its importance value
for bar, value in zip(bars, importance):
    plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f" {value:.3f}", va='center')

# Labels and title
plt.xlabel("Gain (Importance)")
plt.ylabel("Features")
plt.title("Feature Importance (Gain)")
plt.gca().invert_yaxis()  # Highest-gain feature at the top
plt.grid(axis='x', linestyle='--', alpha=0.7)  # Add grid for better readability

# Save before show(); output_folder was already created by the earlier train_xgb calls.
plt.savefig(f"{output_folder}/xgb_feature_importance_gain.pdf", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Get feature importance
feature_importance = model_sampled.get_score(importance_type="gain")
importance_df = cudf.DataFrame(list(feature_importance.items()), columns=["Feature", "Importance"])
importance_df = importance_df.sort_values(by="Importance", ascending=False)

# Normalize importance scores: running share of the total gain, most important feature first
importance_df["Cumulative_Importance"] = importance_df["Importance"].cumsum() / importance_df["Importance"].sum()

# Find the smallest set of features whose cumulative share reaches
# feature_selection_threshold. argmax returns the first position where the
# condition holds; +1 turns that position into a count.
N = np.argmax(importance_df["Cumulative_Importance"] >= feature_selection_threshold) + 1  # Adjust threshold as needed

# Select top features (positional slice of the sorted frame)
selected_features = importance_df["Feature"][:N].to_pandas().tolist()

print(f"Optimal number of features: {N} (from {len(feature_importance)})")

importance_df.to_csv(f"{output_folder}/xgb_feature_importance_gain.csv", index=False)
importance_df

In [None]:
# Reduce the feature space for train, validation, and test sets
# Columns come out in the order of selected_features (descending gain).
X_train_reduced = X_train_sampled[selected_features]
X_val_reduced = X_val_sampled[selected_features]
X_test_reduced = X_test_sampled[selected_features]

fill_categorical_nas([X_train_reduced, X_val_reduced, X_test_reduced])

# Train and evaluate using train_xgb function
# Same sampled rows and labels as before; only the feature set is smaller.
model_reduced, train_time_reduced, latency_reduced, f1_weighted_reduced, model_size_reduced = train_xgb(
    X_train_reduced, X_val_reduced, X_test_reduced,
    y_train_sampled, y_val_sampled, y_test_sampled,
    cv=False,  # Cross-validation only used during HPO
    metrics_filename=f"{output_folder}/xgb_reduced_metrics.json",
    cm_filename=f"{output_folder}/xgb_reduced_cm.csv",
    model_filename=f"{output_folder}/xgb_reduced_model.json"
)

# Print results (latency is prediction time per test row)
print(f"Training Time (reduced): {train_time_reduced:.3f} seconds")
print(f"Latency (reduced): {latency_reduced:.2e} seconds")
print(f"Weighted F1-Score (reduced): {f1_weighted_reduced:.6f}")
print(f"Model Size (reduced): {model_size_reduced} MB")

# Step 6: Updated Evaluation (with Sampling, Feature Selection, and Row Filtering)

In [None]:
def convert_categorical_to_frequency(df, normalize=True):
    """Frequency-encode every categorical column of `df`.

    Each category is replaced by how often it occurs in the column: its share
    of `len(df)` when `normalize` is true, its raw count otherwise. The input
    frame is left untouched.

    Returns:
        (df_encoded, category_mappings, row_mappings) where
        - df_encoded is a copy of `df` with categorical columns as float32,
        - category_mappings maps column -> {category: frequency},
        - row_mappings maps column -> the original column with a 0..n-1 index,
          kept so that the encoding can be undone row by row.
    """
    df_encoded = df.copy()
    category_mappings, row_mappings = {}, {}
    n_rows = len(df)

    categorical_columns = df_encoded.select_dtypes(include=['category']).columns
    for col in categorical_columns:
        host_values = df[col].to_pandas()

        # Occurrence counts, optionally scaled to shares of the row count.
        counts = host_values.value_counts().astype('float32')
        frequencies = counts / n_rows if normalize else counts
        freq_map = frequencies.to_dict()

        # Replace each category by its frequency.
        df_encoded[col] = host_values.map(freq_map).astype('float32')

        # Keep both the category-level and the row-level mapping.
        category_mappings[col] = freq_map
        row_mappings[col] = df[col].reset_index(drop=True)

    return df_encoded, category_mappings, row_mappings


def revert_frequency_encoding(df_encoded, row_mappings):
    """Put the original categorical values back into a frequency-encoded frame.

    A copy of `df_encoded` with a fresh 0..n-1 index is returned. Every column
    named in `row_mappings` that exists in the frame is overwritten with the
    stored series (assignment aligns on the index). Mapping entries for
    columns not in the frame are ignored.
    """
    df_reverted = df_encoded.copy().reset_index(drop=True)

    present_columns = [name for name in row_mappings if name in df_reverted.columns]
    for name in present_columns:
        # Restore from the stored original values.
        df_reverted[name] = row_mappings[name]

    return df_reverted


def filter_low_mean_samples(X_encoded: cudf.DataFrame, y: cudf.Series, quantile_threshold: float):
    """Filter low-mean samples per class while preserving all class labels.

    For every class, rows whose feature mean is not strictly above that class's
    `quantile_threshold` quantile are dropped. A class that would lose all of
    its rows keeps its single highest-mean row.

    The result is indexed by row position in `X_encoded` (0 for the first input
    row, and so on), so callers can map surviving rows back to the input.
    Joining the thresholds resets the index and cuDF does not guarantee the row
    order of a merge, so the input order is restored explicitly.

    Args:
        X_encoded: all-numeric feature frame (categoricals already encoded).
        y: class labels, index-aligned with `X_encoded`.
        quantile_threshold: per-class quantile of the row mean used as cut-off.

    Returns:
        (X_filtered, y_filtered) with the same columns as `X_encoded`.
    """
    row_pos_col = '__row_pos__'  # temporary helper column, removed before returning

    # Step 1: Combine features and label
    combined_df = X_encoded.copy()
    combined_df[target_column] = y

    # Step 2: Compute row means once (from the features only), then tag each row with its position
    row_means = X_encoded.mean(axis=1)
    combined_df['row_mean'] = row_means
    combined_df[row_pos_col] = cp.arange(len(combined_df))

    # Step 3: Compute quantile thresholds per class
    class_thresholds = (
        combined_df[[target_column, 'row_mean']]
        .groupby(target_column)
        .quantile(q=quantile_threshold)
        .rename(columns={'row_mean': 'threshold'})
        .reset_index()
    )

    # Step 4: Join thresholds back to combined_df, then restore the input row
    # order so that the fresh index equals the input row position.
    combined_df = (
        combined_df.merge(class_thresholds, on=target_column, how='left')
        .sort_values(row_pos_col)
        .reset_index(drop=True)
    )

    # Step 5: Apply filtering
    filtered_df = combined_df[combined_df['row_mean'] > combined_df['threshold']]

    # Step 6: Ensure all classes are represented
    present_classes = set(cp.asnumpy(filtered_df[target_column].unique()))
    all_classes = set(cp.asnumpy(y.unique()))
    missing_classes = all_classes - present_classes

    for label in missing_classes:
        # Strict ">" removes every row of such a class (e.g. a single-row class); keep its best row.
        class_df = combined_df[combined_df[target_column] == label]
        best_idx = class_df['row_mean'].to_pandas().idxmax()
        best_sample = class_df.loc[[best_idx]]
        filtered_df = cudf.concat([filtered_df, best_sample])

    # Step 7: Final separation
    X_filtered = filtered_df.drop([target_column, 'row_mean', 'threshold', row_pos_col], axis=1)
    y_filtered = filtered_df[target_column]

    return X_filtered, y_filtered

In [None]:
# Apply frequency encoding to the training set
print('convert_categorical_to_frequency')
X_train_filtered_encoded, cat_map, row_map = convert_categorical_to_frequency(X_train_reduced)

# Apply per-class row filtering
print('filter_low_mean_samples')
X_train_filtered, y_train_filtered = filter_low_mean_samples(
    X_train_filtered_encoded,
    y_train_sampled,
    sample_filtering_quantile
)

assert set(X_train_filtered.columns) == set(X_train_reduced.columns)

# Revert frequency encoding
print('revert_frequency_encoding')
# Subset row_map to the surviving rows and renumber it 0..m-1, because
# revert_frequency_encoding resets the frame's index before assigning.
# Passing the unfiltered row_map (as before) assigned the categories of the
# first m training rows to the filtered rows.
# NOTE(review): this relies on X_train_filtered.index holding row positions of
# the encoded frame - confirm filter_low_mean_samples guarantees that.
filtered_row_map = {
    col: row_map[col].loc[X_train_filtered.index].reset_index(drop=True)
    for col in row_map
}
X_train_filtered = revert_frequency_encoding(X_train_filtered, filtered_row_map)
fill_categorical_nas([X_train_filtered])

assert set(X_train_filtered.columns) == set(X_train_reduced.columns)

# Print results
print(f"Original training set size: {len(X_train_reduced)} rows")
print(f"Filtered training set size: {len(X_train_filtered)} rows")

print(f"\nClass distribution before filtering:")
print(y_train_sampled.value_counts().sort_index())
print(f"\nClass distribution after filtering:")
print(y_train_filtered.value_counts().sort_index())

# Final validations
assert len(X_train_filtered) == len(y_train_filtered), "Sample count mistmatch between X and y"
assert len(X_train_filtered.columns) == len(X_train_reduced.columns), "Column count changed"
assert set(X_train_filtered.columns) == set(X_train_reduced.columns), "Column names changed"
assert y_train_filtered.nunique() == y_train_sampled.nunique(), "Class count changed"
assert y_train_filtered.nunique() == df_full[target_column].nunique(), "Some classes are missing"
assert all(y_train_filtered.to_pandas().value_counts() >= 1), "Some classes have zero samples"
assert X_train_filtered.index.is_unique, "Duplicate indices in filtered data"

In [None]:
# Train and evaluate using train_xgb function
# Only the training rows are filtered; validation and test keep the reduced,
# sampled sets so the scores stay comparable with the previous step.
clf_filtered, train_time_filtered, latency_filtered, f1_weighted_filtered, model_size_filtered = train_xgb(
    X_train_filtered, X_val_reduced, X_test_reduced,
    y_train_filtered, y_val_sampled, y_test_sampled,
    cv=False,  # Cross-validation only used during HPO
    metrics_filename=f"{output_folder}/xgb_filtered_metrics.json",
    cm_filename=f"{output_folder}/xgb_filtered_cm.csv",
    model_filename=f"{output_folder}/xgb_filtered_model.json"
)

# Print results (latency is prediction time per test row)
print(f"Training Time (filtered): {train_time_filtered:.3f} seconds")
print(f"Latency (filtered): {latency_filtered:.2e} seconds")
print(f"Weighted F1-Score (filtered): {f1_weighted_filtered:.6f}")
print(f"Model Size (filtered): {model_size_filtered} MB")

# Step 7: HPO

## Step 7.1: Numeric Scaling Wrappers

In [None]:
# Candidate scaling methods (keys understood by make_numeric_scaler);
# only 'none' is offered when the dataset has no numeric feature columns.
scaling_methods = ['none', 'maxabs', 'minmax', 'norm', 'robust', 'standard'] if numeric_columns else ['none']

In [None]:
from cuml.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer, RobustScaler, StandardScaler

def make_numeric_scaler(scaling_method, X):
    """Build a closure that scales the numeric columns of a frame.

    Args:
        scaling_method: One of 'none', 'maxabs', 'minmax', 'norm', 'robust',
            'standard'.
        X: Reference frame. Its numeric / non-numeric column split is computed
            once here and reused by every later call of the returned closure.

    Returns:
        ``scale_numeric_features(X_input, fit=False)``. With ``fit=True`` the
        scaler is fitted on ``X_input`` before transforming it. For 'none' the
        input frame is returned as-is; otherwise the result holds the scaled
        numeric columns (float32) followed by the untouched non-numeric ones.

    Raises:
        ValueError: If ``scaling_method`` is not a supported name.
    """
    supported_methods = ('none', 'maxabs', 'minmax', 'norm', 'robust', 'standard')
    if scaling_method not in supported_methods:
        # Fail here with a clear message instead of an AttributeError on a
        # None scaler the first time the closure is called.
        raise ValueError(f"Unknown scaling method: {scaling_method}")

    numeric_columns = X.select_dtypes(include=['number']).columns
    non_numeric_columns = X.columns.difference(numeric_columns)

    scaler = None
    if scaling_method != 'none':
        # Only the requested scaler is instantiated.
        scaler_classes = {
            'maxabs': MaxAbsScaler,
            'minmax': MinMaxScaler,
            'norm': Normalizer,
            'robust': RobustScaler,
            'standard': StandardScaler,
        }
        scaler = scaler_classes[scaling_method]()

    def scale_numeric_features(X_input, fit=False):
        """Scale the numeric columns of ``X_input``; fit the scaler first when ``fit`` is True."""
        if scaler is None:
            # 'none': pass the frame through without copying anything.
            return X_input

        X_numeric = X_input[numeric_columns].copy()
        X_non_numeric = X_input[non_numeric_columns].copy()

        if fit:
            scaler.fit(X_numeric)
        X_numeric_scaled = scaler.transform(X_numeric).astype('float32')

        # Restore labels and index on both parts so the concat aligns row by row.
        X_numeric_scaled.columns = list(numeric_columns)
        X_numeric_scaled.index = X_input.index
        X_non_numeric.columns = list(non_numeric_columns)
        X_non_numeric.index = X_input.index

        X_output = cudf.concat([X_numeric_scaled, X_non_numeric], axis=1)
        assert X_input.shape == X_output.shape, "Shape mismatch after concat."

        return X_output

    return scale_numeric_features

## Step 7.2: Categorical Encoding Wrappers

In [None]:
# Encoding options; collapses to just 'none' when categorical_cols (defined earlier) is empty.
encoding_methods = ['none', 'onehot', 'ordinal', 'frequency'] if categorical_cols else ['none']

In [None]:
from cuml.preprocessing import LabelEncoder as CumlLabelEncoder
from cuml.preprocessing import OneHotEncoder as CumlOneHotEncoder

# Define categorical encoding function
def make_categorical_encoder(encoding_method, X, max_categories_for_onehot=10):
    """Build a closure that encodes the categorical columns of a frame.

    Args:
        encoding_method: 'none', 'onehot', 'ordinal' or 'frequency'.
        X: Reference frame used to decide, per categorical column, whether it
            is one-hot encoded (at most ``max_categories_for_onehot`` distinct
            values) or passed through unchanged in 'onehot' mode.
        max_categories_for_onehot: Cardinality limit for one-hot encoding.

    Returns:
        ``encode_categorical_features(X_input, fit=False)``. With ``fit=True``
        the encoder state is (re)learned from ``X_input`` before encoding it;
        otherwise the previously learned state is reused. Non-categorical
        columns are returned unchanged, placed before the encoded ones.
        An unsupported ``encoding_method`` raises ``ValueError`` when the
        closure is called.
    """
    categorical_columns = X.select_dtypes(include=['category']).columns

    # Low-cardinality columns are one-hot encoded; the remaining categorical
    # columns are passed through untouched. Both lists keep the frame's column
    # order so the output layout is the same on every run.
    onehot_columns = [col for col in categorical_columns if X[col].nunique() <= max_categories_for_onehot]
    none_columns = [col for col in categorical_columns if col not in onehot_columns]

    # Only the encoder that is actually needed is created.
    onehot_encoder = None
    if encoding_method == 'onehot':
        onehot_encoder = CumlOneHotEncoder(handle_unknown='ignore', sparse=False, sparse_output=False)

    # One label encoder per column. Sharing a single encoder would leave it
    # fitted on the last column only, so transform-only calls would encode
    # every column with that column's classes.
    ordinal_encoders = {}

    # Store frequency map for reuse
    frequency_col_map, frequency_row_map = {}, {}

    def encode_categorical_features(X_input, fit=False):
        """Encodes categorical features while keeping numeric features unchanged."""
        nonlocal frequency_col_map, frequency_row_map

        if encoding_method == 'none':
            # Nothing to do: hand the frame back without copying it.
            return X_input

        _categorical_columns = X_input.select_dtypes(include=['category']).columns
        _numeric_columns = X_input.columns.difference(_categorical_columns)
        X_numeric = X_input[_numeric_columns].copy()
        X_categorical = X_input[_categorical_columns].copy()

        if encoding_method == 'onehot':
            X_encoded = X_numeric
            if onehot_columns:
                # cuML encodes all selected columns in a single call.
                if fit:
                    onehot_encoder.fit(X_categorical[onehot_columns])
                X_onehot = cudf.DataFrame(
                    data=onehot_encoder.transform(X_categorical[onehot_columns]).astype('int8'),
                    columns=onehot_encoder.get_feature_names(onehot_columns)
                )
                # Ensure index alignment
                X_onehot.index = X_numeric.index
                X_encoded = cudf.concat([X_numeric, X_onehot], axis=1)

            # High-cardinality columns are appended as they are.
            if none_columns:
                X_encoded = cudf.concat([X_encoded, X_input[none_columns]], axis=1)

            return X_encoded

        if encoding_method == 'ordinal':
            X_ordinal_partials = []
            for col in X_categorical.columns:
                if fit:
                    column_encoder = CumlLabelEncoder(handle_unknown='ignore')
                    column_encoder.fit(X_categorical[col])
                    ordinal_encoders[col] = column_encoder
                elif col not in ordinal_encoders:
                    raise ValueError(f"Ordinal encoder has not been fitted for column: {col}")
                X_ordinal = cudf.DataFrame(
                    data=ordinal_encoders[col].transform(X_categorical[col]).astype('category'),
                    columns=[col]
                )
                # Ensure index alignment
                X_ordinal.index = X_numeric.index
                X_ordinal_partials.append(X_ordinal)

            if not X_ordinal_partials:
                # No categorical columns: concatenating an empty list would fail.
                return X_numeric

            return cudf.concat([X_numeric, cudf.concat(X_ordinal_partials, axis=1)], axis=1)

        if encoding_method == 'frequency':
            if fit:
                # Fit and encode
                X_categorical_encoded, frequency_col_map, frequency_row_map = \
                    convert_categorical_to_frequency(X_categorical)
            else:
                # Transform using saved frequency map
                X_categorical_encoded = X_categorical.copy()
                for col in X_categorical.columns:
                    mapped = X_categorical[col].to_pandas().map(frequency_col_map.get(col, {}))
                    X_categorical_encoded[col] = cudf.Series(mapped, index=X_categorical.index).astype('float32')

            return cudf.concat([X_numeric, X_categorical_encoded], axis=1)

        raise ValueError(f"Unknown encoding method: {encoding_method}")

    return encode_categorical_features

## Step 7.3: Data Balancing Wrappers

In [None]:
SAFE_MODE = False

def assert_or_raise(condition: bool, message: str):
    """Raise ``ValueError(message)`` for a failed ``condition``, but only while SAFE_MODE is on."""
    if not SAFE_MODE:
        # Checks are disabled: the condition is not even inspected.
        return
    if not condition:
        raise ValueError(message)

class CumlRandomResampler:
    """Random per-class over-/under-sampler operating on cuDF frames.

    ``strategy`` maps a class label to the number of rows that class should
    have after resampling. Classes that are not in ``strategy`` are passed
    through unchanged.

    Args:
        strategy: ``{label: target_row_count}``.
        sampling_type: 'over' (add rows by sampling with replacement) or
            'under' (keep a random subset without replacement).
        random_state: Seed passed to every ``sample`` call. Defaults to 42,
            the value that used to be hard-coded.
    """

    def __init__(self, strategy: dict, sampling_type: str = 'over', random_state: int = 42):
        assert sampling_type in ['over', 'under'], "sampling_type must be 'over' or 'under'"
        self.strategy = strategy
        self.sampling_type = sampling_type
        self.random_state = random_state

    def fit_resample(self, X: cudf.DataFrame, y: cudf.Series):
        """Resample ``X``/``y`` class by class.

        Returns:
            ``(X_resampled, y_resampled)`` with a fresh 0..n-1 index. The
            returned label series is named ``'__label__'``.
        """
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        # Carry the label as a temporary column so rows and labels move together.
        df = X.copy()
        df['__label__'] = y
        n_columns = df.shape[1]

        resampled_dfs = []

        for cls in df['__label__'].to_pandas().unique().tolist():
            cls_df = df[df['__label__'] == cls]
            count = len(cls_df)

            if cls not in self.strategy:
                # Not in strategy → keep class as-is
                resampled_dfs.append(cls_df)
                continue

            if self.sampling_type == 'over':
                target_total = self.strategy[cls]
                to_add = target_total - count

                if to_add > 0:
                    extra = cls_df.sample(n=to_add, replace=True, random_state=self.random_state)
                    cls_df = cudf.concat([cls_df, extra], ignore_index=True)
                    assert_or_raise(cls_df.shape[0] == target_total,
                                    f"Over-sampling failed for class {cls}: expected {target_total}, got {cls_df.shape[0]}")
                    assert_or_raise(cls_df.shape[1] == n_columns,
                                    f"Column mismatch after over-sampling class {cls}")

                resampled_dfs.append(cls_df)

            else:  # 'under' (the constructor only allows 'over' or 'under')
                target_keep = self.strategy[cls]
                to_remove = count - target_keep

                if to_remove >= count:
                    # Skip appending — fully removed
                    print(f"[Resampler] Class {cls} will be removed entirely (requested to remove {to_remove}, only {count} available)")
                    continue

                if to_remove > 0:
                    cls_df = cls_df.sample(n=target_keep, replace=False, random_state=self.random_state)
                    assert_or_raise(cls_df.shape[0] == target_keep,
                                    f"Under-sampling failed for class {cls}: expected {target_keep}, got {cls_df.shape[0]}")
                    assert_or_raise(cls_df.shape[1] == n_columns,
                                    f"Column mismatch after under-sampling class {cls}")

                resampled_dfs.append(cls_df)

        result_df = cudf.concat(resampled_dfs, ignore_index=True)
        X_resampled = result_df.drop(columns='__label__')
        y_resampled = result_df['__label__']

        # Final checks (only enforced when SAFE_MODE is on)
        expected_total = sum(self.strategy.values())
        expected_classes = set(y.to_pandas().unique())
        actual_classes = set(y_resampled.to_pandas().unique())
        assert_or_raise(X_resampled.shape[0] == expected_total,
                        f"{self.sampling_type.capitalize()}-sampling total row count mismatch: expected {expected_total}, got {X_resampled.shape[0]}")
        assert_or_raise(y_resampled.shape[0] == expected_total,
                        f"{self.sampling_type.capitalize()}-sampling total label count mismatch: expected {expected_total}, got {y_resampled.shape[0]}")
        assert_or_raise(X_resampled.shape[1] == X.shape[1],
                        f"Feature count mismatch after resampling: expected {X.shape[1]}, got {X_resampled.shape[1]}")
        assert_or_raise(actual_classes == expected_classes,
                        f"Class mismatch after resampling: expected {expected_classes}, got {actual_classes}")

        return X_resampled, y_resampled

In [None]:
from cuml.neighbors import NearestNeighbors as CumlNearestNeighbors

In [None]:
def fit_resample(_X_train, _y_train, over_method, over_thresh, under_method, under_thresh):
    """Apply optional over-sampling followed by optional under-sampling.

    A stage runs only when its threshold is truthy, so ``0``/``None`` skip it
    while ``'auto'`` enables it. The inputs are copied, not modified.

    Returns:
        ``(X, y)`` as a cuDF DataFrame and Series carrying the original
        feature names and target name.
    """
    feature_names = _X_train.columns.tolist()
    target_name = _y_train.name

    X_work, y_work = _X_train.copy(), _y_train.copy()

    if over_thresh:
        class_counts = y_work.value_counts().to_dict()
        # At most 5 neighbours, and never more than the rarest class has rows.
        n_neighbors = min(5, min(class_counts.values()))
        over_strategy = patch_oversampling_strategy(
            class_counts,
            build_oversampling_strategy(class_counts, over_thresh),
        )
        # Positional indices of the categorical columns.
        categorical_names = X_work.select_dtypes(include=['category']).columns
        cat_features = [X_work.columns.get_loc(name) for name in categorical_names]
        over_sampler = make_over_sampler(over_method, over_strategy, n_neighbors, cat_features)
        X_work, y_work = over_sampler.fit_resample(X_work, y_work)

    if under_thresh:
        # Counts are taken again so they reflect any rows added above.
        class_counts = y_work.value_counts().to_dict()
        under_strategy = patch_undersampling_strategy(
            class_counts,
            build_undersampling_strategy(class_counts, under_thresh),
        )
        under_sampler = make_under_sampler(under_method, under_strategy)
        X_work, y_work = under_sampler.fit_resample(X_work, y_work)

    X_result = cudf.DataFrame(X_work, columns=feature_names)
    y_result = cudf.Series(y_work, name=target_name)
    return X_result, y_result

### Step 7.3a: Oversampling

In [None]:
# Only random over-sampling is enabled; 'smotenc' is currently switched off.
over_method_choices = ['random']#, 'smotenc']

In [None]:
# 17 evenly spaced thresholds from 0 to 4 (step 0.25), plus the 'auto' option.
over_threshold_choices = [float(f) for f in np.linspace(0, 4, num=17).round(2)] + ['auto']

In [None]:
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler

def build_oversampling_strategy(value_counts, threshold):
    """Compute how many rows to generate for each class.

    The reference size is the row count every class would have if the data
    were perfectly balanced (total rows / number of classes, truncated).
    Classes at or above that size get 0.

    Args:
        value_counts: ``{label: row_count}``.
        threshold: ``"auto"`` to fill each small class up to the balanced
            size, or a number limiting the added rows to
            ``row_count * threshold`` (never beyond the balanced size).

    Returns:
        ``{label: rows_to_generate}`` in the key order of ``value_counts``.
    """
    total_rows = sum(value_counts.values())
    balanced_size = int(total_rows / len(value_counts.keys()))

    n_generate = {}
    for label, occurrences in value_counts.items():
        if occurrences >= balanced_size:
            n_generate[label] = 0
            continue

        deficit = balanced_size - occurrences
        if threshold == "auto":
            n_generate[label] = deficit
        else:
            n_generate[label] = int(min(occurrences * threshold, deficit))
    return n_generate

def patch_oversampling_strategy(value_counts, n_generate):
    """Turn per-class "rows to add" into per-class target totals (current count + rows to add)."""
    target_totals = {}
    for label in value_counts.keys():
        target_totals[label] = value_counts[label] + n_generate[label]
    return target_totals

def make_over_sampler(over_method, over_strategy, n_neighbors, cat_features=None):
    """Create the over-sampler for ``over_method`` ('random', 'smote' or 'smotenc').

    ``n_neighbors`` and ``cat_features`` are only used by the SMOTE variants.

    Raises:
        ValueError: For any other method name.
    """
    if over_method not in ("random", "smote", "smotenc"):
        raise ValueError(f"Unknown oversampling method: {over_method}")

    if over_method == "random":
        return CumlRandomResampler(strategy=over_strategy, sampling_type="over")

    # Both SMOTE variants use a cuML nearest-neighbour search.
    neighbor_search = CumlNearestNeighbors(n_neighbors=n_neighbors)
    if over_method == "smote":
        return SMOTE(k_neighbors=neighbor_search, sampling_strategy=over_strategy)
    return SMOTENC(categorical_features=cat_features, k_neighbors=neighbor_search, sampling_strategy=over_strategy)

### Step 7.3b: Undersampling

In [None]:
# Only random under-sampling is enabled; 'tomek' is currently switched off.
under_method_choices = ['random']#, 'tomek']

In [None]:
# 20 evenly spaced thresholds from 0 to 0.95 (step 0.05), plus the 'auto' option.
under_threshold_choices = [float(f) for f in np.linspace(0, 0.95, num=20).round(2)] + ['auto']

In [None]:
from imblearn.under_sampling import TomekLinks as TomekLinksImblearn
from sklearn.utils import _safe_indexing  # Needed for compatibility

class TomekLinksCUDA(TomekLinksImblearn):
    """Tomek-links under-sampler that runs the neighbour search with cuML.

    Only ``fit_resample`` is overridden; the link test itself is delegated to
    the inherited ``is_tomek``.
    """

    def fit_resample(self, X, y):
        """Return ``(X, y)`` without the samples that ``is_tomek`` flags.

        NOTE(review): this override reads ``self.sampling_strategy_`` but never
        sets it and does not call the parent ``fit_resample`` — confirm the
        attribute is populated before this method runs.
        """
        nn = CumlNearestNeighbors(n_neighbors=2)
        nn.fit(X)
        # Column 1 of the 2-neighbour result is used as each sample's neighbour
        # (presumably column 0 is the sample itself — TODO confirm).
        nns = nn.kneighbors(X, return_distance=False)[:, 1]

        links = self.is_tomek(y, nns, self.sampling_strategy_)
        # Keep the positions that are NOT flagged.
        self.sample_indices_ = np.flatnonzero(np.logical_not(links))

        return (
            _safe_indexing(X, self.sample_indices_),
            _safe_indexing(y, self.sample_indices_),
        )

In [None]:
from imblearn.under_sampling import RandomUnderSampler

def build_undersampling_strategy(value_counts, threshold):
    """Compute how many rows to remove from each class.

    The reference size is the row count every class would have if the data
    were perfectly balanced (total rows / number of classes, truncated).
    Classes at or below that size get 0.

    Args:
        value_counts: ``{label: row_count}``.
        threshold: ``"auto"`` to cut each large class down to the balanced
            size, or a number limiting the removed rows to
            ``row_count * threshold`` (never below the balanced size).

    Returns:
        ``{label: rows_to_remove}`` in the key order of ``value_counts``.
    """
    total_rows = sum(value_counts.values())
    balanced_size = int(total_rows / len(value_counts.keys()))

    n_remove = {}
    for label, occurrences in value_counts.items():
        if occurrences <= balanced_size:
            n_remove[label] = 0
            continue

        surplus = occurrences - balanced_size
        if threshold == "auto":
            n_remove[label] = surplus
        else:
            n_remove[label] = int(min(occurrences * threshold, surplus))
    return n_remove

def patch_undersampling_strategy(value_counts, n_remove):
    """Turn per-class "rows to remove" into per-class rows to keep (current count - rows to remove)."""
    rows_to_keep = {}
    for label in value_counts.keys():
        rows_to_keep[label] = value_counts[label] - n_remove[label]
    return rows_to_keep

def make_under_sampler(under_method, under_strategy):
    """Create the under-sampler for ``under_method`` ('random' or 'tomek').

    ``under_strategy`` is only used by the random sampler; the Tomek sampler
    takes the notebook-level ``n_jobs`` setting instead.

    Raises:
        ValueError: For any other method name.
    """
    if under_method == "random":
        sampler = CumlRandomResampler(strategy=under_strategy, sampling_type="under")
    elif under_method == "tomek":
        sampler = TomekLinksCUDA(n_jobs=n_jobs)
    else:
        raise ValueError(f"Unknown undersampling method: {under_method}")
    return sampler

## Step 7.4: Study Preparation

In [None]:
# Wrap the labels in a cuDF Series before the checks below.
y_train_filtered = cudf.Series(y_train_filtered)

# If the rarest class has fewer rows than min_samples_per_class, rebuild the
# training data through ensure_min_samples_per_class (defined outside this section).
if y_train_filtered.value_counts().min() < min_samples_per_class:
    # Both indices are reset so the column-wise concat pairs rows by position.
    df_temp = cudf.concat([
        X_train_filtered.reset_index(drop=True),
        y_train_filtered.reset_index(drop=True)
        ], axis='columns')
    df_temp = ensure_min_samples_per_class(df_temp, target_column, min_samples_per_class, random_state)
    X_train_filtered, y_train_filtered = df_temp.drop(columns=[target_column]), df_temp[target_column]
    assert y_train_filtered.value_counts().min() >= min_samples_per_class

In [None]:
# The resampling helpers and the refit step rely on cuDF objects.
assert isinstance(X_train_filtered, cudf.DataFrame)
assert isinstance(y_train_filtered, cudf.Series)

## Step 7.5: Study Execution

This part is skipped because the Pareto-front configurations are refitted from the logs of a previous HPO run.

In [None]:
from pathlib import Path
import pandas as pd

# Columns of the HPO trial log that make up one configuration to refit:
# the balancing settings followed by the booster parameters.
hyperparam_keys = [
    'balancing_mode', 'over_method', 'over_threshold',
    'under_method', 'under_threshold', 'eta', 'max_depth',
    'min_child_weight', 'subsample', 'colsample_bytree',
    'lambda', 'alpha'
]

def is_pareto_efficient(df, score_col='f1_score', latency_col='latency'):
    """Flag the rows on the Pareto front of (higher score, lower latency).

    A row is removed when another row has a score that is at least as high
    AND a latency that is at least as low. Of several identical rows only the
    first one is kept. Rows with a missing score or latency are never
    considered efficient.

    Args:
        df: Frame holding one trial per row.
        score_col: Column to maximise.
        latency_col: Column to minimise.

    Returns:
        Boolean numpy array with one entry per row of ``df``.
    """
    data = df[[score_col, latency_col]].to_numpy(dtype=float)
    scores, latencies = data[:, 0], data[:, 1]

    # A row without both metrics cannot be compared, so it starts out excluded.
    is_efficient = ~np.isnan(data).any(axis=1)

    for i in range(data.shape[0]):
        if not is_efficient[i]:
            continue
        # Rows that are no better than row i on either axis are dominated by it.
        # (The comparison must point this way: marking rows with a higher score
        # and lower latency would discard the best trials instead.)
        is_dominated = (scores <= scores[i]) & (latencies >= latencies[i])
        is_dominated[i] = False
        is_efficient[is_dominated] = False
    return is_efficient

# The previous HPO run wrote its results to a per-dataset folder derived from the input path.
dataset_path_refit = dataset_path.replace('/Input_Multiclass/', '/Output_Multiclass_3600/').replace('.parquet', '/')

print(dataset_path_refit)

# Built outside the try block so the error message below can always refer to it.
hpo_trials_df_filename = os.path.join(dataset_path_refit, 'xgb_hpo_trials.xlsx')

# Stays empty when the trial log cannot be loaded.
hpo_configs_to_refit = []

try:

    print(hpo_trials_df_filename)

    hpo_trials_df = pd.read_excel(hpo_trials_df_filename)

    for col in hpo_trials_df.columns:
        try:
            if col in ['max_depth', 'min_child_weight']:
                hpo_trials_df[col] = hpo_trials_df[col].astype(int)
            else:
                # Try to replace ',' with '.' and convert to float
                hpo_trials_df[col] = hpo_trials_df[col].astype(str).str.replace(',', '.').astype(float)
        except ValueError:
            # If conversion fails (e.g., for categorical strings), skip the column
            continue

    # Get boolean mask for Pareto front
    hpo_pareto_mask = is_pareto_efficient(hpo_trials_df)

    # Extract Pareto-optimal rows
    hpo_pareto_df = hpo_trials_df[hpo_pareto_mask].reset_index(drop=True)

    # One dict per Pareto-optimal trial, restricted to the tunable settings.
    hpo_configs_to_refit = hpo_pareto_df[hyperparam_keys].to_dict(orient='records')

    print(hpo_trials_df_filename, type(hpo_configs_to_refit), len(hpo_configs_to_refit))

except Exception as e:
    # Best effort: report the problem and continue with whatever was collected.
    print(f'Error processing folder {hpo_trials_df_filename}. Reason: {str(e)}')

In [None]:
from pprint import pprint

# Show the configurations selected for refitting.
pprint(hpo_configs_to_refit, indent=2)

# Step 8: Pipeline Reconstruction and Refit

In [None]:
from copy import deepcopy

def reconstruct_and_evaluate_pareto_model(best_params, pareto_idx):
    """Rebuild the training data for one configuration, refit and evaluate it.

    Args:
        best_params: Configuration dict containing ``balancing_mode``
            ('none', 'resampling' or 'weighting'), optionally the four
            resampling settings, and the booster parameters. Not modified.
        pareto_idx: Label used in the names of the output files.

    Returns:
        ``(X_train_final, model_size, train_time, latency, f1_weighted, best_params)``.

    Raises:
        ValueError: If ``balancing_mode`` is not one of the supported values.
        KeyError: If ``balancing_mode`` is 'resampling' and a resampling
            setting is missing.
    """
    import os

    # Deep copy to avoid modifying original dict
    best_params_copy = deepcopy(best_params)

    # Extract balancing strategy
    balancing_mode = best_params_copy.pop("balancing_mode")
    sample_weights = None  # Default is no weighting

    # The resampling settings are removed in every mode: they are not booster
    # parameters, and configurations loaded from the trial log carry these
    # keys even when the mode is 'none' or 'weighting'.
    resampling_keys = ("over_method", "over_threshold", "under_method", "under_threshold")
    resampling_params = {
        key: best_params_copy.pop(key)
        for key in resampling_keys
        if key in best_params_copy
    }

    # Handle each balancing mode

    if balancing_mode == "none":
        # No resampling or weighting
        X_train_final, y_train_final = X_train_filtered, y_train_filtered

    elif balancing_mode == "resampling":
        # Perform resampling
        X_train_final, y_train_final = fit_resample(
            X_train_filtered, y_train_filtered,
            resampling_params["over_method"], resampling_params["over_threshold"],
            resampling_params["under_method"], resampling_params["under_threshold"]
        )

    elif balancing_mode == "weighting":
        # No resampling; just apply inverse frequency weighting
        X_train_final, y_train_final = X_train_filtered, y_train_filtered

        class_counts = y_train_final.value_counts()
        weight_map = 1.0 / class_counts
        sample_weights = y_train_final.map(weight_map)

    else:
        raise ValueError(f"Invalid balancing_mode: {balancing_mode}")

    # Make sure the output sub-folder exists before anything is written to it.
    os.makedirs(f"{output_folder}/pareto_trials", exist_ok=True)

    # Train final model on full training set
    model_refit, train_time_refit, latency_refit, f1_weighted_refit, model_size_refit = train_xgb(
        X_train_final, X_val_reduced, X_test_reduced,
        y_train_final, y_val_sampled, y_test_sampled,
        sample_weights=sample_weights,
        cv=False,  # Final training, no CV
        custom_booster_params=best_params_copy,
        metrics_filename=f"{output_folder}/pareto_trials/xgb_hpo_pareto_{pareto_idx}_metrics.json",
        cm_filename=f"{output_folder}/pareto_trials/xgb_hpo_pareto_{pareto_idx}_cm.csv",
        model_filename=f"{output_folder}/pareto_trials/xgb_hpo_pareto_{pareto_idx}_model.json"
    )

    # Report
    print(f"Pareto Index = {pareto_idx} => Training Time: {train_time_refit:.3f} seconds")
    print(f"Pareto Index = {pareto_idx} => Latency: {latency_refit:.2e} seconds")
    print(f"Pareto Index = {pareto_idx} => Weighted F1-Score: {f1_weighted_refit:.6f}")
    print(f"Pareto Index = {pareto_idx} => Model Size: {model_size_refit} MB")

    return X_train_final, model_size_refit, train_time_refit, latency_refit, f1_weighted_refit, best_params

# Step 9: Pareto Front Evaluations (with Sampling, Feature Selection, Row Filtering, and HPO)

In [None]:
# Upper bound on the number of Pareto configurations that are refitted.
# Set to None to refit the whole front.
max_refit_configs = 3

refit_trials = []

# Indices are zero-padded to a common width so the output files sort correctly.
pareto_idx_width = len(str(len(hpo_configs_to_refit)))

for pareto_idx, curr_params in enumerate(hpo_configs_to_refit[:max_refit_configs]):

    pareto_idx_str = str(pareto_idx).zfill(pareto_idx_width)

    print(pareto_idx_str, curr_params)

    X_train_resampled_refit, model_size_refit, train_time_refit, latency_refit, f1_weighted_refit, best_params_refit = \
        reconstruct_and_evaluate_pareto_model(curr_params, pareto_idx_str)

    # One record per refit: the configuration and the metrics it achieved.
    refit_trials.append({
        'pareto_index': pareto_idx,
        'params': curr_params,
        'results': {
            'model_size': model_size_refit,
            'training_time': train_time_refit,
            'latency': latency_refit,
            'f1_weighted': f1_weighted_refit
        }
    })

In [None]:
# One row per refitted configuration (indexed by pareto_index), one column per metric.
refit_stats_df = pd.DataFrame.from_dict({x['pareto_index']: x['results'] for x in refit_trials}, orient='index')

refit_stats_df

In [None]:
# Summary statistics per metric, as a nested dict {metric: {statistic: value}}.
refit_stats = refit_stats_df.agg(['min', 'max', 'mean', 'std']).to_dict()

pprint(refit_stats, indent=2)

In [None]:
# Summary file location, next to the per-configuration outputs.
refit_results_filename = os.path.join(output_folder, 'pareto_trials', f'summary.json')

# Create the target folder if it does not exist yet.
os.makedirs(Path(refit_results_filename).parent, exist_ok=True)

# Everything that is written to the summary: the individual trials and their aggregate statistics.
refit_dict = {
    'trials': refit_trials,
    'stats': refit_stats
}

pprint(refit_dict, indent=2)

In [None]:
# Persist the refit summary as JSON.
# NOTE(review): NaN values (e.g. a std over a single trial) would be written as the non-standard token NaN — confirm consumers accept that.
with open(refit_results_filename, 'w', encoding='utf-8') as f:
    json.dump(refit_dict, f, indent=2)

In [None]:
# Wall-clock time elapsed since global_start was recorded at the top of the notebook.
print(f"Global Time: {time.time() - global_start:.3f} seconds")