In [None]:
# Standard library
import json
import os
import random
import time

# Third-party libraries
import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

# Scikit-learn - core modules
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight

# Scikit-learn - metrics
from sklearn.metrics import (
    accuracy_score, average_precision_score, balanced_accuracy_score,
    ConfusionMatrixDisplay, f1_score, log_loss,
    matthews_corrcoef, mean_squared_error, precision_score,
    PrecisionRecallDisplay, r2_score, recall_score, roc_auc_score, RocCurveDisplay
)

# Local application/library imports
from utils import load_search_space, get_model


## DATASET

In [None]:
# Global seed, passed as `seed=SEED` to the preprocessing helpers and as `SEED=SEED` during tuning
SEED = 64

# Seed the PyTorch, Python and NumPy global random number generators with the same value
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Dataset selection: regression task on the 'boston' dataset.
# NOTE(review): two later cells assign these same three variables, so when the
# notebook is run top to bottom only the last of those cells takes effect.
dataset_name = 'boston'
dataset_subpath = 'Regression/boston'
task_type = 'Regression'

In [None]:
# Dataset selection: multiclass task on the 'connect-4' dataset.
# NOTE(review): the neighbouring cells assign these same three variables; run
# exactly one of the selection cells (the last one executed wins).
dataset_name = 'connect-4'
dataset_subpath = 'Multiclass/connect-4'
task_type = 'Multiclass'

In [None]:
# Dataset selection: binary task on the 'nomao' dataset.
# NOTE(review): this is the last of three cells assigning these variables, so on
# a top-to-bottom run it overrides the two selections made before it.
dataset_name = 'nomao'
dataset_subpath = 'Binary/nomao'
task_type = 'Binary'

In [None]:
# Load the selected dataset from ./data/<dataset_subpath>/<dataset_name>.csv
df = pd.read_csv(f"./data/{dataset_subpath}/{dataset_name}.csv")

In [None]:
# Display (rows, columns) of the loaded data
df.shape

In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# Flag datasets with more than 20,000 rows; the tuning cells use this flag to
# decide whether the hyperparameter search runs on a reduced training subset.
reduce = len(df) > 20000

## LOAD AND PREPROCESS

In [None]:
def prepare_target_tensor(y, task):
    """
    Convert a target vector into a tensor suited to the given task.

    Parameters
    ----------
    y : pd.Series, list, or anything accepted by ``torch.as_tensor``
        Target values. Series and lists are first converted to NumPy arrays.
    task : str
        Task name, compared case-insensitively: "regression", "binary" or "multiclass".

    Returns
    -------
    torch.Tensor
        float32 tensor of shape (n, 1) for regression/binary targets,
        long tensor for multiclass targets.

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported names.
    """
    task = task.lower()

    # Normalise pandas / list inputs to NumPy; everything else is passed through
    if isinstance(y, pd.Series):
        values = y.to_numpy()
    elif isinstance(y, list):
        values = np.array(y)
    else:
        values = y

    if task == "multiclass":
        return torch.as_tensor(values, dtype=torch.long)
    if task in ("regression", "binary"):
        # Column vector so the target lines up with a single-output model head
        return torch.as_tensor(values, dtype=torch.float32).reshape(-1, 1)
    raise ValueError(f"Unsupported task type: {task}")

In [None]:
def preprocess_data(df, dataset_name, task_type, model_type="default", seed=42):
    """
    Split a DataFrame into train/val/test (70/15/15) and encode its features.

    Column lists and encoding choices are read from
    ``./configs/preprocess/{dataset_name}.json`` (keys ``categorical_cols``,
    ``numerical_cols`` and ``encoding``). The target is the last column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Full dataset, target in the last column.
    dataset_name : str
        Name of the JSON preprocessing config to load.
    task_type : str
        "regression" gives plain random splits; any other value gives
        stratified splits. "binary" / "multiclass" also compute class weights.
    model_type : str
        "catboost" and "lightgbm" skip scaling/one-hot and only cast the
        categorical columns (to ``str`` and ``category`` respectively); any
        other value goes through the scaler / one-hot path.
    seed : int
        Random state for both splits.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, le, class_weight)``.
        ``categorical_cols`` is the config list for catboost/lightgbm and ``None``
        otherwise; ``le`` is the fitted ``LabelEncoder`` or ``None``;
        ``class_weight`` is a ``{int label: weight}`` dict or ``None``.
    """
    task_type = task_type.lower()
    model_type = model_type.lower()

    # Load config
    with open(f"./configs/preprocess/{dataset_name}.json") as f:
        config = json.load(f)

    categorical_cols = config["categorical_cols"]
    numerical_cols = config["numerical_cols"]
    encoding = config["encoding"]

    # Extract features and target
    X = df[numerical_cols + categorical_cols].copy()
    y = df.iloc[:, -1].copy()

    # Encode target if needed
    le = None
    label_mapping = None
    if encoding.get("target") == "label":
        le = LabelEncoder()
        y = le.fit_transform(y)
        label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

    # Split raw data before any transformer is fitted, so nothing leaks from val/test.
    # Regression uses plain random splits; every other task is stratified on the target.
    stratified = task_type != "regression"
    X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=seed,
        stratify=y if stratified else None,
    )
    X_val_raw, X_test_raw, y_val, y_test = train_test_split(
        X_temp_raw, y_temp, test_size=0.5, random_state=seed,
        stratify=y_temp if stratified else None,
    )

    # Ensure y_* are Series with index matching the X_*
    y_train = pd.Series(y_train, index=X_train_raw.index)
    y_val = pd.Series(y_val, index=X_val_raw.index)
    y_test = pd.Series(y_test, index=X_test_raw.index)

    # Compute class weights for classification
    class_weight = None
    if task_type in ["binary", "multiclass"]:
        classes = np.unique(y_train)
        class_weight_values = compute_class_weight("balanced", classes=classes, y=y_train)
        # Keys are converted to native Python int (not numpy integers)
        class_weight = dict(zip([int(key) for key in classes], class_weight_values))
        print(f"Class weights: {class_weight}")

    def _report(X_tr, X_va, X_te):
        # Shared console summary for every return path
        print(f"Shapes — Train: {X_tr.shape}, Val: {X_va.shape}, Test: {X_te.shape}")
        print(f"Numerical features: {len(numerical_cols)} — {numerical_cols}")
        print(f"Categorical features: {len(categorical_cols)} — {categorical_cols}")
        print(f"Total features: {X_tr.shape[1]}")
        if label_mapping:
            print(f"Target label mapping: {label_mapping}")

    # Models with native categorical handling: no scaling / one-hot, only a dtype cast
    native_cat_dtype = {"catboost": str, "lightgbm": "category"}.get(model_type)
    if native_cat_dtype is not None:
        for col in categorical_cols:
            X_train_raw[col] = X_train_raw[col].astype(native_cat_dtype)
            X_val_raw[col] = X_val_raw[col].astype(native_cat_dtype)
            X_test_raw[col] = X_test_raw[col].astype(native_cat_dtype)
        _report(X_train_raw, X_val_raw, X_test_raw)
        return (
            X_train_raw, X_val_raw, X_test_raw,
            y_train, y_val, y_test,
            categorical_cols, le, class_weight
        )

    # Transform numerical and categorical features
    scaler_cls = {"minmax": MinMaxScaler, "standard": StandardScaler}.get(encoding["numerical_features"])
    use_onehot = bool(categorical_cols) and encoding["categorical_features"] == "onehot"

    if scaler_cls is not None or use_onehot:
        # ColumnTransformer drops every column not assigned to a transformer, so a
        # group without an encoder is passed through explicitly. This keeps the
        # output columns in step with the feature-name list built below.
        transformers = []
        if scaler_cls is not None:
            transformers.append(("num", scaler_cls(), numerical_cols))
        elif numerical_cols:
            transformers.append(("num", "passthrough", numerical_cols))
        if use_onehot:
            transformers.append(("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols))
        elif categorical_cols:
            transformers.append(("cat", "passthrough", categorical_cols))

        preprocessor = ColumnTransformer(transformers=transformers)
        train_values = preprocessor.fit_transform(X_train_raw)
        val_values = preprocessor.transform(X_val_raw)
        test_values = preprocessor.transform(X_test_raw)

        # Recover transformed column names (output order is: numerical group, then categorical group)
        if use_onehot:
            cat_feature_names = list(
                preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)
            )
        else:
            cat_feature_names = list(categorical_cols)
        all_feature_names = numerical_cols + cat_feature_names

        # infer_objects() restores numeric dtypes if passed-through non-numeric
        # columns forced the stacked array to dtype=object; otherwise it is a no-op.
        X_train = pd.DataFrame(train_values, columns=all_feature_names, index=X_train_raw.index).infer_objects()
        X_val = pd.DataFrame(val_values, columns=all_feature_names, index=X_val_raw.index).infer_objects()
        X_test = pd.DataFrame(test_values, columns=all_feature_names, index=X_test_raw.index).infer_objects()
    else:
        # Nothing to fit: keep the raw values in numerical + categorical order
        all_feature_names = numerical_cols + categorical_cols
        X_train = pd.DataFrame(X_train_raw, columns=all_feature_names, index=X_train_raw.index)
        X_val = pd.DataFrame(X_val_raw, columns=all_feature_names, index=X_val_raw.index)
        X_test = pd.DataFrame(X_test_raw, columns=all_feature_names, index=X_test_raw.index)

    _report(X_train, X_val, X_test)

    return (
        X_train, X_val, X_test,
        y_train, y_val, y_test,
        None, le, class_weight
    )


In [None]:
def load_and_preprocess_data_deep(df, dataset_name, task_type, seed=42, batch_size=32, device='cpu'):
    """
    Split, encode and wrap a DataFrame into PyTorch DataLoaders (70/15/15).

    Column lists and encoding choices are read from
    ``./configs/preprocess/{dataset_name}.json``. The target is the last column
    of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Full dataset, target in the last column.
    dataset_name : str
        Name of the JSON preprocessing config to load.
    task_type : str
        "regression" gives plain random splits; any other value gives
        stratified splits. "binary" / "multiclass" also compute loss weights.
    seed : int
        Random state for both splits.
    batch_size : int
        Batch size of all three loaders.
    device : str or torch.device
        Device the class-weight tensor is moved to (the data itself stays on CPU).

    Returns
    -------
    tuple
        ``(train_loader, val_loader, test_loader, attributes, le, class_weight)``
        where ``attributes`` is the number of encoded feature columns, ``le`` is
        the fitted ``LabelEncoder`` or ``None``, and ``class_weight`` is ``None``
        (regression), a scalar ``pos_weight`` tensor (binary) or a per-class
        weight tensor (multiclass).

    Raises
    ------
    ValueError
        For a binary task whose training targets do not contain exactly two classes.
    """
    task_type = task_type.lower()

    # Load config
    with open(f"./configs/preprocess/{dataset_name}.json") as f:
        config = json.load(f)

    categorical_cols = config["categorical_cols"]
    numerical_cols = config["numerical_cols"]
    encoding = config["encoding"]

    # Extract features and target
    X = df[numerical_cols + categorical_cols].copy()
    y = df.iloc[:, -1].copy()

    # Encode target if needed
    le = None
    label_mapping = None
    if encoding.get("target") == "label":
        le = LabelEncoder()
        y = le.fit_transform(y)
        label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

    # Split raw data before any transformer is fitted, so nothing leaks from val/test.
    # Regression uses plain random splits; every other task is stratified on the target.
    stratified = task_type != "regression"
    X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=seed,
        stratify=y if stratified else None,
    )
    X_val_raw, X_test_raw, y_val, y_test = train_test_split(
        X_temp_raw, y_temp, test_size=0.5, random_state=seed,
        stratify=y_temp if stratified else None,
    )

    # Compute class weights for classification
    class_weight = None
    if task_type in ["binary", "multiclass"]:
        classes = np.unique(y_train)  # np.unique already returns the labels sorted
        class_weight_values = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

        if task_type == "binary":
            if len(classes) != 2:
                raise ValueError(
                    f"Binary task expects exactly 2 classes in the training targets, found {len(classes)}"
                )
            # pos_weight = weight of the positive (larger) label / weight of the negative label
            pos_weight = class_weight_values[1] / class_weight_values[0]
            class_weight = torch.tensor(pos_weight, dtype=torch.float32).to(device)
            print(f"Binary pos_weight (for BCEWithLogitsLoss): {class_weight.item()}")

        elif task_type == "multiclass":
            class_weight = torch.tensor(class_weight_values, dtype=torch.float32).to(device)
            print(f"Multiclass class weights (for CrossEntropyLoss): {class_weight.tolist()}")

    # Transform numerical and categorical features
    scaler_cls = {"minmax": MinMaxScaler, "standard": StandardScaler}.get(encoding["numerical_features"])
    use_onehot = bool(categorical_cols) and encoding["categorical_features"] == "onehot"

    if scaler_cls is not None or use_onehot:
        # ColumnTransformer drops every column not assigned to a transformer, so a
        # group without an encoder is passed through explicitly. This keeps the
        # output columns in step with the feature-name list built below.
        transformers = []
        if scaler_cls is not None:
            transformers.append(("num", scaler_cls(), numerical_cols))
        elif numerical_cols:
            transformers.append(("num", "passthrough", numerical_cols))
        if use_onehot:
            transformers.append(("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols))
        elif categorical_cols:
            transformers.append(("cat", "passthrough", categorical_cols))

        preprocessor = ColumnTransformer(transformers=transformers)
        train_values = preprocessor.fit_transform(X_train_raw)
        val_values = preprocessor.transform(X_val_raw)
        test_values = preprocessor.transform(X_test_raw)

        # Recover transformed column names (output order is: numerical group, then categorical group)
        if use_onehot:
            cat_feature_names = list(
                preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)
            )
        else:
            cat_feature_names = list(categorical_cols)
        all_feature_names = numerical_cols + cat_feature_names

        # infer_objects() restores numeric dtypes if passed-through non-numeric
        # columns forced the stacked array to dtype=object; otherwise it is a no-op.
        X_train_num = pd.DataFrame(train_values, columns=all_feature_names, index=X_train_raw.index).infer_objects()
        X_val_num = pd.DataFrame(val_values, columns=all_feature_names, index=X_val_raw.index).infer_objects()
        X_test_num = pd.DataFrame(test_values, columns=all_feature_names, index=X_test_raw.index).infer_objects()
    else:
        # Nothing to fit: keep the raw values in numerical + categorical order
        all_feature_names = numerical_cols + categorical_cols
        X_train_num = pd.DataFrame(X_train_raw, columns=all_feature_names, index=X_train_raw.index)
        X_val_num = pd.DataFrame(X_val_raw, columns=all_feature_names, index=X_val_raw.index)
        X_test_num = pd.DataFrame(X_test_raw, columns=all_feature_names, index=X_test_raw.index)

    print(f"Shapes — Train: {X_train_num.shape}, Val: {X_val_num.shape}, Test: {X_test_num.shape}")
    print(f"Numerical features: {len(numerical_cols)} — {numerical_cols}")
    print(f"Categorical features: {len(categorical_cols)} — {categorical_cols}")
    print(f"Total features: {X_train_num.shape[1]}")
    if label_mapping:
        print(f"Target label mapping: {label_mapping}")

    attributes = len(X_train_num.columns)
    print("Attributes: ", attributes)

    # Convert data to PyTorch tensors (features must be numeric at this point)
    X_train_num_tensor = torch.as_tensor(X_train_num.values, dtype=torch.float32)
    X_val_num_tensor = torch.as_tensor(X_val_num.values, dtype=torch.float32)
    X_test_num_tensor = torch.as_tensor(X_test_num.values, dtype=torch.float32)
    y_train_tensor = prepare_target_tensor(y_train, task_type)
    y_val_tensor = prepare_target_tensor(y_val, task_type)
    y_test_tensor = prepare_target_tensor(y_test, task_type)

    # Create DataLoaders
    train_dataset = TensorDataset(X_train_num_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_num_tensor, y_val_tensor)
    test_dataset = TensorDataset(X_test_num_tensor, y_test_tensor)

    # Pinned host memory only helps host-to-GPU copies; requesting it without CUDA just triggers a warning
    pin_memory = torch.cuda.is_available()
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=pin_memory)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

    return train_loader, val_loader, test_loader, attributes,  le, class_weight

## COMPILE AND FIT

In [None]:
def objective(trial, name, task_type, 
              X_train, y_train, 
              X_val, y_val, 
              metric_name, categorical_cols = None, num_classes=None, SEED=42, device='cuda', save_dir=None, class_weight=None):
    """
    Optuna objective: fit one candidate model and score it on the validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Current trial; handed to ``load_search_space`` to obtain the parameters.
    name : str
        Model name. "catboost" receives ``cat_features`` in its parameters and
        "lightgbm" receives ``categorical_feature`` at fit time.
    task_type, num_classes, SEED, device, class_weight
        Forwarded unchanged to ``get_model``.
    X_train, y_train, X_val, y_val
        Training and validation data.
    metric_name : str
        Case-insensitive: "f1" (macro), "accuracy", "mse", "rmse" or "auc".
    categorical_cols : list or None
        Categorical columns for catboost / lightgbm.
    save_dir : str or None
        When given, one line per trial is appended to
        ``<save_dir>/<name>/optuna/optuna_trials_log.txt``; ``None`` disables logging.

    Returns
    -------
    float
        Validation score for the requested metric.

    Raises
    ------
    ValueError
        If ``metric_name`` is unsupported, or AUC is requested for a model
        without ``predict_proba``.
    """
    # Validate the metric before training: an unknown name used to surface only
    # after a full fit, as a NameError on the undefined score.
    metric_labels = {"f1": "F1", "accuracy": "Accuracy", "mse": "MSE", "rmse": "RMSE", "auc": "AUC"}
    metric_key = metric_name.lower()
    if metric_key not in metric_labels:
        raise ValueError(
            f"Unsupported metric: {metric_name!r}. Expected one of {sorted(metric_labels)}."
        )

    params = load_search_space(name, trial)

    if name == "catboost":
        params["cat_features"] = categorical_cols

    model = get_model(name, params, task_type, num_classes, SEED, device, class_weight=class_weight)

    if name == "lightgbm":
        model.fit(X_train, y_train, categorical_feature=categorical_cols)
    else:
        model.fit(X_train, y_train)

    if metric_key == "auc":
        # AUC is computed from class probabilities, not hard predictions
        if not hasattr(model, "predict_proba"):
            raise ValueError("Model does not support predict_proba, required for AUC.")
        y_proba = model.predict_proba(X_val)
        if y_proba.shape[1] == 2:
            # Binary classification: score the positive-class column
            score = roc_auc_score(y_val, y_proba[:, 1])
        else:
            # Multiclass: macro-averaged one-vs-rest
            score = roc_auc_score(y_val, y_proba, multi_class="ovr", average="macro")
    else:
        y_pred = model.predict(X_val)
        if metric_key == "f1":
            score = f1_score(y_val, y_pred, average='macro')
        elif metric_key == "accuracy":
            score = accuracy_score(y_val, y_pred)
        elif metric_key == "mse":
            score = mean_squared_error(y_val, y_pred)
        else:  # "rmse"
            score = np.sqrt(mean_squared_error(y_val, y_pred))

    # Append the trial to the log file (skipped when no save_dir was given)
    if save_dir is not None:
        log_dir = os.path.join(save_dir, name, "optuna")
        os.makedirs(log_dir, exist_ok=True)
        with open(os.path.join(log_dir, "optuna_trials_log.txt"), "a") as f:
            f.write(f"Trial {trial.number} - VAL-{metric_labels[metric_key]}: {score:.4f}, Params: {params}\n")
            f.write("=" * 60 + "\n")

    return score

In [None]:
def evaluate_best_model(study, name, task_type,
                        X_train, y_train,
                        X_val, y_val,
                        X_test, y_test,
                        categorical_cols=None, num_classes=None, SEED=42, device='cuda',
                        save_dir=None, class_weight=None):
    """
    Refit the study's best configuration and score it on train, val and test.

    The model is built with ``get_model`` from ``study.best_params`` (plus
    ``cat_features`` for catboost), fitted on the training split, evaluated on
    all three splits, and then saved together with its metrics and parameters
    under ``<save_dir>/<name>/best_model``.

    Returns
    -------
    dict
        ``training_time_s`` plus per-split metrics keyed ``<split>_<metric>``:
        mse / rmse / r2 for regression; inference time, accuracy, f1, recall and
        precision for classification, with roc_auc / avg_precision / log_loss
        (when the model has ``predict_proba``) and mcc added for binary tasks.
    """
    best_params = study.best_params
    if name == "catboost":
        best_params["cat_features"] = categorical_cols

    model = get_model(name, best_params, task_type, num_classes, SEED, device, class_weight=class_weight)

    save_path = os.path.join(save_dir, name, "best_model")
    os.makedirs(save_path, exist_ok=True)

    # Train (only lightgbm takes the categorical columns at fit time)
    fit_kwargs = {"categorical_feature": categorical_cols} if name == "lightgbm" else {}
    start_train = time.time()
    model.fit(X_train, y_train, **fit_kwargs)
    train_time = time.time() - start_train

    log = {"training_time_s": train_time}
    task_type = task_type.lower()

    def score_classification(X, y, split_key: str):
        # Macro averaging for multiclass, plain binary averaging otherwise
        averaging = "macro" if task_type == "multiclass" else "binary"

        start_pred = time.time()
        y_pred = model.predict(X)
        pred_time = time.time() - start_pred

        out = {f"{split_key}_inference_time_s": pred_time}
        out[f"{split_key}_accuracy"] = accuracy_score(y, y_pred)
        out[f"{split_key}_f1"] = f1_score(y, y_pred, average=averaging)
        out[f"{split_key}_recall"] = recall_score(y, y_pred, average=averaging)
        out[f"{split_key}_precision"] = precision_score(y, y_pred, average=averaging)

        # Binary-only extras (probability-based + MCC)
        if task_type == "binary":
            if hasattr(model, "predict_proba"):
                y_prob = model.predict_proba(X)[:, 1]
                out[f"{split_key}_roc_auc"] = roc_auc_score(y, y_prob)
                out[f"{split_key}_avg_precision"] = average_precision_score(y, y_prob)
                out[f"{split_key}_log_loss"] = log_loss(y, y_prob)
            out[f"{split_key}_mcc"] = matthews_corrcoef(y, y_pred)

        print(f"\n--- {split_key} results ---")
        for k in sorted(out):
            print(f"{k}: {out[k]:.6f}")
        return out

    def score_regression(X, y, split_key: str):
        y_pred = model.predict(X)
        mse = mean_squared_error(y, y_pred)
        res = {
            f"{split_key}_mse": mse,
            f"{split_key}_rmse": np.sqrt(mse),
            f"{split_key}_r2": r2_score(y, y_pred),
        }
        print(f"\n--- {split_key} regression results ---")
        for k in sorted(res):
            print(f"{k}: {res[k]:.6f}")
        return res

    # Same scorer for every split, always in train -> val -> test order
    scorer = score_regression if task_type == "regression" else score_classification
    splits = (
        ("train", X_train, y_train),
        ("val", X_val, y_val),
        ("test", X_test, y_test),
    )
    for split_key, X_split, y_split in splits:
        log.update(scorer(X_split, y_split, split_key))

    # Save model + metrics + params
    model_file = os.path.join(save_path, "best_model.joblib")
    joblib.dump(model, model_file)

    log_file = os.path.join(save_path, "best_model_metrics.txt")
    with open(log_file, "w") as f:
        for k in sorted(log):
            v = log[k]
            if isinstance(v, (int, float, np.floating)):
                f.write(f"{k}: {v:.6f}\n")
            else:
                f.write(f"{k}: {v}\n")

    params_file = os.path.join(save_path, "best_params.json")
    with open(params_file, "w") as f:
        json.dump(best_params, f, indent=4)

    print("\nBest Parameters:")
    print(best_params)
    print(f"\nModel saved: {model_file}")
    print(f"Metrics saved: {log_file}")
    return log


In [None]:
import random
import numpy as np
import torch

def set_model_seed(seed: int):
    """
    Seed every random number generator used in this notebook with ``seed``.

    Covers Python's ``random``, NumPy's global RNG and PyTorch (CPU, the
    current CUDA device and all CUDA devices), then switches cuDNN to
    deterministic mode with benchmarking disabled.
    """
    seeders = (
        random.seed,                  # Python built-in RNG
        np.random.seed,               # NumPy RNG
        torch.manual_seed,            # Torch RNG
        torch.cuda.manual_seed,       # current CUDA device
        torch.cuda.manual_seed_all,   # every CUDA device (multi-GPU)
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # For reproducibility
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

## EXPERIMENTS

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Root folder for this run's artefacts: logs/<task_type>/<dataset_name>
save_dir =  os.path.join("logs", task_type, dataset_name)

In [None]:
# Define the tuning metric and the Optuna optimisation direction based on task_type
if task_type.lower() == 'regression':
    metric_name = "RMSE"  # or any other regression metric
    direction = "minimize"  # Lower RMSE is better
elif task_type.lower() == 'binary':
    metric_name = "AUC"  # or any other binary classification metric
    direction = "maximize"  # Higher AUC is better
elif task_type.lower() == 'multiclass':
    metric_name = "Accuracy"  # or any other multiclass classification metric
    direction = "maximize"  # Higher accuracy is better
else:
    raise ValueError(f"Unknown task_type: {task_type}")

print(metric_name, direction)

# num_classes is only set for multiclass tasks: the number of distinct values
# in the last column of df (the target column)
if task_type.lower() == 'regression' or task_type.lower() == 'binary':
    num_classes = None
else:
    num_classes = df.iloc[:, -1].nunique()
    print(f"Number of classes: {num_classes}")

In [None]:
from torch.utils.data import DataLoader, Subset
import torch
import numpy as np

def reduce_dataloader(train_loader, fraction=0.25, stratify=True, seed=42):
    """
    Build a shuffled DataLoader over roughly ``fraction`` of ``train_loader``'s dataset.

    If ``stratify`` is set and the dataset exposes at least two tensors
    (TensorDataset-style, labels last), the subset is drawn with a stratified
    split on the labels. If that attempt fails for any reason, or does not
    apply, a seeded random permutation is used instead. Batch size, workers,
    pin_memory, drop_last and persistent_workers are copied from the original
    loader.
    """
    assert 0 < fraction <= 1.0
    dataset = train_loader.dataset
    total = len(dataset)
    keep_count = max(1, int(round(total * fraction)))
    positions = np.arange(total)

    chosen_positions = None
    has_labels = hasattr(dataset, "tensors") and len(dataset.tensors) >= 2
    if stratify and has_labels:
        # Last tensor is taken as the label vector
        labels = dataset.tensors[-1].cpu().numpy().ravel()
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
            splitter = StratifiedShuffleSplit(n_splits=1, train_size=fraction, random_state=seed)
            picked, _ = next(splitter.split(positions, labels))
            chosen_positions = positions[picked]
        except Exception:
            # Best effort only: any failure falls through to the random pick below
            chosen_positions = None

    if chosen_positions is None:
        # Fallback: seeded random subset of fixed size
        generator = torch.Generator().manual_seed(seed)
        chosen_positions = torch.randperm(total, generator=generator)[:keep_count].tolist()

    # New loader over the subset, reusing the original loader's settings
    loader_options = dict(
        batch_size=train_loader.batch_size,
        shuffle=True,
        num_workers=getattr(train_loader, "num_workers", 0),
        pin_memory=getattr(train_loader, "pin_memory", False),
        drop_last=getattr(train_loader, "drop_last", False),
        persistent_workers=getattr(train_loader, "persistent_workers", False),
    )
    return DataLoader(Subset(dataset, chosen_positions), **loader_options)

In [None]:
import re
import pandas as pd

def _clean_name(s: str) -> str:
    s = str(s)
    s = re.sub(r"[\[\]<>]", "_", s)        # prohibidos por XGBoost
    s = re.sub(r"\s+", "_", s)             # espacios -> _
    s = re.sub(r"[^0-9a-zA-Z_]", "_", s)   # resto raro -> _
    s = re.sub(r"_+", "_", s).strip("_")   # compactar
    return s or "col"

def sanitize_after_preprocess(X_train, X_val, X_test, categorical_cols=None):
    """
    Rename the columns of the three splits to sanitized, unique names.

    The old-to-new mapping is derived from ``X_train``; val/test are renamed
    with the same mapping and reindexed to the train column set and order
    (columns missing from a split are filled with 0). ``categorical_cols`` may
    hold column names or integer positions and is translated to the new names.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, updated_categorical_cols, colmap)``. If
        ``X_train`` has no ``columns`` attribute, the inputs are returned
        untouched with an empty mapping.
    """
    # Not a DataFrame (e.g. a numpy array): there are no names to sanitize
    if not hasattr(X_train, "columns"):
        return X_train, X_val, X_test, categorical_cols, {}

    # --- 1) build the old -> new mapping from TRAIN, enforcing uniqueness ---
    old_cols = [str(c) for c in X_train.columns]
    occurrences = {}
    new_cols = []
    for cleaned in map(_clean_name, old_cols):
        count = occurrences.get(cleaned, 0)
        # The first occurrence keeps the clean name; repeats get a __<n> suffix
        new_cols.append(cleaned if count == 0 else f"{cleaned}__{count}")
        occurrences[cleaned] = count + 1

    colmap = dict(zip(old_cols, new_cols))

    # --- 2) rename TRAIN ---
    X_train = X_train.copy()
    X_train.columns = new_cols

    # --- 3) VAL / TEST: map known names, clean unknown ones, realign to TRAIN ---
    def _align_split(frame):
        if not hasattr(frame, "columns"):
            return frame  # e.g. numpy
        renamed = frame.copy()
        renamed.columns = [colmap.get(str(c), _clean_name(str(c))) for c in renamed.columns]
        # Same column set and order as TRAIN; missing columns -> 0
        return renamed.reindex(columns=new_cols, fill_value=0)

    X_val = _align_split(X_val)
    X_test = _align_split(X_test)

    # --- 4) update categorical_cols (when given and non-empty) ---
    updated_cats = categorical_cols
    if categorical_cols is not None and len(categorical_cols) > 0:
        if isinstance(categorical_cols[0], str):
            # Names: translate through colmap (or clean if absent from train),
            # then keep only those that exist among the train columns
            translated = [colmap.get(c, _clean_name(c)) for c in categorical_cols]
            updated_cats = [c for c in translated if c in new_cols]
        else:
            # Positions: convert in-range indices to the sanitized names
            updated_cats = [new_cols[i] for i in categorical_cols if 0 <= i < len(new_cols)]

    return X_train, X_val, X_test, updated_cats, colmap


In [None]:
def get_subset_indices_from_loader(loader) -> np.ndarray:
    """
    Resolve which rows of the innermost (non-Subset) dataset a loader draws from.

    Walks through arbitrarily nested ``torch.utils.data.Subset`` wrappers and
    composes their index lists, preserving the order stored in the outermost
    Subset. A loader over a plain dataset yields ``arange(len(dataset))``.
    """
    index_layers = []  # outermost Subset first
    dataset = loader.dataset
    while isinstance(dataset, Subset):
        index_layers.append(np.asarray(dataset.indices))
        dataset = dataset.dataset

    if not index_layers:
        return np.arange(len(dataset))

    # Push the outer selection through each deeper layer in turn
    resolved = index_layers[0]
    for deeper in index_layers[1:]:
        resolved = deeper[resolved]
    return resolved

## XGBoost

In [None]:
# Display the output folder used for this run's logs and models
save_dir

In [None]:
# Build the train/val/test DataLoaders (the call also prints split shapes and loss weights)
train_loader, val_loader, test_loader, attributes, label_encoder, class_weight  = load_and_preprocess_data_deep(df, dataset_name, task_type, seed=SEED, device=device)

In [None]:
# Subsample the training loader with reduce_dataloader's defaults (fraction=0.25, stratify=True, seed=42)
new_loader = reduce_dataloader(train_loader)

In [None]:
# Positions (within the training dataset) of the rows kept by the reduced loader
subset_idx = get_subset_indices_from_loader(new_loader)

In [None]:
# Tabular splits for XGBoost: "xgboost" is neither "catboost" nor "lightgbm", so
# preprocess_data takes its scaler / one-hot path. This rebinds label_encoder and
# class_weight returned by the DataLoader cell above.
X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, label_encoder, class_weight = preprocess_data(df, dataset_name=dataset_name, task_type=task_type, model_type="xgboost", seed=SEED)

In [None]:
# 2) Sanitize column NAMES after preprocessing
X_train, X_val, X_test, categorical_cols, colmap = sanitize_after_preprocess(
    X_train, X_val, X_test, categorical_cols
)

# (optional) report how many column names were changed
changed = {k: v for k, v in colmap.items() if k != v}
print(f"Saneadas {len(changed)} columnas problemáticas." if changed else "No había columnas problemáticas.")

In [None]:
# Reduced training split: select the rows at positions subset_idx (positional, hence .iloc).
# NOTE(review): assumes the DataLoader's training rows are in the same order as X_train
# (both come from train_test_split with seed=SEED on the same df) — confirm.
X_train_red = X_train.iloc[subset_idx].reset_index(drop=True)
y_train_red = y_train.iloc[subset_idx].reset_index(drop=True)

In [None]:
# Hyperparameter search for XGBoost.
# The sampler is seeded so the search is reproducible, like the rest of the notebook.
study = optuna.create_study(
    direction=direction,
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(lambda trial: objective(
    trial=trial,
    name="xgboost",
    task_type=task_type,
    num_classes=num_classes,
    # Features and targets must come from the same (reduced or full) split;
    # pairing X_train_red with the full y_train gives mismatched lengths.
    X_train=X_train_red if reduce else X_train,
    y_train=y_train_red if reduce else y_train,
    X_val=X_val,
    y_val=y_val,
    metric_name=metric_name,
    SEED=SEED,
    device=device,
    save_dir=save_dir,
    class_weight=None,
), n_trials=100)

# Print best result summary
best_trial = study.best_trial
print(f"\nBest Trial: {best_trial.number}")
print(f"  Score: {best_trial.value:.4f}")
print("  Best Hyperparameters:")
for k, v in best_trial.params.items():
    print(f"    {k}: {v}")

In [None]:
from numbers import Number

# Re-train the best XGBoost configuration with several model seeds and
# aggregate every numeric metric (mean / sample std) across those seeds.
model_seeds = [0, 1, 2, 3, 4]
numeric_keys = None
per_seed_metrics = []

# Summary file lives next to the artefacts written by evaluate_best_model
summary_dir = os.path.join(save_dir, "xgboost", "best_model")
os.makedirs(summary_dir, exist_ok=True)
out_file = os.path.join(summary_dir, "best_results_mean.txt")

for s in model_seeds:
    set_model_seed(s)  # seed python / numpy / torch RNGs
    metrics = evaluate_best_model(
        study,                 # Optuna study with .best_params
        "xgboost",
        task_type,
        X_train, y_train,
        X_val, y_val,
        X_test, y_test,
        categorical_cols=categorical_cols,
        num_classes=num_classes,
        SEED=s,                # the seed is also passed into the model
        device=device,
        save_dir=save_dir,
        class_weight=None
    )
    if not isinstance(metrics, dict):
        raise TypeError(f"evaluate_best_model must return dict, got {type(metrics)}")

    # The numeric metric keys are fixed by the first seed's result
    if numeric_keys is None:
        numeric_keys = [k for k, v in metrics.items() if isinstance(v, (Number, np.floating, np.integer))]
    per_seed_metrics.append(metrics)

    # brief print
    brief = ", ".join(f"{k}={float(metrics[k]):.6f}" for k in numeric_keys[:6])
    print(f"Seed {s}: {brief}")

# Aggregate mean/std (sample std, ddof=1)
aggregates = {}
for k in numeric_keys:
    vals = [float(m[k]) for m in per_seed_metrics]
    mean_k = float(np.mean(vals))
    std_k  = float(np.std(vals, ddof=1)) if len(vals) > 1 else 0.0
    aggregates[k] = {"mean": mean_k, "std": std_k}

# Save YAML-like txt
with open(out_file, "w", encoding="utf-8") as f:
    f.write("# Best trial re-evaluation across model seeds\n")
    # patch_size only exists for ViT/CNN searches, so it is written only when present
    if hasattr(study, "best_params") and "patch_size" in study.best_params:
        f.write(f"patch_size: {study.best_params['patch_size']}\n")
    f.write(f"seeds: {model_seeds}\n")
    f.write("per_seed_metrics:\n")
    for s, m in zip(model_seeds, per_seed_metrics):
        f.write(f"  - seed: {s}\n")
        for k in numeric_keys:
            f.write(f"      {k}: {float(m[k]):.6f}\n")
    f.write("aggregates:\n")
    for k, mm in aggregates.items():
        f.write(f"  {k}:\n")
        f.write(f"    mean: {mm['mean']:.6f}\n")
        f.write(f"    std: {mm['std']:.6f}\n")

# Console summary: first available headline metric for the task.
# The candidates use the snake_case keys that evaluate_best_model emits;
# the previous "Test AUC"-style names never matched, so nothing was printed.
headline_candidates = {
    "binary": ["test_roc_auc", "test_accuracy", "val_roc_auc", "val_accuracy"],
    "multiclass": ["test_accuracy", "val_accuracy", "test_f1", "val_f1"],
}
regression_candidates = ["test_rmse", "val_rmse", "test_r2", "val_r2"]
pref_key = next(
    (cand for cand in headline_candidates.get(task_type.lower(), regression_candidates)
     if cand in aggregates),
    None,
)

if pref_key:
    print(f"→ xgboost: {pref_key} = {aggregates[pref_key]['mean']:.6f} ± {aggregates[pref_key]['std']:.6f}")
print(f"Saved to: {out_file}")


## Catboost

In [None]:
# Display the output folder used for this run's logs and models
save_dir

In [None]:
# Tabular splits for CatBoost: with model_type="catboost", preprocess_data skips
# scaling / one-hot and only casts the categorical columns to str
X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, label_encoder, class_weight = preprocess_data(df, dataset_name=dataset_name, task_type=task_type, model_type="catboost", seed=SEED)

In [None]:
# Re-read the categorical column names from the preprocessing config.
# NOTE(review): preprocess_data(model_type="catboost") already returns this same
# config list as categorical_cols, so this reload looks redundant — confirm.
with open(f"./configs/preprocess/{dataset_name}.json") as f:
    config = json.load(f)

categorical_cols = config["categorical_cols"]

In [None]:
# 2) Sanitize column NAMES after preprocessing
X_train, X_val, X_test, categorical_cols, colmap = sanitize_after_preprocess(
    X_train, X_val, X_test, categorical_cols
)

# (optional) report how many column names were changed
changed = {k: v for k, v in colmap.items() if k != v}
print(f"Saneadas {len(changed)} columnas problemáticas." if changed else "No había columnas problemáticas.")

In [None]:
# 2) Sanitize column NAMES after preprocessing
# NOTE(review): this cell repeats the previous cell verbatim. Running it a second
# time rebuilds colmap from the already-renamed columns, so the original -> clean
# mapping from the first call is overwritten. Consider deleting this cell.
X_train, X_val, X_test, categorical_cols, colmap = sanitize_after_preprocess(
    X_train, X_val, X_test, categorical_cols
)

# (optional) report how many column names were changed
changed = {k: v for k, v in colmap.items() if k != v}
print(f"Saneadas {len(changed)} columnas problemáticas." if changed else "No había columnas problemáticas.")

In [None]:
# Reduced training split for CatBoost: select the rows at positions subset_idx (positional, hence .iloc).
# NOTE(review): subset_idx is not recomputed in this section; it is the value left
# behind by the XGBoost section and assumes the same training row order — confirm.
X_train_red = X_train.iloc[subset_idx].reset_index(drop=True)
y_train_red = y_train.iloc[subset_idx].reset_index(drop=True)

In [None]:
# Hyperparameter search for CatBoost.
# The sampler is seeded so the search is reproducible, like the rest of the notebook.
study = optuna.create_study(
    direction=direction,
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(lambda trial: objective(
    trial=trial,
    name="catboost",
    task_type=task_type,
    num_classes=num_classes,
    # Features and targets must come from the same (reduced or full) split;
    # pairing X_train_red with the full y_train gives mismatched lengths.
    X_train=X_train_red if reduce else X_train,
    y_train=y_train_red if reduce else y_train,
    X_val=X_val,
    y_val=y_val,
    metric_name=metric_name,
    SEED=SEED,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    categorical_cols=categorical_cols
), n_trials=100)

# Print best result summary
best_trial = study.best_trial
print(f"\nBest Trial: {best_trial.number}")
# The tuned metric depends on task_type (RMSE / AUC / Accuracy), so the label is
# generic rather than a hard-coded "AUC Score"
print(f"  Score: {best_trial.value:.4f}")
print("  Best Hyperparameters:")
for k, v in best_trial.params.items():
    print(f"    {k}: {v}")

In [None]:
from numbers import Number

# Re-evaluate the best CatBoost configuration once per model seed, then aggregate
# every numeric metric (mean / sample std) and write a YAML-like text report.
model_seeds = list(range(5))
per_seed_metrics = []
numeric_keys = None

# Report location: <save_dir>/catboost/best_model/best_results_mean.txt
summary_dir = os.path.join(save_dir, "catboost/best_model")
os.makedirs(summary_dir, exist_ok=True)
out_file = os.path.join(summary_dir, "best_results_mean.txt")

for model_seed in model_seeds:
    set_model_seed(model_seed)  # helper defined elsewhere in the notebook
    metrics = evaluate_best_model(
        study,
        "catboost",
        task_type,
        X_train, y_train,
        X_val, y_val,
        X_test, y_test,
        categorical_cols=categorical_cols,
        num_classes=num_classes,
        SEED=model_seed,  # the seed is also forwarded to the model itself
        device=device,
        save_dir=save_dir,
        class_weight=None,
    )
    if not isinstance(metrics, dict):
        raise TypeError(f"evaluate_best_model must return dict, got {type(metrics)}")

    # The first run decides which metrics count as numeric and get aggregated.
    if numeric_keys is None:
        numeric_keys = [
            key
            for key, value in metrics.items()
            if isinstance(value, (Number, np.floating, np.integer))
        ]
    per_seed_metrics.append(metrics)

    # Short console line with at most the first six numeric metrics.
    shown = [f"{key}={float(metrics[key]):.6f}" for key in numeric_keys[:6]]
    print(f"Seed {model_seed}: " + ", ".join(shown))

# Mean and sample standard deviation (ddof=1) per metric; std is 0.0 for a single run.
aggregates = {}
for key in numeric_keys:
    samples = [float(run[key]) for run in per_seed_metrics]
    center = float(np.mean(samples))
    spread = float(np.std(samples, ddof=1)) if len(samples) > 1 else 0.0
    aggregates[key] = {"mean": center, "std": spread}

# Write the report.
with open(out_file, "w", encoding="utf-8") as report:
    report.write("# Best trial re-evaluation across model seeds\n")
    # patch_size is only written when it is among the study's best params.
    if hasattr(study, "best_params") and "patch_size" in study.best_params:
        report.write(f"patch_size: {study.best_params['patch_size']}\n")
    report.write(f"seeds: {model_seeds}\n")
    report.write("per_seed_metrics:\n")
    for model_seed, run in zip(model_seeds, per_seed_metrics):
        report.write(f"  - seed: {model_seed}\n")
        report.writelines(f"      {key}: {float(run[key]):.6f}\n" for key in numeric_keys)
    report.write("aggregates:\n")
    for key, stats in aggregates.items():
        report.writelines((
            f"  {key}:\n",
            f"    mean: {stats['mean']:.6f}\n",
            f"    std: {stats['std']:.6f}\n",
        ))

# Console summary: first available metric from a task-specific preference list;
# any task type other than binary / multiclass uses the regression list.
preferred_by_task = {
    "binary": ["Test AUC", "Test Accuracy", "Val AUC", "Val Accuracy"],
    "multiclass": ["Test Accuracy", "Val Accuracy", "Test F1", "Val F1"],
}
regression_candidates = ["Test RMSE", "Val RMSE", "Test R2", "Val R2"]
candidates = preferred_by_task.get(task_type.lower(), regression_candidates)
pref_key = next((cand for cand in candidates if cand in aggregates), None)

if pref_key:
    print(f"→ catboost: {pref_key} = {aggregates[pref_key]['mean']:.6f} ± {aggregates[pref_key]['std']:.6f}")
print(f"Saved to: {out_file}")


## LightGBM

In [None]:
# Display the current value of save_dir (the base directory later joined with the
# model sub-folder when the summary report is written).
save_dir

In [None]:
# Run preprocessing with model_type="lightgbm" and unpack the three splits,
# the categorical column list, the label encoder and the class weights.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    categorical_cols, label_encoder, class_weight,
) = preprocess_data(
    df,
    dataset_name=dataset_name,
    task_type=task_type,
    model_type="lightgbm",
    seed=SEED,
)

In [None]:
# Load the preprocessing config of the current dataset.
# The file is decoded as UTF-8 explicitly: without `encoding`, open() falls back to
# the platform's locale encoding, which can mis-decode non-ASCII column names.
with open(f"./configs/preprocess/{dataset_name}.json", encoding="utf-8") as f:
    config = json.load(f)

# From here on, use the categorical column list declared in the config
# (this overwrites the list unpacked from preprocess_data).
categorical_cols = config["categorical_cols"]

In [None]:
# Step 2: sanitize column NAMES once preprocessing is done.
# The splits and the categorical column list are replaced by the returned versions;
# colmap is the name mapping returned alongside them.
(X_train, X_val, X_test, categorical_cols, colmap) = sanitize_after_preprocess(
    X_train, X_val, X_test, categorical_cols
)

# (optional) report the changes: keep only colmap entries whose key and value differ
changed = {old: new for old, new in colmap.items() if old != new}
if changed:
    print(f"Saneadas {len(changed)} columnas problemáticas.")
else:
    print("No había columnas problemáticas.")

In [None]:
# 2) Sanitize column NAMES after the preprocess step
# NOTE(review): this cell issues the same sanitize_after_preprocess call as the cell
# right before it, so the sanitizer runs a second time on its own output and
# `colmap` / `changed` are overwritten with the result of that second run.
# Presumably one of the two cells is a leftover copy — confirm and remove it.
X_train, X_val, X_test, categorical_cols, colmap = sanitize_after_preprocess(
    X_train, X_val, X_test, categorical_cols
)

# (optional) show what changed: colmap entries whose key differs from their value
changed = {k: v for k, v in colmap.items() if k != v}
print(f"Saneadas {len(changed)} columnas problemáticas." if changed else "No había columnas problemáticas.")

In [None]:
# Build the reduced training set: take the rows at positions `subset_idx` from the
# features and from the labels, then renumber both indexes from 0.
X_train_red, y_train_red = (
    split.iloc[subset_idx].reset_index(drop=True) for split in (X_train, y_train)
)

In [None]:
# Hyperparameter search for LightGBM (100 Optuna trials).
# The sampler is seeded with SEED so the same sequence of trials is proposed on
# every run; TPESampler is Optuna's default sampler, only the seed is new.
study = optuna.create_study(
    direction=direction,
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(lambda trial: objective(
    trial=trial,
    name="lightgbm",
    task_type=task_type,
    num_classes=num_classes,
    # When `reduce` is set (more than 20000 rows in df) tune on the reduced subset.
    # Features AND labels must come from the same subset so their rows stay aligned;
    # previously only the features were switched to the reduced version.
    X_train=X_train_red if reduce else X_train,
    y_train=y_train_red if reduce else y_train,
    X_val=X_val,
    y_val=y_val,
    metric_name=metric_name,
    SEED=SEED,
    device=device,
    save_dir=save_dir,
    class_weight=None,
    categorical_cols=categorical_cols
), n_trials=100)

# Print best result summary.
# The value is labelled with the metric that was actually optimised instead of a
# hard-coded "AUC", which is wrong for regression / multiclass runs.
best_trial = study.best_trial
print(f"\nBest Trial: {best_trial.number}")
print(f"  {metric_name}: {best_trial.value:.4f}")
print("  Best Hyperparameters:")
for k, v in best_trial.params.items():
    print(f"    {k}: {v}")

In [None]:
from numbers import Number

# Re-evaluate the best LightGBM configuration once per model seed, then aggregate
# every numeric metric (mean / sample std) and write a YAML-like text report.
model_seeds = list(range(5))
per_seed_metrics = []
numeric_keys = None

# Report location: <save_dir>/lightgbm/best_model/best_results_mean.txt
summary_dir = os.path.join(save_dir, "lightgbm/best_model")
os.makedirs(summary_dir, exist_ok=True)
out_file = os.path.join(summary_dir, "best_results_mean.txt")

for model_seed in model_seeds:
    set_model_seed(model_seed)  # helper defined elsewhere in the notebook
    metrics = evaluate_best_model(
        study,
        "lightgbm",
        task_type,
        X_train, y_train,
        X_val, y_val,
        X_test, y_test,
        categorical_cols=categorical_cols,
        num_classes=num_classes,
        SEED=model_seed,  # the seed is also forwarded to the model itself
        device=device,
        save_dir=save_dir,
        class_weight=None,
    )
    if not isinstance(metrics, dict):
        raise TypeError(f"evaluate_best_model must return dict, got {type(metrics)}")

    # The first run decides which metrics count as numeric and get aggregated.
    if numeric_keys is None:
        numeric_keys = [
            key
            for key, value in metrics.items()
            if isinstance(value, (Number, np.floating, np.integer))
        ]
    per_seed_metrics.append(metrics)

    # Short console line with at most the first six numeric metrics.
    shown = [f"{key}={float(metrics[key]):.6f}" for key in numeric_keys[:6]]
    print(f"Seed {model_seed}: " + ", ".join(shown))

# Mean and sample standard deviation (ddof=1) per metric; std is 0.0 for a single run.
aggregates = {}
for key in numeric_keys:
    samples = [float(run[key]) for run in per_seed_metrics]
    center = float(np.mean(samples))
    spread = float(np.std(samples, ddof=1)) if len(samples) > 1 else 0.0
    aggregates[key] = {"mean": center, "std": spread}

# Write the report.
with open(out_file, "w", encoding="utf-8") as report:
    report.write("# Best trial re-evaluation across model seeds\n")
    # patch_size is only written when it is among the study's best params.
    if hasattr(study, "best_params") and "patch_size" in study.best_params:
        report.write(f"patch_size: {study.best_params['patch_size']}\n")
    report.write(f"seeds: {model_seeds}\n")
    report.write("per_seed_metrics:\n")
    for model_seed, run in zip(model_seeds, per_seed_metrics):
        report.write(f"  - seed: {model_seed}\n")
        report.writelines(f"      {key}: {float(run[key]):.6f}\n" for key in numeric_keys)
    report.write("aggregates:\n")
    for key, stats in aggregates.items():
        report.writelines((
            f"  {key}:\n",
            f"    mean: {stats['mean']:.6f}\n",
            f"    std: {stats['std']:.6f}\n",
        ))

# Console summary: first available metric from a task-specific preference list;
# any task type other than binary / multiclass uses the regression list.
preferred_by_task = {
    "binary": ["Test AUC", "Test Accuracy", "Val AUC", "Val Accuracy"],
    "multiclass": ["Test Accuracy", "Val Accuracy", "Test F1", "Val F1"],
}
regression_candidates = ["Test RMSE", "Val RMSE", "Test R2", "Val R2"]
candidates = preferred_by_task.get(task_type.lower(), regression_candidates)
pref_key = next((cand for cand in candidates if cand in aggregates), None)

if pref_key:
    print(f"→ lightgbm: {pref_key} = {aggregates[pref_key]['mean']:.6f} ± {aggregates[pref_key]['std']:.6f}")
print(f"Saved to: {out_file}")
