# LGBM — Dataset Reducido con Selección de Características (CV Permutation Importance)


## 1 — Imports, configuración y rutas

In [140]:
import json, os, warnings, re
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    average_precision_score, precision_recall_curve, roc_auc_score, roc_curve,
    f1_score, recall_score, balanced_accuracy_score, confusion_matrix, precision_score
)
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Balanceo
from imblearn.over_sampling import SMOTENC, SMOTE

# LightGBM
from lightgbm import LGBMClassifier
try:
    from lightgbm import early_stopping, log_evaluation
    _LGBM_CB_OK = True
except Exception:
    _LGBM_CB_OK = False

# === Toggles de experimento ===
USE_MI_REDUCED     = False
USE_PCA_REDUCED    = False
USE_PERM_REDUCED   = True
PERM_N_REPEATS     = 20
PERM_TOPK          = None
PERM_MIN_DELTA     = 0.0

# --- Vista de reducción por L1 ---
USE_L1_REDUCED     = False
L1_C               = 0.8
L1_TOPK            = None
L1_MIN_ABS_COEF    = 1e-6
L1_STANDARDIZE     = True

MI_TOPK            = 30
PCA_NCOMP          = 50
USE_BALANCED_TRAIN = True
BALANCE_IN_CV      = True
RANDOM_STATE       = 42
DO_TUNE            = True
DO_CV_BASELINE     = True
DO_CV_TUNED        = True
CV_FOLDS           = 5

# === Nombres y rutas ===
ROOT = Path.cwd().parent

red_tag = "FULL"
if USE_MI_REDUCED:
    red_tag = f"REDUCED_MI{MI_TOPK}"
elif USE_PCA_REDUCED:
    red_tag = f"REDUCED_PCA{PCA_NCOMP}"
elif USE_PERM_REDUCED:
    red_tag = f"REDUCED_PERMcv_{PERM_TOPK if PERM_TOPK else 'thr'}"
elif USE_L1_REDUCED:
    red_tag = f"REDUCED_L1_{('top'+str(L1_TOPK)) if L1_TOPK is not None else 'thr'}"

bal_tag = "SMOTENC" if USE_BALANCED_TRAIN else "IMB"

EXP_NAME   = f"LGBM_{red_tag}_{bal_tag}"
ARTIF_DIR  = ROOT / "artifacts" / EXP_NAME
OUT_RESULTS = ARTIF_DIR / "results"
OUT_FIGS    = ARTIF_DIR / "figs"
OUT_PREDS   = ARTIF_DIR / "preds"
OUT_PARAMS  = ARTIF_DIR / "best_params"
for p in [OUT_RESULTS, OUT_FIGS, OUT_PREDS, OUT_PARAMS]:
    p.mkdir(parents=True, exist_ok=True)

# Dataset preprocesado
DATA_DIR = ROOT / "preproc_datasets" / "full"

# === Helpers para SMOTENC con metadatos de features ===
def cat_indices_from_feature_names(names):
    """
    Devuelve los índices de columnas categóricas según convención de nombres.
    Se asume OHE con prefijo 'cat__' para variables categóricas.
    """
    return np.array([i for i, n in enumerate(names) if isinstance(n, str) and n.startswith("cat__")], dtype=int)

def resample_smote_nc(X, y, feat_names, seed=RANDOM_STATE):
    """
    Aplica SMOTENC usando índices categóricos derivados de feat_names.
    Si no hay columnas categóricas (p.ej., tras PCA), cae a SMOTE estándar.
    """
    cats = cat_indices_from_feature_names(feat_names)
    if cats.size > 0:
        sampler = SMOTENC(categorical_features=cats, random_state=seed)
    else:
        sampler = SMOTE(random_state=seed)
    return sampler.fit_resample(X, y)

print("Exp:", EXP_NAME)
print("DATA_DIR:", DATA_DIR)
print("ARTIF_DIR:", ARTIF_DIR)

Exp: LGBM_REDUCED_PERMcv_thr_SMOTENC
DATA_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/preproc_datasets/full
ARTIF_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/LGBM_REDUCED_PERMcv_thr_SMOTENC


## 2 — Carga de artefactos (X, y, features)

In [141]:
def load_xy_full(dir_full: Path):
    X_train = np.load(dir_full / "X_train_full.npy")
    X_val   = np.load(dir_full / "X_val_full.npy")
    X_test  = np.load(dir_full / "X_test_full.npy")

    y_train = pd.read_parquet(dir_full / "y_train.parquet")["Exited"].to_numpy()
    y_val   = pd.read_parquet(dir_full / "y_val.parquet")["Exited"].to_numpy()
    y_test  = pd.read_parquet(dir_full / "y_test.parquet")["Exited"].to_numpy()

    feat = pd.read_parquet(dir_full / "feature_names_full.parquet")["feature"].tolist()
    return X_train, y_train, X_val, y_val, X_test, y_test, feat

X_train, y_train, X_val, y_val, X_test, y_test, feature_names_all = load_xy_full(DATA_DIR)
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("y train/val/test:", y_train.shape, y_val.shape, y_test.shape)
print("n features:", len(feature_names_all))

Shapes: (6000, 15) (2000, 15) (2000, 15)
y train/val/test: (6000,) (2000,) (2000,)
n features: 15


## 3 — Métricas y selección de umbral

In [142]:
def pr_auc(y_true, y_proba): 
    return float(average_precision_score(y_true, y_proba))

def roc_auc(y_true, y_proba): 
    return float(roc_auc_score(y_true, y_proba))

def find_best_threshold(y_true, y_proba, metric="f1"):
    thr_grid = np.linspace(0.0, 1.0, 1001)
    best_thr, best_score = 0.5, -1.0
    for thr in thr_grid:
        y_pred = (y_proba >= thr).astype(int)
        if metric == "f1":
            score = f1_score(y_true, y_pred, zero_division=0)
        elif metric == "recall":
            score = recall_score(y_true, y_pred, zero_division=0)
        else:
            raise ValueError("metric no soportada")
        if score > best_score:
            best_score, best_thr = score, thr
    return float(best_thr), float(best_score)

def compute_all_metrics(y_true, y_proba, thr):
    y_pred = (y_proba >= thr).astype(int)
    return {
        "pr_auc": pr_auc(y_true, y_proba),
        "roc_auc": roc_auc(y_true, y_proba),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "bal_acc": balanced_accuracy_score(y_true, y_pred)
    }

def sanitize_params(p, use_balanced_train=True):
    p = dict(p)
    p.pop("verbose", None)
    p.setdefault("verbosity", -1)
    p["metric"] = "average_precision"
    if use_balanced_train:
        p["class_weight"] = None
    return p

def pred_proba_best(mdl, X):
    it = getattr(mdl, "best_iteration_", None)
    if it is not None:
        return mdl.predict_proba(X, num_iteration=it)[:, 1]
    return mdl.predict_proba(X)[:, 1]

## 4 — Reducción por MI Top-K y Permutation CV

In [143]:
def fit_mi_selector(X_tr, y_tr, topk=MI_TOPK, seed=RANDOM_STATE):
    mi = mutual_info_classif(X_tr, y_tr, random_state=seed)
    order = np.argsort(mi)[::-1][:min(topk, X_tr.shape[1])]
    return order, mi

def apply_keep_idx(X, keep_idx):
    return X[:, keep_idx] if keep_idx is not None else X

# --- Selección L1 (Lasso) con LogisticRegression ---
def fit_l1_selector(
    X, y, feat_names,
    C=L1_C,
    standardize=L1_STANDARDIZE,
    topk=L1_TOPK,
    min_abs_coef=L1_MIN_ABS_COEF,
    seed=RANDOM_STATE
):

    X = np.asarray(X); y = np.asarray(y).ravel()

    X_fit = X
    scaler = None
    if standardize:
        scaler = StandardScaler(with_mean=True, with_std=True)
        X_fit = scaler.fit_transform(X)

    logreg = LogisticRegression(
        penalty="l1", solver="liblinear", C=C,
        max_iter=2000, random_state=seed
    )
    logreg.fit(X_fit, y)
    coefs = logreg.coef_.ravel()
    abs_coefs = np.abs(coefs)

    if topk is not None:
        order = np.argsort(abs_coefs)[::-1]
        order = [i for i in order if abs_coefs[i] > min_abs_coef][:min(topk, X.shape[1])]
        keep_idx = np.array(order, dtype=int)
    else:
        keep_idx = np.where(abs_coefs > float(min_abs_coef))[0].astype(int)

    kept_names    = [feat_names[i] for i in keep_idx]
    dropped_idx   = np.array([i for i in range(len(feat_names)) if i not in set(keep_idx)], dtype=int)
    dropped_names = [feat_names[i] for i in dropped_idx]

    df_coef = (
        pd.DataFrame({"feature": feat_names, "coef": coefs, "abs_coef": abs_coefs})
        .sort_values("abs_coef", ascending=False)
        .reset_index(drop=True)
    )
    df_coef["kept"] = df_coef["feature"].isin(kept_names)

    stats = {
        "kept_names": kept_names,
        "dropped_names": dropped_names,
        "df": df_coef,
        "C": C,
        "standardize": bool(standardize),
        "topk": topk,
        "min_abs_coef": float(min_abs_coef),
    }
    return keep_idx, stats

def fit_perm_selector_cv(
    X, y, feat_names,
    k_folds=CV_FOLDS,
    n_repeats=PERM_N_REPEATS,
    seed=RANDOM_STATE,
    topk=PERM_TOPK,
    min_delta=PERM_MIN_DELTA
):

    X = np.asarray(X); y = np.asarray(y).ravel()
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)

    base_params = dict(
        random_state=seed, n_estimators=1200, learning_rate=0.05,
        num_leaves=63, n_jobs=-1, metric="average_precision", verbosity=-1
    )

    _fit_kwargs_local = {}
    _callbacks_local = []
    if _LGBM_CB_OK:
        _callbacks_local = [early_stopping(stopping_rounds=200), log_evaluation(period=50)]
    else:
        _fit_kwargs_local["early_stopping_rounds"] = 200

    imps = []

    for f, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        # === Balanceo con SMOTENC ===
        if USE_BALANCED_TRAIN and BALANCE_IN_CV:
            X_tr, y_tr = resample_smote_nc(X_tr, y_tr, feat_names, seed=seed)

        mdl = LGBMClassifier(**base_params)
        fit_call = dict(eval_set=[(X_va, y_va)], eval_metric="aucpr", **_fit_kwargs_local)
        if _LGBM_CB_OK:
            fit_call["callbacks"] = _callbacks_local
        mdl.fit(X_tr, y_tr, **fit_call)

        pi = permutation_importance(
            mdl, X_va, y_va,
            scoring="average_precision",
            n_repeats=n_repeats,
            random_state=seed + f
        )
        imps.append(pi.importances_mean)

    imp_mean = np.mean(np.vstack(imps), axis=0)
    order = np.argsort(imp_mean)[::-1]

    if topk is not None:
        keep_idx = order[:min(topk, X.shape[1])]
    else:
        keep_idx = np.array([i for i in order if imp_mean[i] > float(min_delta)], dtype=int)

    kept_names    = [feat_names[i] for i in keep_idx]
    dropped_idx   = np.array([i for i in range(len(feat_names)) if i not in set(keep_idx)], dtype=int)
    dropped_names = [feat_names[i] for i in dropped_idx]

    df_imp = (
        pd.DataFrame({"feature": feat_names, "imp_mean": imp_mean})
        .sort_values("imp_mean", ascending=False)
        .reset_index(drop=True)
    )
    df_imp["kept"] = df_imp["feature"].isin(kept_names)

    stats = {
        "imp_mean": imp_mean.tolist(),
        "kept_names": kept_names,
        "dropped_names": dropped_names,
        "df": df_imp
    }
    return keep_idx, stats

## 5 — Reducción por PCA

In [144]:
pca_model = None
def apply_pca_fit_transform(X_tr, X_va, X_te, n_comp=PCA_NCOMP, seed=RANDOM_STATE):
    global pca_model
    pca_model = PCA(n_components=n_comp, random_state=seed)
    Xtr = pca_model.fit_transform(X_tr)
    Xva = pca_model.transform(X_va)
    Xte = pca_model.transform(X_te)
    comp_names = [f"PC{i+1}" for i in range(pca_model.n_components_)]
    return Xtr, Xva, Xte, comp_names

## 6 — Hiperparámetros persistentes

In [145]:
VIEW_TAG = "FULL"
if USE_MI_REDUCED:   VIEW_TAG = f"REDUCED_MI{MI_TOPK}"
if USE_PCA_REDUCED:  VIEW_TAG = f"REDUCED_PCA{PCA_NCOMP}"
if USE_PERM_REDUCED: VIEW_TAG = f"REDUCED_PERMcv_{PERM_TOPK if PERM_TOPK else 'thr'}"
if USE_L1_REDUCED:   VIEW_TAG = f"REDUCED_L1_{('top'+str(L1_TOPK)) if L1_TOPK is not None else 'thr'}"

BAL_TAG  = "SMOTENC" if USE_BALANCED_TRAIN else "IMB"
BEST_HP_FILE = OUT_PARAMS / f"BEST_LGBM_{VIEW_TAG}_{BAL_TAG}.json"

def get_lgbm_defaults(seed=RANDOM_STATE):
    mdl = LGBMClassifier(random_state=seed)
    return mdl.get_params()

def load_best_or_default():
    if BEST_HP_FILE.exists():
        try:
            best = json.loads(BEST_HP_FILE.read_text())
            print("[HP] Cargando mejores hiperparámetros previos:", BEST_HP_FILE.name)
            base = get_lgbm_defaults()
            base.update(best)
            return base, True
        except Exception as e:
            print("[HP] Aviso: no se pudo leer BEST (uso defaults).", e)
    print("[HP] Usando hiperparámetros DEFAULT.")
    return get_lgbm_defaults(), False

seed_params, loaded_best_flag = load_best_or_default()

[HP] Cargando mejores hiperparámetros previos: BEST_LGBM_REDUCED_PERMcv_thr_SMOTENC.json


## 7 — Construcción de matrices (aplica reducción + balanceo) y entrenamiento BASELINE

In [146]:
_fit_kwargs = {}
_callbacks = []
if _LGBM_CB_OK:
    _callbacks = [early_stopping(stopping_rounds=200), log_evaluation(period=50)]
else:
    _fit_kwargs["early_stopping_rounds"] = 200

seed_params = sanitize_params(seed_params, use_balanced_train=USE_BALANCED_TRAIN)

# === Aplicar reducción ===
keep_idx_global = None
feature_names_used = feature_names_all
X_train_fit, X_val_fit, X_test_fit = X_train, X_val, X_test

# MI
if USE_MI_REDUCED:
    keep_idx_global, _mi = fit_mi_selector(X_train, y_train, topk=MI_TOPK, seed=RANDOM_STATE)
    X_train_fit = apply_keep_idx(X_train, keep_idx_global)
    X_val_fit   = apply_keep_idx(X_val,   keep_idx_global)
    X_test_fit  = apply_keep_idx(X_test,  keep_idx_global)
    feature_names_used = [feature_names_all[i] for i in keep_idx_global]

# PCA
if USE_PCA_REDUCED:
    X_train_fit, X_val_fit, X_test_fit, feature_names_used = apply_pca_fit_transform(
        X_train_fit, X_val_fit, X_test_fit, n_comp=PCA_NCOMP, seed=RANDOM_STATE
    )

# PERM-CV
perm_stats = None
if USE_PERM_REDUCED:
    keep_idx_global, perm_stats = fit_perm_selector_cv(
        X_train, y_train, feature_names_all,
        k_folds=CV_FOLDS, n_repeats=PERM_N_REPEATS,
        seed=RANDOM_STATE, topk=PERM_TOPK, min_delta=PERM_MIN_DELTA
    )
    X_train_fit = apply_keep_idx(X_train, keep_idx_global)
    X_val_fit   = apply_keep_idx(X_val,   keep_idx_global)
    X_test_fit  = apply_keep_idx(X_test,  keep_idx_global)
    feature_names_used = [feature_names_all[i] for i in keep_idx_global]

    print(f"[PERM-CV] Kept: {len(feature_names_used)} | Dropped: {len(perm_stats['dropped_names'])}")
    if len(perm_stats['dropped_names']) > 0:
        print("[PERM-CV] Eliminados:", ", ".join(perm_stats['dropped_names']))

    perm_imp_path = OUT_RESULTS / f"{EXP_NAME}_perm_cv_importances.csv"
    perm_stats["df"].to_csv(perm_imp_path, index=False)
    print("【PERM-CV】Importancias CV guardadas en:", perm_imp_path.name)

# --- L1 (Logistic Regression Lasso) ---
l1_stats = None
if USE_L1_REDUCED:
    keep_idx_global, l1_stats = fit_l1_selector(
        X_train, y_train, feature_names_all,
        C=L1_C, standardize=L1_STANDARDIZE, topk=L1_TOPK,
        min_abs_coef=L1_MIN_ABS_COEF, seed=RANDOM_STATE
    )
    X_train_fit = apply_keep_idx(X_train, keep_idx_global)
    X_val_fit   = apply_keep_idx(X_val,   keep_idx_global)
    X_test_fit  = apply_keep_idx(X_test,  keep_idx_global)
    feature_names_used = [feature_names_all[i] for i in keep_idx_global]

    print(f"[L1] Kept: {len(feature_names_used)} | Dropped: {len(l1_stats['dropped_names'])}")
    if len(l1_stats['dropped_names']) > 0:
        print("[L1] Eliminados:", ", ".join(l1_stats['dropped_names']))

    l1_coef_path = OUT_RESULTS / f"{EXP_NAME}_l1_coefs.csv"
    l1_stats["df"].to_csv(l1_coef_path, index=False)
    print("[L1] Coeficientes guardados en:", l1_coef_path.name)

# === Balanceo SOLO en train ===
X_train_final, y_train_final = X_train_fit, y_train
if USE_BALANCED_TRAIN:
    X_train_final, y_train_final = resample_smote_nc(
        X_train_fit, y_train, feature_names_used, seed=RANDOM_STATE
    )

# === Entrenamiento baseline ===
model = LGBMClassifier(**seed_params)
_fit_call = dict(
    eval_set=[(X_val_fit, y_val)],
    eval_metric="aucpr",
    **_fit_kwargs
)
if _LGBM_CB_OK:
    _fit_call["callbacks"] = _callbacks

model.fit(X_train_final, y_train_final, **_fit_call)

best_iter = getattr(model, "best_iteration_", None)
print(f"[BASELINE] best_iteration: {best_iter}")

# Validación y umbral baseline
proba_val = pred_proba_best(model, X_val_fit)
thr_val, best_f1_val = find_best_threshold(y_val, proba_val, metric="f1")
val_metrics = compute_all_metrics(y_val, proba_val, thr_val)
print(f"[BASELINE] Mejor umbral (val) por F1: {thr_val:.3f} | F1(val)={best_f1_val:.4f}")

# Guardar sets de features
feature_sets_path = OUT_RESULTS / f"{EXP_NAME}_feature_sets.json"
all_feats_to_save  = feature_names_all
used_feats_to_save = feature_names_used

meta_reduction = {
    "mi_topk": MI_TOPK if USE_MI_REDUCED else None,
    "pca_components": PCA_NCOMP if USE_PCA_REDUCED else None,
    "perm_cv": {
        "enabled": bool(USE_PERM_REDUCED),
        "n_repeats": PERM_N_REPEATS if USE_PERM_REDUCED else None,
        "topk": PERM_TOPK if USE_PERM_REDUCED else None,
        "min_delta": PERM_MIN_DELTA if USE_PERM_REDUCED else None
    },
    "l1": {
        "enabled": bool(USE_L1_REDUCED),
        "C": L1_C if USE_L1_REDUCED else None,
        "standardize": bool(L1_STANDARDIZE) if USE_L1_REDUCED else None,
        "topk": L1_TOPK if USE_L1_REDUCED else None,
        "min_abs_coef": L1_MIN_ABS_COEF if USE_L1_REDUCED else None
    }
}

dropped_from_perm = (perm_stats["dropped_names"] if (USE_PERM_REDUCED and perm_stats) else [])
dropped_from_l1   = (l1_stats["dropped_names"] if (USE_L1_REDUCED and l1_stats) else [])
to_dump = {
    "all_features": all_feats_to_save,
    "used_features": used_feats_to_save,
    "dropped_features": dropped_from_perm if USE_PERM_REDUCED else dropped_from_l1,
    "reduction": meta_reduction
}
with open(feature_sets_path, "w", encoding="utf-8") as f:
    json.dump(to_dump, f, indent=2, ensure_ascii=False)
print("[FEATURES] Guardado:", feature_sets_path.name)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.664295
[100]	valid_0's average_precision: 0.664763
[150]	valid_0's average_precision: 0.66073
[200]	valid_0's average_precision: 0.653373
[250]	valid_0's average_precision: 0.645301
Early stopping, best iteration is:
[63]	valid_0's average_precision: 0.667534
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.662392
[100]	valid_0's average_precision: 0.671843
[150]	valid_0's average_precision: 0.670953
[200]	valid_0's average_precision: 0.671373
[250]	valid_0's average_precision: 0.673164
[300]	valid_0's average_precision: 0.671891
[350]	valid_0's average_precision: 0.671958
[400]	valid_0's average_precision: 0.668125
Early stopping, best iteration is:
[209]	valid_0's average_precision: 0.674161
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.704574
[100]	valid_0's average_precision: 0.710299
[

## 8 — Optimización (Optuna) y modelo TUNED

In [147]:
tuned_model = None
if DO_TUNE:
    import optuna
    from optuna.samplers import TPESampler

    N_TRIALS = 40
    STUDY_NAME = f"LGBM_{VIEW_TAG}_{BAL_TAG}_AP"
    SAMPLER = TPESampler(seed=RANDOM_STATE, multivariate=True, group=False)
    study = optuna.create_study(direction="maximize", study_name=STUDY_NAME, sampler=SAMPLER)

    def suggest_lgbm_params(trial):
        p = {}
        p["learning_rate"]     = trial.suggest_float("learning_rate", 0.005, 0.2, log=True)
        p["n_estimators"]      = trial.suggest_int("n_estimators", 800, 3000, step=50)
        p["num_leaves"]        = trial.suggest_int("num_leaves", 16, 256)
        p["max_depth"]         = trial.suggest_categorical("max_depth", [-1, 4, 6, 8, 10])
        p["min_child_samples"] = trial.suggest_int("min_child_samples", 10, 200)
        p["subsample"]         = trial.suggest_float("subsample", 0.6, 1.0)
        p["colsample_bytree"]  = trial.suggest_float("colsample_bytree", 0.6, 1.0)
        p["reg_alpha"]         = trial.suggest_float("reg_alpha", 1e-4, 10.0, log=True)
        p["reg_lambda"]        = trial.suggest_float("reg_lambda", 1e-4, 10.0, log=True)
        p["min_split_gain"]    = trial.suggest_float("min_split_gain", 0.0, 1.0)
        if USE_BALANCED_TRAIN:
            p["class_weight"] = None
        else:
            p["class_weight"] = trial.suggest_categorical("class_weight", [None, "balanced"])
        p["random_state"] = RANDOM_STATE
        p["n_jobs"] = -1
        p["metric"] = "average_precision"
        p["verbosity"] = -1
        return p

    def objective(trial):
        hp = suggest_lgbm_params(trial)
        hp = sanitize_params(hp, use_balanced_train=USE_BALANCED_TRAIN)

        mdl = LGBMClassifier(**hp)

        X_tr, y_tr = X_train_fit, y_train
        if USE_BALANCED_TRAIN:
            X_tr, y_tr = resample_smote_nc(X_tr, y_tr, feature_names_used, seed=RANDOM_STATE)

        fit_call = dict(
            eval_set=[(X_val_fit, y_val)],
            eval_metric="aucpr",
            **_fit_kwargs
        )
        if _LGBM_CB_OK:
            fit_call["callbacks"] = _callbacks

        mdl.fit(X_tr, y_tr, **fit_call)
        proba_val_t = pred_proba_best(mdl, X_val_fit)
        ap = average_precision_score(y_val, proba_val_t)
        trial.set_user_attr("best_iteration", getattr(mdl, "best_iteration_", None))
        return ap

    print(f"[OPTUNA] Iniciando '{STUDY_NAME}' con {N_TRIALS} pruebas…")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

    best = study.best_trial
    print(f"[OPTUNA] Mejor AP(val): {best.value:.6f}")
    print(f"[OPTUNA] Params:", best.params)
    print(f"[OPTUNA] best_iteration:", best.user_attrs.get("best_iteration"))

    best_params = dict(best.params)
    best_params.update({
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        "metric": "average_precision",
        "verbosity": -1
    })
    with open(BEST_HP_FILE, "w", encoding="utf-8") as f:
        json.dump(best_params, f, indent=2, ensure_ascii=False)
    print("[OPTUNA] Guardado BEST en:", BEST_HP_FILE.name)

    tuned_model = LGBMClassifier(**best_params)
    fit_call = dict(
        eval_set=[(X_val_fit, y_val)],
        eval_metric="aucpr",
        **_fit_kwargs
    )
    if _LGBM_CB_OK:
        fit_call["callbacks"] = _callbacks
    tuned_model.fit(X_train_fit, y_train, **fit_call)
    print("[OPTUNA] Reentreno final completado. best_iteration_ =", getattr(tuned_model, "best_iteration_", None))

[I 2025-12-09 14:56:10,353] A new study created in memory with name: LGBM_REDUCED_PERMcv_thr_SMOTENC_AP


[OPTUNA] Iniciando 'LGBM_REDUCED_PERMcv_thr_SMOTENC_AP' con 40 pruebas…
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.656887
[100]	valid_0's average_precision: 0.667936
[150]	valid_0's average_precision: 0.672786
[200]	valid_0's average_precision: 0.672304
[250]	valid_0's average_precision: 0.672506


[I 2025-12-09 14:56:11,609] Trial 0 finished with value: 0.67292280168241 and parameters: {'learning_rate': 0.019906996673933378, 'n_estimators': 2900, 'num_leaves': 192, 'max_depth': 10, 'min_child_samples': 124, 'subsample': 0.8832290311184181, 'colsample_bytree': 0.608233797718321, 'reg_alpha': 7.072114131472227, 'reg_lambda': 1.452824663751602, 'min_split_gain': 0.21233911067827616}. Best is trial 0 with value: 0.67292280168241.


[300]	valid_0's average_precision: 0.671219
Early stopping, best iteration is:
[147]	valid_0's average_precision: 0.672923
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.675225
[100]	valid_0's average_precision: 0.677455
[150]	valid_0's average_precision: 0.678756
[200]	valid_0's average_precision: 0.678112
[250]	valid_0's average_precision: 0.67847
[300]	valid_0's average_precision: 0.679874
[350]	valid_0's average_precision: 0.678628
[400]	valid_0's average_precision: 0.680164
[450]	valid_0's average_precision: 0.680363
[500]	valid_0's average_precision: 0.680069
[550]	valid_0's average_precision: 0.680227
[600]	valid_0's average_precision: 0.680658
[650]	valid_0's average_precision: 0.6804


[I 2025-12-09 14:56:15,073] Trial 1 finished with value: 0.6808300053865901 and parameters: {'learning_rate': 0.009778325945801386, 'n_estimators': 1200, 'num_leaves': 89, 'max_depth': 8, 'min_child_samples': 65, 'subsample': 0.7465447373174767, 'colsample_bytree': 0.7824279936868144, 'reg_alpha': 0.8431013932082461, 'reg_lambda': 0.0009962513222055108, 'min_split_gain': 0.5142344384136116}. Best is trial 1 with value: 0.6808300053865901.


Early stopping, best iteration is:
[478]	valid_0's average_precision: 0.68083
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.673708
[100]	valid_0's average_precision: 0.680307
[150]	valid_0's average_precision: 0.682469
[200]	valid_0's average_precision: 0.681519


[I 2025-12-09 14:56:16,214] Trial 2 finished with value: 0.683173642108792 and parameters: {'learning_rate': 0.04446862319918233, 'n_estimators': 900, 'num_leaves': 162, 'max_depth': 8, 'min_child_samples': 68, 'subsample': 0.6390688456025535, 'colsample_bytree': 0.8736932106048627, 'reg_alpha': 0.015876781526923997, 'reg_lambda': 0.0004075596440072873, 'min_split_gain': 0.4951769101112702}. Best is trial 2 with value: 0.683173642108792.


[250]	valid_0's average_precision: 0.681223
[300]	valid_0's average_precision: 0.681223
[350]	valid_0's average_precision: 0.681223
Early stopping, best iteration is:
[164]	valid_0's average_precision: 0.683174
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.61085
[100]	valid_0's average_precision: 0.622754
[150]	valid_0's average_precision: 0.62653
[200]	valid_0's average_precision: 0.633417
[250]	valid_0's average_precision: 0.638399
[300]	valid_0's average_precision: 0.64813
[350]	valid_0's average_precision: 0.654051
[400]	valid_0's average_precision: 0.656545
[450]	valid_0's average_precision: 0.660008
[500]	valid_0's average_precision: 0.663241
[550]	valid_0's average_precision: 0.665466
[600]	valid_0's average_precision: 0.666128
[650]	valid_0's average_precision: 0.666496
[700]	valid_0's average_precision: 0.665368
[750]	valid_0's average_precision: 0.665726
[800]	valid_0's average_precision: 0.666724
[850]	valid_0's average_pre

[I 2025-12-09 14:56:20,501] Trial 3 finished with value: 0.667347735091636 and parameters: {'learning_rate': 0.005676262589955587, 'n_estimators': 2800, 'num_leaves': 78, 'max_depth': -1, 'min_child_samples': 195, 'subsample': 0.9100531293444458, 'colsample_bytree': 0.9757995766256756, 'reg_alpha': 2.9794544625913595, 'reg_lambda': 0.09761125443110447, 'min_split_gain': 0.9218742350231168}. Best is trial 2 with value: 0.683173642108792.


[1050]	valid_0's average_precision: 0.666347
Early stopping, best iteration is:
[850]	valid_0's average_precision: 0.667348
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.665205
[100]	valid_0's average_precision: 0.665719
[150]	valid_0's average_precision: 0.669953
[200]	valid_0's average_precision: 0.672271
[250]	valid_0's average_precision: 0.675474
[300]	valid_0's average_precision: 0.678542
[350]	valid_0's average_precision: 0.678116
[400]	valid_0's average_precision: 0.679332
[450]	valid_0's average_precision: 0.67826
[500]	valid_0's average_precision: 0.678109


[I 2025-12-09 14:56:22,588] Trial 4 finished with value: 0.6799171266602125 and parameters: {'learning_rate': 0.006930112765148064, 'n_estimators': 1200, 'num_leaves': 26, 'max_depth': 8, 'min_child_samples': 63, 'subsample': 0.8170784332632994, 'colsample_bytree': 0.6563696899899051, 'reg_alpha': 1.025616274847307, 'reg_lambda': 0.0002359137306347715, 'min_split_gain': 0.9868869366005173}. Best is trial 2 with value: 0.683173642108792.


[550]	valid_0's average_precision: 0.677899
Early stopping, best iteration is:
[393]	valid_0's average_precision: 0.679917
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.676482
[100]	valid_0's average_precision: 0.683742
[150]	valid_0's average_precision: 0.687423
[200]	valid_0's average_precision: 0.689882
[250]	valid_0's average_precision: 0.689047
[300]	valid_0's average_precision: 0.687756


[I 2025-12-09 14:56:23,559] Trial 5 finished with value: 0.6904714789188336 and parameters: {'learning_rate': 0.08632815369661433, 'n_estimators': 1200, 'num_leaves': 17, 'max_depth': -1, 'min_child_samples': 78, 'subsample': 0.6463476238100518, 'colsample_bytree': 0.9452413703502374, 'reg_alpha': 0.13076473382928538, 'reg_lambda': 0.004513257622008946, 'min_split_gain': 0.06355835028602363}. Best is trial 5 with value: 0.6904714789188336.


[350]	valid_0's average_precision: 0.688653
Early stopping, best iteration is:
[182]	valid_0's average_precision: 0.690471
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.625699
[100]	valid_0's average_precision: 0.648269
[150]	valid_0's average_precision: 0.654779
[200]	valid_0's average_precision: 0.66095
[250]	valid_0's average_precision: 0.664281
[300]	valid_0's average_precision: 0.666456
[350]	valid_0's average_precision: 0.66886
[400]	valid_0's average_precision: 0.670462
[450]	valid_0's average_precision: 0.672259
[500]	valid_0's average_precision: 0.672983
[550]	valid_0's average_precision: 0.672646
[600]	valid_0's average_precision: 0.672264
[650]	valid_0's average_precision: 0.672779
Early stopping, best iteration is:
[487]	valid_0's average_precision: 0.673356


[I 2025-12-09 14:56:24,805] Trial 6 finished with value: 0.6733557027230687 and parameters: {'learning_rate': 0.015746438450976667, 'n_estimators': 1500, 'num_leaves': 191, 'max_depth': 4, 'min_child_samples': 155, 'subsample': 0.8245108790277985, 'colsample_bytree': 0.9083868719818244, 'reg_alpha': 0.02944272359149678, 'reg_lambda': 0.04108318894699928, 'min_split_gain': 0.42754101835854963}. Best is trial 5 with value: 0.6904714789188336.


Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.664983
[100]	valid_0's average_precision: 0.663792
[150]	valid_0's average_precision: 0.664998
[200]	valid_0's average_precision: 0.666104
[250]	valid_0's average_precision: 0.669053
[300]	valid_0's average_precision: 0.673252
[350]	valid_0's average_precision: 0.676014
[400]	valid_0's average_precision: 0.678789
[450]	valid_0's average_precision: 0.678601
[500]	valid_0's average_precision: 0.679331
[550]	valid_0's average_precision: 0.678606
[600]	valid_0's average_precision: 0.677882
[650]	valid_0's average_precision: 0.677854


[I 2025-12-09 14:56:27,186] Trial 7 finished with value: 0.6795486470506802 and parameters: {'learning_rate': 0.005491525066424382, 'n_estimators': 1000, 'num_leaves': 23, 'max_depth': 8, 'min_child_samples': 88, 'subsample': 0.9022204554172195, 'colsample_bytree': 0.6915192661966489, 'reg_alpha': 0.00024260488932164486, 'reg_lambda': 0.0028103296447636083, 'min_split_gain': 0.16122128725400442}. Best is trial 5 with value: 0.6904714789188336.


Early stopping, best iteration is:
[493]	valid_0's average_precision: 0.679549
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.670363
[100]	valid_0's average_precision: 0.665787
[150]	valid_0's average_precision: 0.663544


[I 2025-12-09 14:56:27,628] Trial 8 finished with value: 0.6782979946585769 and parameters: {'learning_rate': 0.15431261011638706, 'n_estimators': 2600, 'num_leaves': 168, 'max_depth': 8, 'min_child_samples': 164, 'subsample': 0.9584365199693973, 'colsample_bytree': 0.7272013899887455, 'reg_alpha': 0.000355025561231308, 'reg_lambda': 0.001379354235277248, 'min_split_gain': 0.4271077886262563}. Best is trial 5 with value: 0.6904714789188336.


[200]	valid_0's average_precision: 0.663511
Early stopping, best iteration is:
[24]	valid_0's average_precision: 0.678298
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.663234


[I 2025-12-09 14:56:27,964] Trial 9 finished with value: 0.6671084907144434 and parameters: {'learning_rate': 0.10220655100897388, 'n_estimators': 2700, 'num_leaves': 17, 'max_depth': -1, 'min_child_samples': 190, 'subsample': 0.7292811728083021, 'colsample_bytree': 0.8075162486973464, 'reg_alpha': 0.3274121520988885, 'reg_lambda': 0.0065788201191231774, 'min_split_gain': 0.9717820827209607}. Best is trial 5 with value: 0.6904714789188336.


[100]	valid_0's average_precision: 0.65856
[150]	valid_0's average_precision: 0.656302
[200]	valid_0's average_precision: 0.656355
Early stopping, best iteration is:
[32]	valid_0's average_precision: 0.667108
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.674485
[100]	valid_0's average_precision: 0.669392


[I 2025-12-09 14:56:28,649] Trial 10 finished with value: 0.6770409673699735 and parameters: {'learning_rate': 0.11160520780208899, 'n_estimators': 1200, 'num_leaves': 25, 'max_depth': -1, 'min_child_samples': 154, 'subsample': 0.6304655048035832, 'colsample_bytree': 0.9486757327801338, 'reg_alpha': 0.4625809500171002, 'reg_lambda': 0.02958571414754742, 'min_split_gain': 0.24741350827669434}. Best is trial 5 with value: 0.6904714789188336.


[150]	valid_0's average_precision: 0.664465
[200]	valid_0's average_precision: 0.660765
Early stopping, best iteration is:
[42]	valid_0's average_precision: 0.677041
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.674665
[100]	valid_0's average_precision: 0.68448
[150]	valid_0's average_precision: 0.685068


[I 2025-12-09 14:56:29,103] Trial 11 finished with value: 0.6868155542420753 and parameters: {'learning_rate': 0.09050232916287854, 'n_estimators': 1550, 'num_leaves': 45, 'max_depth': 4, 'min_child_samples': 32, 'subsample': 0.6084041964777769, 'colsample_bytree': 0.9370913448713828, 'reg_alpha': 0.02268543822568673, 'reg_lambda': 0.004661751603072236, 'min_split_gain': 0.12963345766078527}. Best is trial 5 with value: 0.6904714789188336.


[200]	valid_0's average_precision: 0.68145
[250]	valid_0's average_precision: 0.678437
Early stopping, best iteration is:
[86]	valid_0's average_precision: 0.686816
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.681716
[100]	valid_0's average_precision: 0.683359
[150]	valid_0's average_precision: 0.682161


[I 2025-12-09 14:56:29,548] Trial 12 finished with value: 0.6857966622054906 and parameters: {'learning_rate': 0.12639865324310517, 'n_estimators': 1150, 'num_leaves': 43, 'max_depth': 4, 'min_child_samples': 49, 'subsample': 0.7038072679919442, 'colsample_bytree': 0.9797447789375463, 'reg_alpha': 0.0698312618749519, 'reg_lambda': 0.001149422634021616, 'min_split_gain': 0.09326545536393184}. Best is trial 5 with value: 0.6904714789188336.


[200]	valid_0's average_precision: 0.684234
[250]	valid_0's average_precision: 0.678251
Early stopping, best iteration is:
[86]	valid_0's average_precision: 0.685797
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.659472
[100]	valid_0's average_precision: 0.68058
[150]	valid_0's average_precision: 0.686381
[200]	valid_0's average_precision: 0.685544
[250]	valid_0's average_precision: 0.685411
[300]	valid_0's average_precision: 0.686022


[I 2025-12-09 14:56:30,471] Trial 13 finished with value: 0.6871654919048742 and parameters: {'learning_rate': 0.04060393986072223, 'n_estimators': 1950, 'num_leaves': 24, 'max_depth': 6, 'min_child_samples': 32, 'subsample': 0.647909046328907, 'colsample_bytree': 0.8915277935520681, 'reg_alpha': 0.392721170230632, 'reg_lambda': 0.012181101841488252, 'min_split_gain': 0.12464108666693852}. Best is trial 5 with value: 0.6904714789188336.


Early stopping, best iteration is:
[144]	valid_0's average_precision: 0.687165
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.673699
[100]	valid_0's average_precision: 0.67615
[150]	valid_0's average_precision: 0.674994
[200]	valid_0's average_precision: 0.676257
[250]	valid_0's average_precision: 0.676517
[300]	valid_0's average_precision: 0.676491
[350]	valid_0's average_precision: 0.675421
[400]	valid_0's average_precision: 0.67414


[I 2025-12-09 14:56:34,600] Trial 14 finished with value: 0.6776653550556833 and parameters: {'learning_rate': 0.022270028961746353, 'n_estimators': 1000, 'num_leaves': 74, 'max_depth': -1, 'min_child_samples': 69, 'subsample': 0.619310761070061, 'colsample_bytree': 0.7883925500787337, 'reg_alpha': 0.5132368222274102, 'reg_lambda': 0.0002344190516496126, 'min_split_gain': 0.05019731447193783}. Best is trial 5 with value: 0.6904714789188336.


[450]	valid_0's average_precision: 0.674772
Early stopping, best iteration is:
[266]	valid_0's average_precision: 0.677665
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.664328
[100]	valid_0's average_precision: 0.666878
[150]	valid_0's average_precision: 0.667738
[200]	valid_0's average_precision: 0.670713
[250]	valid_0's average_precision: 0.668656
[300]	valid_0's average_precision: 0.670668
[350]	valid_0's average_precision: 0.671999
[400]	valid_0's average_precision: 0.675145
[450]	valid_0's average_precision: 0.677562
[500]	valid_0's average_precision: 0.679818
[550]	valid_0's average_precision: 0.681138
[600]	valid_0's average_precision: 0.682062
[650]	valid_0's average_precision: 0.683001
[700]	valid_0's average_precision: 0.684138
[750]	valid_0's average_precision: 0.68507
[800]	valid_0's average_precision: 0.685369
[850]	valid_0's average_precision: 0.686003
[900]	valid_0's average_precision: 0.686692
[950]	valid_0's average_p

[I 2025-12-09 14:56:38,705] Trial 15 finished with value: 0.6870217441971062 and parameters: {'learning_rate': 0.00860775338716246, 'n_estimators': 1700, 'num_leaves': 48, 'max_depth': 6, 'min_child_samples': 10, 'subsample': 0.7053303159597494, 'colsample_bytree': 0.8420647314193398, 'reg_alpha': 0.0920373895862211, 'reg_lambda': 0.14810462114524472, 'min_split_gain': 0.22262994600226338}. Best is trial 5 with value: 0.6904714789188336.


[1100]	valid_0's average_precision: 0.686224
Early stopping, best iteration is:
[906]	valid_0's average_precision: 0.687022
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.663291
[100]	valid_0's average_precision: 0.66921
[150]	valid_0's average_precision: 0.676035
[200]	valid_0's average_precision: 0.67928
[250]	valid_0's average_precision: 0.677498
[300]	valid_0's average_precision: 0.67644


[I 2025-12-09 14:56:39,875] Trial 16 finished with value: 0.6794108064468697 and parameters: {'learning_rate': 0.034903939407243484, 'n_estimators': 2100, 'num_leaves': 98, 'max_depth': 6, 'min_child_samples': 88, 'subsample': 0.8497699881688667, 'colsample_bytree': 0.8085674487212147, 'reg_alpha': 2.3232623729309125, 'reg_lambda': 0.0033555159173703285, 'min_split_gain': 0.09844044462378902}. Best is trial 5 with value: 0.6904714789188336.


[350]	valid_0's average_precision: 0.675074
[400]	valid_0's average_precision: 0.674268
Early stopping, best iteration is:
[210]	valid_0's average_precision: 0.679411
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.661597
[100]	valid_0's average_precision: 0.66815
[150]	valid_0's average_precision: 0.670419
[200]	valid_0's average_precision: 0.673779
[250]	valid_0's average_precision: 0.677757
[300]	valid_0's average_precision: 0.680064
[350]	valid_0's average_precision: 0.680482
[400]	valid_0's average_precision: 0.67851
[450]	valid_0's average_precision: 0.67875


[I 2025-12-09 14:56:42,917] Trial 17 finished with value: 0.681528378374409 and parameters: {'learning_rate': 0.021214812537338153, 'n_estimators': 850, 'num_leaves': 43, 'max_depth': -1, 'min_child_samples': 69, 'subsample': 0.7986595709599665, 'colsample_bytree': 0.9480772725922231, 'reg_alpha': 0.14295180538223412, 'reg_lambda': 0.23112454254783893, 'min_split_gain': 0.19297865577099715}. Best is trial 5 with value: 0.6904714789188336.


[500]	valid_0's average_precision: 0.678898
Early stopping, best iteration is:
[327]	valid_0's average_precision: 0.681528
Training until validation scores don't improve for 200 rounds


[I 2025-12-09 14:56:43,197] Trial 18 finished with value: 0.681881578453229 and parameters: {'learning_rate': 0.12029985030921832, 'n_estimators': 1950, 'num_leaves': 67, 'max_depth': 6, 'min_child_samples': 35, 'subsample': 0.6758265032585943, 'colsample_bytree': 0.8016858795613105, 'reg_alpha': 6.166588044154373, 'reg_lambda': 0.00012934972189384303, 'min_split_gain': 0.4207158737966231}. Best is trial 5 with value: 0.6904714789188336.


[50]	valid_0's average_precision: 0.680041
[100]	valid_0's average_precision: 0.68012
[150]	valid_0's average_precision: 0.68012
[200]	valid_0's average_precision: 0.68012
[250]	valid_0's average_precision: 0.68012
Early stopping, best iteration is:
[62]	valid_0's average_precision: 0.681882
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.667971
[100]	valid_0's average_precision: 0.681096
[150]	valid_0's average_precision: 0.680174
[200]	valid_0's average_precision: 0.679383


[I 2025-12-09 14:56:44,100] Trial 19 finished with value: 0.68234787952074 and parameters: {'learning_rate': 0.051172893246778116, 'n_estimators': 2300, 'num_leaves': 97, 'max_depth': 6, 'min_child_samples': 60, 'subsample': 0.6008372141641575, 'colsample_bytree': 0.8858914607898476, 'reg_alpha': 0.4231191251291723, 'reg_lambda': 0.06192882085684199, 'min_split_gain': 0.20016376757462126}. Best is trial 5 with value: 0.6904714789188336.


[250]	valid_0's average_precision: 0.677769
[300]	valid_0's average_precision: 0.676193
Early stopping, best iteration is:
[104]	valid_0's average_precision: 0.682348
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.660443
[100]	valid_0's average_precision: 0.672247
[150]	valid_0's average_precision: 0.677286
[200]	valid_0's average_precision: 0.675538
[250]	valid_0's average_precision: 0.67348
[300]	valid_0's average_precision: 0.672729


[I 2025-12-09 14:56:44,648] Trial 20 finished with value: 0.6780026979158988 and parameters: {'learning_rate': 0.051222324105519655, 'n_estimators': 1900, 'num_leaves': 55, 'max_depth': 4, 'min_child_samples': 184, 'subsample': 0.6580065273759165, 'colsample_bytree': 0.9064648559026968, 'reg_alpha': 0.004209073282554238, 'reg_lambda': 0.0009361122049775561, 'min_split_gain': 0.056085378296413024}. Best is trial 5 with value: 0.6904714789188336.


Early stopping, best iteration is:
[140]	valid_0's average_precision: 0.678003
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.676536
[100]	valid_0's average_precision: 0.677615
[150]	valid_0's average_precision: 0.679799
[200]	valid_0's average_precision: 0.682844
[250]	valid_0's average_precision: 0.684572
[300]	valid_0's average_precision: 0.685379
[350]	valid_0's average_precision: 0.684153
[400]	valid_0's average_precision: 0.686458
[450]	valid_0's average_precision: 0.686282
[500]	valid_0's average_precision: 0.685353
[550]	valid_0's average_precision: 0.685646
[600]	valid_0's average_precision: 0.685806


[I 2025-12-09 14:56:48,558] Trial 21 finished with value: 0.6871071052038525 and parameters: {'learning_rate': 0.011463055559644568, 'n_estimators': 1850, 'num_leaves': 89, 'max_depth': 8, 'min_child_samples': 47, 'subsample': 0.7550980573641108, 'colsample_bytree': 0.7722792903785042, 'reg_alpha': 0.02010817454257983, 'reg_lambda': 3.734564380811719, 'min_split_gain': 0.37977266952150446}. Best is trial 5 with value: 0.6904714789188336.


Early stopping, best iteration is:
[419]	valid_0's average_precision: 0.687107
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.6723
[100]	valid_0's average_precision: 0.680152
[150]	valid_0's average_precision: 0.681951
[200]	valid_0's average_precision: 0.683086
[250]	valid_0's average_precision: 0.682748
[300]	valid_0's average_precision: 0.681589
[350]	valid_0's average_precision: 0.680818


[I 2025-12-09 14:56:50,220] Trial 22 finished with value: 0.6833174472713385 and parameters: {'learning_rate': 0.0280438157089555, 'n_estimators': 1900, 'num_leaves': 90, 'max_depth': 8, 'min_child_samples': 75, 'subsample': 0.7200120731725965, 'colsample_bytree': 0.6985699842251951, 'reg_alpha': 0.0028685569128289816, 'reg_lambda': 6.348448763981449, 'min_split_gain': 0.4610759796138766}. Best is trial 5 with value: 0.6904714789188336.


[400]	valid_0's average_precision: 0.679555
Early stopping, best iteration is:
[201]	valid_0's average_precision: 0.683317
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.675905
[100]	valid_0's average_precision: 0.669436
[150]	valid_0's average_precision: 0.6666


[I 2025-12-09 14:56:51,770] Trial 23 finished with value: 0.6786881964092001 and parameters: {'learning_rate': 0.14002257808531088, 'n_estimators': 1250, 'num_leaves': 54, 'max_depth': -1, 'min_child_samples': 96, 'subsample': 0.7248132483478663, 'colsample_bytree': 0.8232858763255815, 'reg_alpha': 0.006120657016529097, 'reg_lambda': 0.00991441250639292, 'min_split_gain': 0.018519850086984244}. Best is trial 5 with value: 0.6904714789188336.


[200]	valid_0's average_precision: 0.65738
Early stopping, best iteration is:
[18]	valid_0's average_precision: 0.678688
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.674446
[100]	valid_0's average_precision: 0.677647
[150]	valid_0's average_precision: 0.678953
[200]	valid_0's average_precision: 0.679719
[250]	valid_0's average_precision: 0.682062
[300]	valid_0's average_precision: 0.68331
[350]	valid_0's average_precision: 0.681851
[400]	valid_0's average_precision: 0.682113
[450]	valid_0's average_precision: 0.684738
[500]	valid_0's average_precision: 0.684495
[550]	valid_0's average_precision: 0.684286
[600]	valid_0's average_precision: 0.684416


[I 2025-12-09 14:56:55,660] Trial 24 finished with value: 0.6853214165205721 and parameters: {'learning_rate': 0.010054497706045348, 'n_estimators': 1400, 'num_leaves': 112, 'max_depth': 8, 'min_child_samples': 58, 'subsample': 0.8123313415794513, 'colsample_bytree': 0.7542533070818238, 'reg_alpha': 0.05846073264902124, 'reg_lambda': 3.420717120918948, 'min_split_gain': 0.31296306875021185}. Best is trial 5 with value: 0.6904714789188336.


[650]	valid_0's average_precision: 0.684837
Early stopping, best iteration is:
[457]	valid_0's average_precision: 0.685321
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.636553
[100]	valid_0's average_precision: 0.652561
[150]	valid_0's average_precision: 0.659855
[200]	valid_0's average_precision: 0.663138
[250]	valid_0's average_precision: 0.667681
[300]	valid_0's average_precision: 0.670872
[350]	valid_0's average_precision: 0.672378
[400]	valid_0's average_precision: 0.671974
[450]	valid_0's average_precision: 0.671819
[500]	valid_0's average_precision: 0.671296


[I 2025-12-09 14:56:57,108] Trial 25 finished with value: 0.6729096684032134 and parameters: {'learning_rate': 0.017132834198442293, 'n_estimators': 1950, 'num_leaves': 16, 'max_depth': -1, 'min_child_samples': 53, 'subsample': 0.7339928673398929, 'colsample_bytree': 0.8790042126144365, 'reg_alpha': 8.710305417916404, 'reg_lambda': 0.01002916496135174, 'min_split_gain': 0.055745487516280806}. Best is trial 5 with value: 0.6904714789188336.


[550]	valid_0's average_precision: 0.67124
Early stopping, best iteration is:
[353]	valid_0's average_precision: 0.67291
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.65767
[100]	valid_0's average_precision: 0.671936
[150]	valid_0's average_precision: 0.674034
[200]	valid_0's average_precision: 0.6763
[250]	valid_0's average_precision: 0.678576
[300]	valid_0's average_precision: 0.680418
[350]	valid_0's average_precision: 0.681808
[400]	valid_0's average_precision: 0.682803
[450]	valid_0's average_precision: 0.684623
[500]	valid_0's average_precision: 0.685757
[550]	valid_0's average_precision: 0.686702
[600]	valid_0's average_precision: 0.687296
[650]	valid_0's average_precision: 0.687968
[700]	valid_0's average_precision: 0.688693
[750]	valid_0's average_precision: 0.688225
[800]	valid_0's average_precision: 0.688175


[I 2025-12-09 14:56:58,619] Trial 26 finished with value: 0.6887178311447969 and parameters: {'learning_rate': 0.02062470595325994, 'n_estimators': 1850, 'num_leaves': 52, 'max_depth': 4, 'min_child_samples': 12, 'subsample': 0.8616163296796211, 'colsample_bytree': 0.6859256826767183, 'reg_alpha': 0.03636247335636547, 'reg_lambda': 0.2507591813156733, 'min_split_gain': 0.5409574372796445}. Best is trial 5 with value: 0.6904714789188336.


[850]	valid_0's average_precision: 0.688106
Early stopping, best iteration is:
[698]	valid_0's average_precision: 0.688718
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.668373
[100]	valid_0's average_precision: 0.683526
[150]	valid_0's average_precision: 0.682173
[200]	valid_0's average_precision: 0.681148


[I 2025-12-09 14:56:59,091] Trial 27 finished with value: 0.683734477735256 and parameters: {'learning_rate': 0.0631615983736575, 'n_estimators': 1800, 'num_leaves': 21, 'max_depth': 4, 'min_child_samples': 55, 'subsample': 0.8362827138972705, 'colsample_bytree': 0.7303472759388985, 'reg_alpha': 0.007809199765079714, 'reg_lambda': 0.03849998347802852, 'min_split_gain': 0.6126079648736109}. Best is trial 5 with value: 0.6904714789188336.


[250]	valid_0's average_precision: 0.681331
Early stopping, best iteration is:
[97]	valid_0's average_precision: 0.683734
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.651378
[100]	valid_0's average_precision: 0.664138
[150]	valid_0's average_precision: 0.668288
[200]	valid_0's average_precision: 0.670575
[250]	valid_0's average_precision: 0.673501
[300]	valid_0's average_precision: 0.675407
[350]	valid_0's average_precision: 0.675458
[400]	valid_0's average_precision: 0.675797
[450]	valid_0's average_precision: 0.676184
[500]	valid_0's average_precision: 0.677603
[550]	valid_0's average_precision: 0.679863
[600]	valid_0's average_precision: 0.680607
[650]	valid_0's average_precision: 0.679498
[700]	valid_0's average_precision: 0.68018
[750]	valid_0's average_precision: 0.680653
[800]	valid_0's average_precision: 0.681371
[850]	valid_0's average_precision: 0.682016
[900]	valid_0's average_precision: 0.682589
[950]	valid_0's average_pr

[I 2025-12-09 14:57:01,356] Trial 28 finished with value: 0.685853878812704 and parameters: {'learning_rate': 0.010577902033443165, 'n_estimators': 1600, 'num_leaves': 92, 'max_depth': 4, 'min_child_samples': 15, 'subsample': 0.8863756142343089, 'colsample_bytree': 0.6191528862688613, 'reg_alpha': 1.1745348625378131, 'reg_lambda': 0.14324051215307942, 'min_split_gain': 0.3980903915179079}. Best is trial 5 with value: 0.6904714789188336.


[1300]	valid_0's average_precision: 0.684916
[1350]	valid_0's average_precision: 0.685282
Early stopping, best iteration is:
[1184]	valid_0's average_precision: 0.685854
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.653197
[100]	valid_0's average_precision: 0.668009
[150]	valid_0's average_precision: 0.674488
[200]	valid_0's average_precision: 0.675985
[250]	valid_0's average_precision: 0.677878
[300]	valid_0's average_precision: 0.678473
[350]	valid_0's average_precision: 0.68055
[400]	valid_0's average_precision: 0.681827
[450]	valid_0's average_precision: 0.682807
[500]	valid_0's average_precision: 0.683654
[550]	valid_0's average_precision: 0.685039
[600]	valid_0's average_precision: 0.685616
[650]	valid_0's average_precision: 0.686519
[700]	valid_0's average_precision: 0.687154
[750]	valid_0's average_precision: 0.687306
[800]	valid_0's average_precision: 0.686739
[850]	valid_0's average_precision: 0.685825
[900]	valid_0's averag

[I 2025-12-09 14:57:02,934] Trial 29 finished with value: 0.6873290974595205 and parameters: {'learning_rate': 0.015028662132076508, 'n_estimators': 2400, 'num_leaves': 47, 'max_depth': 4, 'min_child_samples': 27, 'subsample': 0.8163550021736568, 'colsample_bytree': 0.6202105548363357, 'reg_alpha': 0.020312274108370075, 'reg_lambda': 0.1691046474861341, 'min_split_gain': 0.796435542230459}. Best is trial 5 with value: 0.6904714789188336.


Early stopping, best iteration is:
[744]	valid_0's average_precision: 0.687329
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.677726
[100]	valid_0's average_precision: 0.679232
[150]	valid_0's average_precision: 0.679735
[200]	valid_0's average_precision: 0.68304
[250]	valid_0's average_precision: 0.684009
[300]	valid_0's average_precision: 0.685028
[350]	valid_0's average_precision: 0.684694
[400]	valid_0's average_precision: 0.685438
[450]	valid_0's average_precision: 0.685777
[500]	valid_0's average_precision: 0.686095
[550]	valid_0's average_precision: 0.68591
[600]	valid_0's average_precision: 0.685848
[650]	valid_0's average_precision: 0.686035
[700]	valid_0's average_precision: 0.685433


[I 2025-12-09 14:57:07,385] Trial 30 finished with value: 0.6864013113693422 and parameters: {'learning_rate': 0.009769209061363835, 'n_estimators': 1700, 'num_leaves': 53, 'max_depth': 10, 'min_child_samples': 44, 'subsample': 0.7725274767088244, 'colsample_bytree': 0.6723286322565598, 'reg_alpha': 0.0060841209017200295, 'reg_lambda': 0.019716885563972712, 'min_split_gain': 0.7687196131060924}. Best is trial 5 with value: 0.6904714789188336.


[750]	valid_0's average_precision: 0.684669
[800]	valid_0's average_precision: 0.68438
Early stopping, best iteration is:
[626]	valid_0's average_precision: 0.686401
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.676735
[100]	valid_0's average_precision: 0.684227
[150]	valid_0's average_precision: 0.685508
[200]	valid_0's average_precision: 0.683597
[250]	valid_0's average_precision: 0.681773


[I 2025-12-09 14:57:08,966] Trial 31 finished with value: 0.6855075666385929 and parameters: {'learning_rate': 0.03016655514313701, 'n_estimators': 2300, 'num_leaves': 53, 'max_depth': 10, 'min_child_samples': 46, 'subsample': 0.9138511189744357, 'colsample_bytree': 0.6353328093940386, 'reg_alpha': 0.057956215898599756, 'reg_lambda': 2.164270187217686, 'min_split_gain': 0.6871484938300318}. Best is trial 5 with value: 0.6904714789188336.


[300]	valid_0's average_precision: 0.680582
[350]	valid_0's average_precision: 0.680915
Early stopping, best iteration is:
[150]	valid_0's average_precision: 0.685508
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.653493
[100]	valid_0's average_precision: 0.668888
[150]	valid_0's average_precision: 0.674373
[200]	valid_0's average_precision: 0.675247
[250]	valid_0's average_precision: 0.678002
[300]	valid_0's average_precision: 0.678455
[350]	valid_0's average_precision: 0.680432
[400]	valid_0's average_precision: 0.681824
[450]	valid_0's average_precision: 0.681764
[500]	valid_0's average_precision: 0.682844
[550]	valid_0's average_precision: 0.68337
[600]	valid_0's average_precision: 0.68381
[650]	valid_0's average_precision: 0.685365
[700]	valid_0's average_precision: 0.686313
[750]	valid_0's average_precision: 0.686247
[800]	valid_0's average_precision: 0.686002
[850]	valid_0's average_precision: 0.685465


[I 2025-12-09 14:57:10,494] Trial 32 finished with value: 0.686429651811801 and parameters: {'learning_rate': 0.014793852809524569, 'n_estimators': 2800, 'num_leaves': 17, 'max_depth': 4, 'min_child_samples': 27, 'subsample': 0.8145161246224131, 'colsample_bytree': 0.623605243178975, 'reg_alpha': 0.0162363379975884, 'reg_lambda': 0.01516346121903361, 'min_split_gain': 0.41278690424420433}. Best is trial 5 with value: 0.6904714789188336.


[900]	valid_0's average_precision: 0.685006
Early stopping, best iteration is:
[707]	valid_0's average_precision: 0.68643
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.655865
[100]	valid_0's average_precision: 0.670231
[150]	valid_0's average_precision: 0.673007
[200]	valid_0's average_precision: 0.677284
[250]	valid_0's average_precision: 0.679851
[300]	valid_0's average_precision: 0.680987
[350]	valid_0's average_precision: 0.68288
[400]	valid_0's average_precision: 0.683177
[450]	valid_0's average_precision: 0.684715
[500]	valid_0's average_precision: 0.685702
[550]	valid_0's average_precision: 0.685262
[600]	valid_0's average_precision: 0.687052
[650]	valid_0's average_precision: 0.686878


[I 2025-12-09 14:57:11,713] Trial 33 finished with value: 0.6873682993185994 and parameters: {'learning_rate': 0.023876388220944955, 'n_estimators': 1500, 'num_leaves': 33, 'max_depth': 4, 'min_child_samples': 34, 'subsample': 0.885582727127363, 'colsample_bytree': 0.698243466950404, 'reg_alpha': 0.01630722628675901, 'reg_lambda': 1.9060405396035807, 'min_split_gain': 0.3944019085571256}. Best is trial 5 with value: 0.6904714789188336.


[700]	valid_0's average_precision: 0.68665
[750]	valid_0's average_precision: 0.686306
[800]	valid_0's average_precision: 0.686297
Early stopping, best iteration is:
[621]	valid_0's average_precision: 0.687368
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.665237
[100]	valid_0's average_precision: 0.679832
[150]	valid_0's average_precision: 0.685415
[200]	valid_0's average_precision: 0.686794
[250]	valid_0's average_precision: 0.68802
[300]	valid_0's average_precision: 0.687259
[350]	valid_0's average_precision: 0.68628
[400]	valid_0's average_precision: 0.684071


[I 2025-12-09 14:57:12,411] Trial 34 finished with value: 0.6885376582599366 and parameters: {'learning_rate': 0.06792104790774255, 'n_estimators': 800, 'num_leaves': 87, 'max_depth': 4, 'min_child_samples': 24, 'subsample': 0.8943883316782615, 'colsample_bytree': 0.6750799431658977, 'reg_alpha': 0.01856339515158022, 'reg_lambda': 2.556713874987819, 'min_split_gain': 0.1984483140959624}. Best is trial 5 with value: 0.6904714789188336.


[450]	valid_0's average_precision: 0.683467
Early stopping, best iteration is:
[271]	valid_0's average_precision: 0.688538
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.663686
[100]	valid_0's average_precision: 0.6789
[150]	valid_0's average_precision: 0.685202
[200]	valid_0's average_precision: 0.687644
[250]	valid_0's average_precision: 0.690313
[300]	valid_0's average_precision: 0.68799
[350]	valid_0's average_precision: 0.688029


[I 2025-12-09 14:57:13,126] Trial 35 finished with value: 0.6905462114954057 and parameters: {'learning_rate': 0.047200189967689266, 'n_estimators': 1050, 'num_leaves': 32, 'max_depth': 4, 'min_child_samples': 42, 'subsample': 0.9739732874700401, 'colsample_bytree': 0.6570583808610804, 'reg_alpha': 0.01562190879871248, 'reg_lambda': 0.317921120943722, 'min_split_gain': 0.11860590766767039}. Best is trial 35 with value: 0.6905462114954057.


[400]	valid_0's average_precision: 0.687788
Early stopping, best iteration is:
[247]	valid_0's average_precision: 0.690546
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.688533
[100]	valid_0's average_precision: 0.686862


[I 2025-12-09 14:57:14,189] Trial 36 finished with value: 0.6890677485047363 and parameters: {'learning_rate': 0.111017381214976, 'n_estimators': 950, 'num_leaves': 64, 'max_depth': 10, 'min_child_samples': 27, 'subsample': 0.960053147480697, 'colsample_bytree': 0.6700803826006128, 'reg_alpha': 0.4963294550030814, 'reg_lambda': 2.3248674522823074, 'min_split_gain': 0.08881276515762723}. Best is trial 35 with value: 0.6905462114954057.


[150]	valid_0's average_precision: 0.681565
[200]	valid_0's average_precision: 0.680791
[250]	valid_0's average_precision: 0.680791
Early stopping, best iteration is:
[52]	valid_0's average_precision: 0.689068
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.687759
[100]	valid_0's average_precision: 0.676841


[I 2025-12-09 14:57:14,831] Trial 37 finished with value: 0.6882605195172041 and parameters: {'learning_rate': 0.17809242979956885, 'n_estimators': 1300, 'num_leaves': 26, 'max_depth': 10, 'min_child_samples': 17, 'subsample': 0.85158425236166, 'colsample_bytree': 0.7433235351661844, 'reg_alpha': 0.8659378348580851, 'reg_lambda': 1.4443444149162987, 'min_split_gain': 0.06317142414896001}. Best is trial 35 with value: 0.6905462114954057.


[150]	valid_0's average_precision: 0.677935
[200]	valid_0's average_precision: 0.677935
[250]	valid_0's average_precision: 0.677935
Early stopping, best iteration is:
[53]	valid_0's average_precision: 0.688261
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.662379
[100]	valid_0's average_precision: 0.671278
[150]	valid_0's average_precision: 0.676481
[200]	valid_0's average_precision: 0.68119
[250]	valid_0's average_precision: 0.682606
[300]	valid_0's average_precision: 0.68292
[350]	valid_0's average_precision: 0.68242


[I 2025-12-09 14:57:15,611] Trial 38 finished with value: 0.6834245133300783 and parameters: {'learning_rate': 0.03834241296348381, 'n_estimators': 1050, 'num_leaves': 66, 'max_depth': 4, 'min_child_samples': 77, 'subsample': 0.9979753462802349, 'colsample_bytree': 0.6136270648023046, 'reg_alpha': 0.04071040859028056, 'reg_lambda': 0.05532048996280229, 'min_split_gain': 0.04401054225337449}. Best is trial 35 with value: 0.6905462114954057.


[400]	valid_0's average_precision: 0.682222
[450]	valid_0's average_precision: 0.682729
Early stopping, best iteration is:
[261]	valid_0's average_precision: 0.683425
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.682444
[100]	valid_0's average_precision: 0.678297


[I 2025-12-09 14:57:16,598] Trial 39 finished with value: 0.6831469268092395 and parameters: {'learning_rate': 0.17685099631350062, 'n_estimators': 800, 'num_leaves': 67, 'max_depth': 10, 'min_child_samples': 18, 'subsample': 0.9601984378067066, 'colsample_bytree': 0.6110660035705456, 'reg_alpha': 0.0382795742020348, 'reg_lambda': 7.488551007865272, 'min_split_gain': 0.1100287678561767}. Best is trial 35 with value: 0.6905462114954057.


[150]	valid_0's average_precision: 0.680819
[200]	valid_0's average_precision: 0.679108
[250]	valid_0's average_precision: 0.679108
Early stopping, best iteration is:
[52]	valid_0's average_precision: 0.683147
[OPTUNA] Mejor AP(val): 0.690546
[OPTUNA] Params: {'learning_rate': 0.047200189967689266, 'n_estimators': 1050, 'num_leaves': 32, 'max_depth': 4, 'min_child_samples': 42, 'subsample': 0.9739732874700401, 'colsample_bytree': 0.6570583808610804, 'reg_alpha': 0.01562190879871248, 'reg_lambda': 0.317921120943722, 'min_split_gain': 0.11860590766767039}
[OPTUNA] best_iteration: 247
[OPTUNA] Guardado BEST en: BEST_LGBM_REDUCED_PERMcv_thr_SMOTENC.json
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.672273
[100]	valid_0's average_precision: 0.681697
[150]	valid_0's average_precision: 0.682111
[200]	valid_0's average_precision: 0.679085
[250]	valid_0's average_precision: 0.677701
[300]	valid_0's average_precision: 0.677549
Early stopping, b

## 9 — Cross-Validation (OOF) para baseline/tuned

In [148]:
def run_oof_cv(model_params, X, y, k_folds=CV_FOLDS, seed=RANDOM_STATE, exp_suffix="BASELINE"):

    X_arr = np.asarray(X)
    y_arr = np.asarray(y).ravel()
    if X_arr.shape[0] != y_arr.shape[0]:
        raise ValueError(f"X y y deben tener el mismo número de filas; X={X_arr.shape}, y={y_arr.shape}")

    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    oof_proba = np.zeros(y_arr.shape[0], dtype=float)
    fold_rows = []

    base_params = dict(model_params)
    base_params.pop("verbose", None)
    base_params.setdefault("verbosity", -1)
    base_params.setdefault("metric", "average_precision")

    _fit_kwargs_local = {}
    _callbacks_local = []
    if _LGBM_CB_OK:
        _callbacks_local = [early_stopping(stopping_rounds=200), log_evaluation(period=50)]
    else:
        _fit_kwargs_local["early_stopping_rounds"] = 200

    for f, (tr_idx, va_idx) in enumerate(skf.split(X_arr, y_arr), 1):
        X_tr, X_va = X_arr[tr_idx], X_arr[va_idx]
        y_tr, y_va = y_arr[tr_idx], y_arr[va_idx]

        # ← aquí estaba el error: usar 'and' en lugar de '&&'
        if USE_BALANCED_TRAIN and BALANCE_IN_CV:
            X_tr, y_tr = SMOTE(random_state=RANDOM_STATE).fit_resample(X_tr, y_tr)

        mdl = LGBMClassifier(**base_params)
        _fit_call = dict(
            eval_set=[(X_va, y_va)],
            eval_metric="aucpr",
            **_fit_kwargs_local
        )
        if _LGBM_CB_OK:
            _fit_call["callbacks"] = _callbacks_local

        mdl.fit(X_tr, y_tr, **_fit_call)

        proba_va = pred_proba_best(mdl, X_va)
        oof_proba[va_idx] = proba_va

        fold_rows.append({
            "fold": f,
            "pr_auc": average_precision_score(y_va, proba_va),
            "roc_auc": roc_auc_score(y_va, proba_va)
        })

    # Métricas OOF
    oof_pr  = average_precision_score(y_arr, oof_proba)
    oof_roc = roc_auc_score(y_arr, oof_proba)
    thr_oof, _ = find_best_threshold(y_arr, oof_proba, metric="f1")
    y_oof_pred = (oof_proba >= thr_oof).astype(int)
    oof_f1  = f1_score(y_arr, y_oof_pred, zero_division=0)
    oof_rec = recall_score(y_arr, y_oof_pred, zero_division=0)
    oof_bal = balanced_accuracy_score(y_arr, y_oof_pred)

    # Persistencia
    cv_tag = f"{EXP_NAME}_{exp_suffix}_CV{CV_FOLDS}"
    cv_csv = OUT_RESULTS / f"cv_summary_{cv_tag}.csv"
    folds_df = pd.DataFrame(fold_rows)
    agg_row = pd.DataFrame([{
        "fold": "OOF", "pr_auc": oof_pr, "roc_auc": oof_roc,
        "thr": float(thr_oof), "f1": oof_f1, "recall": oof_rec, "bal_acc": oof_bal
    }])
    cv_df = pd.concat([folds_df, agg_row], ignore_index=True)
    cv_df.to_csv(cv_csv, index=False)

    oof_path = OUT_PREDS / f"oof_{cv_tag}.parquet"
    pd.DataFrame({"oof_proba": oof_proba, "y_true": y_arr}).to_parquet(oof_path, index=False)

    print(f"[CV-{exp_suffix}] Guardados: {cv_csv.name} | {oof_path.name}")
    return {
        "oof_pr_auc": oof_pr,
        "oof_roc_auc": oof_roc,
        "thr": float(thr_oof),
        "oof_f1": oof_f1,
        "oof_recall": oof_rec,
        "oof_bal_acc": oof_bal
    }

## 10 — Evaluación en test + guardados

In [149]:
def plot_pr_curve(y_true, y_proba, title, out_path):
    prec, rec, _ = precision_recall_curve(y_true, y_proba)
    ap = average_precision_score(y_true, y_proba)
    plt.figure(figsize=(6,5))
    plt.step(rec, prec, where='post')
    plt.xlabel('Recall'); plt.ylabel('Precision')
    plt.title(f'{title} (AP={ap:.4f})')
    plt.grid(True, linestyle='--', alpha=.3)
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def plot_roc_curve(y_true, y_proba, title, out_path):
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    auc = roc_auc_score(y_true, y_proba)
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, lw=2); plt.plot([0,1],[0,1], 'k--', lw=1)
    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
    plt.title(f'{title} (AUC={auc:.4f})')
    plt.grid(True, linestyle='--', alpha=.3)
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def plot_confusion(y_true, y_pred, title, out_path, normalize=False):
    norm = 'true' if normalize else None
    cm = confusion_matrix(y_true, y_pred, normalize=norm)
    plt.figure(figsize=(5,4))
    im = plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title(title); plt.colorbar(im, fraction=0.046, pad=0.04)
    ticks = np.arange(2)
    plt.xticks(ticks, ['0','1']); plt.yticks(ticks, ['0','1'])
    thresh = cm.max()/2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            txt = f'{cm[i,j]:.2f}' if normalize else str(cm[i,j])
            plt.text(j, i, txt, ha='center', va='center',
                     color='white' if cm[i,j] > thresh else 'black')
    plt.ylabel('True label'); plt.xlabel('Predicted label')
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

base = EXP_NAME

# === BASELINE ===
proba_test = pred_proba_best(model, X_test_fit)

cv_base = run_oof_cv(model.get_params(), X_train_fit, y_train, exp_suffix="BASELINE") if DO_CV_BASELINE else {"thr": 0.5}
thr_oof = float(cv_base["thr"])
thr_used = thr_oof

y_pred_test = (proba_test >= thr_used).astype(int)
test_metrics = compute_all_metrics(y_test, proba_test, thr_used)

# Guardar HP baseline
params_seed_path = OUT_PARAMS / f"{base}_BASE_seed_params.json"
with open(params_seed_path, "w", encoding="utf-8") as f:
    json.dump(seed_params, f, indent=2, ensure_ascii=False)

params_fitted_path = OUT_PARAMS / f"{base}_BASE_fitted_params.json"
with open(params_fitted_path, "w", encoding="utf-8") as f:
    json.dump(model.get_params(), f, indent=2, ensure_ascii=False)

# Figuras baseline
plot_pr_curve(y_val,  proba_val,  f"{base} — PR (val)",  OUT_FIGS / f"{base}_pr_val.png")
plot_pr_curve(y_test, proba_test, f"{base} — PR (test)", OUT_FIGS / f"{base}_pr_test.png")
plot_roc_curve(y_val,  proba_val,  f"{base} — ROC (val)",  OUT_FIGS / f"{base}_roc_val.png")
plot_roc_curve(y_test, proba_test, f"{base} — ROC (test)", OUT_FIGS / f"{base}_roc_test.png")
plot_confusion(y_test, y_pred_test,
               f"{base} — Confusion (test @thr_used={thr_used:.3f})",
               OUT_FIGS / f"{base}_cm_test.png")

# Importancias baseline
try:
    imp_gain = model.booster_.feature_importance(importance_type="gain")
except Exception:
    imp_gain = model.feature_importances_
imp_df = (
    pd.DataFrame({
        "feature": feature_names_used[:len(imp_gain)],
        "importance_gain": imp_gain
    }).sort_values("importance_gain", ascending=False)
)
imp_path = OUT_RESULTS / f"{base}_feature_importances.csv"
imp_df.to_csv(imp_path, index=False)

# Preds test baseline
preds_path = OUT_PREDS / f"preds_test_{base}.parquet"
pd.DataFrame({"proba": proba_test, "y_true": y_test}).to_parquet(preds_path, index=False)

# Registro de resultados baseline
row_base = {
    "model": base,
    "thr_val": thr_val,
    "thr_oof": thr_oof,
    "thr_used": thr_used,
    "val_pr_auc": val_metrics["pr_auc"],
    "val_roc_auc": val_metrics["roc_auc"],
    "val_precision": val_metrics["precision"],
    "val_f1": val_metrics["f1"],
    "val_recall": val_metrics["recall"],
    "val_bal_acc": val_metrics["bal_acc"],
    "test_pr_auc": test_metrics["pr_auc"],
    "test_roc_auc": test_metrics["roc_auc"],
    "test_precision": test_metrics["precision"],
    "test_f1": test_metrics["f1"],
    "test_recall": test_metrics["recall"],
    "test_bal_acc": test_metrics["bal_acc"],
    "best_iteration": best_iter
}
res_csv = OUT_RESULTS / "baselines.csv"
pd.DataFrame([row_base]).to_csv(
    res_csv,
    mode=("a" if res_csv.exists() else "w"),
    index=False,
    header=not res_csv.exists()
)

print("[OK][BASE] Guardados:",
      "\n  - Seed HPs   :", params_seed_path.name,
      "\n  - Fitted HPs :", params_fitted_path.name,
      "\n  - Importancias:", imp_path.name,
      "\n  - Preds test  :", preds_path.name,
      "\n  - Baselines   :", res_csv.name)

# === TUNED ===
if DO_TUNE and (tuned_model is not None):
    proba_val_tuned = pred_proba_best(tuned_model, X_val_fit)
    thr_val_tuned, _ = find_best_threshold(y_val, proba_val_tuned, metric="f1")
    val_metrics_tuned = compute_all_metrics(y_val, proba_val_tuned, thr_val_tuned)

    proba_test_tuned = pred_proba_best(tuned_model, X_test_fit)

    cv_tuned = run_oof_cv(tuned_model.get_params(), X_train_fit, y_train, exp_suffix="TUNED") if DO_CV_TUNED else {"thr": 0.5}
    thr_oof_tuned = float(cv_tuned["thr"])
    thr_used_tuned = thr_oof_tuned

    y_pred_test_tuned = (proba_test_tuned >= thr_used_tuned).astype(int)
    test_metrics_tuned = compute_all_metrics(y_test, proba_test_tuned, thr_used_tuned)

    tuned_fitted_path = OUT_PARAMS / f"{base}_TUNED_fitted_params.json"
    with open(tuned_fitted_path, "w", encoding="utf-8") as f:
        json.dump(tuned_model.get_params(), f, indent=2, ensure_ascii=False)

    base_t = base + "_TUNED"
    plot_pr_curve(y_val,  proba_val_tuned,  f"{base_t} — PR (val)",  OUT_FIGS / f"{base_t}_pr_val.png")
    plot_pr_curve(y_test, proba_test_tuned, f"{base_t} — PR (test)", OUT_FIGS / f"{base_t}_pr_test.png")
    plot_roc_curve(y_val,  proba_val_tuned,  f"{base_t} — ROC (val)",  OUT_FIGS / f"{base_t}_roc_val.png")
    plot_roc_curve(y_test, proba_test_tuned, f"{base_t} — ROC (test)", OUT_FIGS / f"{base_t}_roc_test.png")
    plot_confusion(y_test, y_pred_test_tuned,
                   f"{base_t} — Confusion (test @thr_used={thr_used_tuned:.3f})",
                   OUT_FIGS / f"{base_t}_cm_test.png")

    try:
        imp_gain_t = tuned_model.booster_.feature_importance(importance_type="gain")
    except Exception:
        imp_gain_t = tuned_model.feature_importances_
    imp_t_path = OUT_RESULTS / f"{base_t}_feature_importances.csv"
    (
        pd.DataFrame({"feature": feature_names_used[:len(imp_gain_t)], "importance_gain": imp_gain_t})
        .sort_values("importance_gain", ascending=False)
        .to_csv(imp_t_path, index=False)
    )

    preds_t_path = OUT_PREDS / f"preds_test_{base_t}.parquet"
    pd.DataFrame({"proba": proba_test_tuned, "y_true": y_test}).to_parquet(preds_t_path, index=False)

    row_t = {
        "model": base_t,
        "thr_val": thr_val_tuned,
        "thr_oof": thr_oof_tuned,
        "thr_used": thr_used_tuned,
        "val_pr_auc": val_metrics_tuned["pr_auc"],
        "val_roc_auc": val_metrics_tuned["roc_auc"],
        "val_precision": val_metrics_tuned["precision"],
        "val_f1": val_metrics_tuned["f1"],
        "val_recall": val_metrics_tuned["recall"],
        "val_bal_acc": val_metrics_tuned["bal_acc"],
        "test_pr_auc": test_metrics_tuned["pr_auc"],
        "test_roc_auc": test_metrics_tuned["roc_auc"],
        "test_precision": test_metrics_tuned["precision"],
        "test_f1": test_metrics_tuned["f1"],
        "test_recall": test_metrics_tuned["recall"],
        "test_bal_acc": test_metrics_tuned["bal_acc"],
        "best_iteration": getattr(tuned_model, "best_iteration_", None)
    }
    pd.DataFrame([row_t]).to_csv(res_csv, mode="a", index=False, header=False)

    print("[OK][TUNED] Guardados:",
          "\n  - Fitted HPs :", tuned_fitted_path.name,
          "\n  - Importancias:", imp_t_path.name,
          "\n  - Preds test  :", preds_t_path.name,
          "\n  - Baselines   :", res_csv.name)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.679983
[100]	valid_0's average_precision: 0.689697
[150]	valid_0's average_precision: 0.69089
[200]	valid_0's average_precision: 0.694497
[250]	valid_0's average_precision: 0.694325
[300]	valid_0's average_precision: 0.692923
[350]	valid_0's average_precision: 0.690636
Early stopping, best iteration is:
[189]	valid_0's average_precision: 0.696254
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.683824
[100]	valid_0's average_precision: 0.687847
[150]	valid_0's average_precision: 0.691194
[200]	valid_0's average_precision: 0.690535
[250]	valid_0's average_precision: 0.689051
[300]	valid_0's average_precision: 0.684943
[350]	valid_0's average_precision: 0.683223
Early stopping, best iteration is:
[166]	valid_0's average_precision: 0.692515
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.715454


## 11 — Mejores Resultados + CV-OOF + SOTA + Reporte de features eliminadas

In [150]:
AGGREGATE_ALL_RUNS = False

def safe(v, fmt=".4f"):
    try:
        return f"{float(v):{fmt}}"
    except Exception:
        return "NA"

base_csv = OUT_RESULTS / "baselines.csv"
if not base_csv.exists():
    raise FileNotFoundError(f"No existe {base_csv}")

df = pd.read_csv(base_csv)

needed = [
    "model",
    "thr_val", "thr_oof", "thr_used",
    "val_pr_auc","val_roc_auc","val_precision","val_f1","val_recall","val_bal_acc",
    "test_pr_auc","test_roc_auc","test_precision","test_f1","test_recall","test_bal_acc",
    "best_iteration"
]
for c in needed:
    if c not in df.columns:
        df[c] = pd.NA
df = df[needed].copy()

num_cols = [c for c in needed if c not in ("model",)]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

if AGGREGATE_ALL_RUNS:
    root_art = ARTIF_DIR.parent
    for p in (root_art).glob("LGBM_*/results/baselines.csv"):
        if p == base_csv: continue
        try:
            d2 = pd.read_csv(p)
            for c in needed:
                if c not in d2.columns:
                    d2[c] = pd.NA
            d2 = d2[needed]
            for c in num_cols:
                d2[c] = pd.to_numeric(d2[c], errors="coerce")
            df = pd.concat([df, d2], ignore_index=True)
        except Exception:
            pass

if df.empty:
    raise ValueError("El dataframe de resultados está vacío.")

def best_by(metric):
    if metric not in df.columns or df[metric].dropna().empty:
        return None
    r = df.loc[df[metric].idxmax()]
    print(
        f"- {metric}: {r['model']} | "
        f"PR-AUC={safe(r['test_pr_auc'])} | "
        f"ROC-AUC={safe(r['test_roc_auc'])} | "
        f"F1={safe(r['test_f1'])} | "
        f"Recall={safe(r['test_recall'])} | "
        f"Precision={safe(r['test_precision'])} | "
        f"thr_used={safe(r['thr_used'], '.3f')} | "
        f"best_iter={int(r['best_iteration']) if pd.notna(r['best_iteration']) else 'NA'}"
    )
    return r

print("=== MEJORES EN TEST (por métrica) ===")
winners = {}
for m in ["test_pr_auc","test_roc_auc","test_recall","test_f1","test_precision"]:
    w = best_by(m)
    if w is not None: winners[m] = w

# Resumen CV-OOF
cv_files = list(OUT_RESULTS.glob("cv_summary_*_CV*.csv"))
if cv_files:
    print("\n=== RESUMEN CV-OOF ===")
    rows = []
    for f in cv_files:
        tag = re.sub(r"^cv_summary_|\.csv$", "", f.name)
        try:
            cv = pd.read_csv(f)
            oof = cv.loc[cv["fold"] == "OOF"]
            if not oof.empty:
                r = oof.iloc[0]
                rows.append({
                    "tag": tag,
                    "pr_auc": r.get("pr_auc"),
                    "roc_auc": r.get("roc_auc"),
                    "f1": r.get("f1"),
                    "recall": r.get("recall"),
                    "bal_acc": r.get("bal_acc"),
                    "thr": r.get("thr"),
                })
        except Exception:
            pass
    if rows:
        print(pd.DataFrame(rows).sort_values(["pr_auc","roc_auc"], ascending=False).to_string(index=False))
else:
    print("\n(No se hallaron archivos de CV para este experimento)")

# SOTA
SOTA = {"LGBM": {"AUC": 0.914, "Recall": 0.881, "Precision": 0.948}}
if "test_roc_auc" in winners:
    bt = winners["test_roc_auc"]
    try:
        d_auc = float(bt["test_roc_auc"]) - SOTA["LGBM"]["AUC"]
        d_rec = float(bt["test_recall"])   - SOTA["LGBM"]["Recall"]
        print("\n=== COMPARACIÓN SOTA vs. MEJOR TEST ===")
        print(f"Paper LGBM: AUC={SOTA['LGBM']['AUC']:.3f} | Recall={SOTA['LGBM']['Recall']:.3f} | Precision={SOTA['LGBM']['Precision']:.3f}")
        print(f"Mejor  : AUC={safe(bt['test_roc_auc'])} | Recall={safe(bt['test_recall'])} | Precision={safe(bt['test_precision'])}")
        print(f"Deltas : ΔAUC={d_auc:+.3f} | ΔRecall={d_rec:+.3f}")
    except Exception:
        pass

# === Reporte de features eliminadas ===
fs_path = OUT_RESULTS / f"{EXP_NAME}_feature_sets.json"
drop_md_path = OUT_RESULTS / f"{EXP_NAME}_feature_drop_report.md"

if fs_path.exists():
    fs = json.loads(fs_path.read_text(encoding="utf-8"))
    all_feats  = fs.get("all_features", [])
    used_feats = fs.get("used_features", [])
    all_set, used_set = set(all_feats), set(used_feats)
    removed = sorted(list(all_set - used_set))
    kept    = sorted(list(used_set))
    msg1 = f"[FEATURES] Total={len(all_set)} | Kept={len(kept)} | Removed={len(removed)}"
    print("\n=== FEATURES ELIMINADAS ===")
    print(msg1)
    # Listados
    max_list = 120
    if kept:
        print(f"- Kept (primeros {min(len(kept), max_list)}): {kept[:max_list]}" + (" ..." if len(kept)>max_list else ""))
    if removed:
        print(f"- Removed (primeros {min(len(removed), max_list)}): {removed[:max_list]}" + (" ..." if len(removed)>max_list else ""))

    drop_md = "\n".join([
        "# Feature Drop Report",
        msg1,
        f"- Kept ({len(kept)}): {kept}",
        f"- Removed ({len(removed)}): {removed}",
        f"- Reducción usada: {fs.get('reduction')}"
    ])
    drop_md_path.write_text(drop_md, encoding="utf-8")
    print("[OK] Feature report guardado en:", drop_md_path.name)
else:
    print("\n[FEATURES] No se encontró", fs_path.name, "(no se puede calcular removidas).")

# Normalización con backup
backup = OUT_RESULTS / "baselines_legacy_backup.csv"
base_csv.replace(backup)
df.to_csv(base_csv, index=False)
print("[OK] Normalizado. Backup:", backup.name)

=== MEJORES EN TEST (por métrica) ===
- test_pr_auc: LGBM_REDUCED_PERMcv_thr_SMOTENC | PR-AUC=0.7086 | ROC-AUC=0.8655 | F1=0.6354 | Recall=0.6658 | Precision=0.6076 | thr_used=0.514 | best_iter=247
- test_roc_auc: LGBM_REDUCED_PERMcv_thr_SMOTENC | PR-AUC=0.7086 | ROC-AUC=0.8655 | F1=0.6354 | Recall=0.6658 | Precision=0.6076 | thr_used=0.514 | best_iter=247
- test_recall: LGBM_REDUCED_PERMcv_thr_SMOTENC | PR-AUC=0.7086 | ROC-AUC=0.8655 | F1=0.6354 | Recall=0.6658 | Precision=0.6076 | thr_used=0.514 | best_iter=247
- test_f1: LGBM_REDUCED_PERMcv_thr_SMOTENC | PR-AUC=0.7086 | ROC-AUC=0.8655 | F1=0.6354 | Recall=0.6658 | Precision=0.6076 | thr_used=0.514 | best_iter=247
- test_precision: LGBM_REDUCED_PERMcv_thr_SMOTENC_TUNED | PR-AUC=0.7070 | ROC-AUC=0.8654 | F1=0.5849 | Recall=0.4570 | Precision=0.8122 | thr_used=0.514 | best_iter=135

=== RESUMEN CV-OOF ===
                                         tag   pr_auc  roc_auc       f1   recall  bal_acc   thr
LGBM_REDUCED_PERMcv_thr_SMOTENC_BASE