1 — Imports, configuración y rutas

In [147]:
import json, os, warnings, time, re, glob
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    average_precision_score, precision_recall_curve, roc_auc_score, roc_curve,
    f1_score, recall_score, balanced_accuracy_score, confusion_matrix, precision_score
)
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import mutual_info_classif

# Balanceo (SMOTENC)
try:
    from imblearn.over_sampling import SMOTENC, SMOTE
    _HAS_IMBLEARN = True
except Exception:
    _HAS_IMBLEARN = False

# XGBoost
from xgboost import XGBClassifier

# === Experiment toggles ===
# These module-level flags are read by the later cells of this notebook.
USE_REDUCED = False              # mutual-information top-k feature selection
USE_BALANCED_TRAIN = True        # oversample the final training set
BALANCE_IN_CV = True             # oversample inside every CV fold
RANDOM_STATE = 42
DO_TUNE = True                   # NOTE(review): not read by any cell reviewed here — confirm it is still needed
DO_CV_BASELINE = True
DO_CV_TUNED = True
CV_FOLDS = 5
MI_TOPK = 30                     # K for MI selection when USE_REDUCED=True

# === Names and paths ===
# ROOT is the parent of the current working directory, so results depend on
# where the kernel was started.
ROOT = Path.cwd().parent
# NOTE(review): the tag says "SMOTENC" whenever balancing is on, even though
# maybe_smote falls back to plain SMOTE when no categorical columns are found.
EXP_NAME = f"XGB_{'REDUCED' if USE_REDUCED else 'FULL'}_{'SMOTENC' if USE_BALANCED_TRAIN else 'IMB'}"
ARTIF_DIR = ROOT / "artifacts" / EXP_NAME
OUT_RESULTS = ARTIF_DIR / "results"
OUT_FIGS    = ARTIF_DIR / "figs"
OUT_PREDS   = ARTIF_DIR / "preds"
OUT_PARAMS  = ARTIF_DIR / "best_params"
# Create every output directory up front; safe to re-run.
for p in [OUT_RESULTS, OUT_FIGS, OUT_PREDS, OUT_PARAMS]:
    p.mkdir(parents=True, exist_ok=True)

# Preprocessed dataset
DATA_DIR = ROOT / "preproc_datasets" / "full"

print("Exp:", EXP_NAME)
print("DATA_DIR:", DATA_DIR)
print("ARTIF_DIR:", ARTIF_DIR)

Exp: XGB_FULL_SMOTENC
DATA_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/preproc_datasets/full
ARTIF_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/XGB_FULL_SMOTENC


2 — Carga de artefactos (X, y, features)

In [148]:
import json
def load_xy_full(dir_full: Path):
    """Load the preprocessed train/val/test splits stored under ``dir_full``.

    Reads ``X_<split>_full.npy`` and ``y_<split>.parquet`` (column ``Exited``)
    for the train, val and test splits, plus ``feature_names_full.parquet``
    (column ``feature``).

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, feat)`` where
        the ``y_*`` entries are NumPy arrays and ``feat`` is a list.
    """
    splits = ("train", "val", "test")

    # Matrices first, then targets, then the feature-name list.
    matrices = [np.load(dir_full / f"X_{split}_full.npy") for split in splits]
    targets = [
        pd.read_parquet(dir_full / f"y_{split}.parquet")["Exited"].to_numpy()
        for split in splits
    ]
    names_frame = pd.read_parquet(dir_full / "feature_names_full.parquet")
    feat = names_frame["feature"].tolist()

    (X_train, X_val, X_test), (y_train, y_val, y_test) = matrices, targets
    return X_train, y_train, X_val, y_val, X_test, y_test, feat

# Load the splits once; these module-level arrays are used by all later cells.
X_train, y_train, X_val, y_val, X_test, y_test, feature_names = load_xy_full(DATA_DIR)
# Quick shape report for the three splits and the feature list.
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("y train/val/test:", y_train.shape, y_val.shape, y_test.shape)
print("n features:", len(feature_names))

def _read_feature_roles(dir_full: Path):
    candidates = [
        dir_full / "feature_roles_full.parquet",
        dir_full / "feature_roles.parquet",
        dir_full / "feature_meta_full.parquet",
        dir_full / "feature_meta.parquet",
        dir_full / "feature_types_full.parquet",
        dir_full / "feature_types.parquet",
        dir_full / "feature_meta.json",
    ]
    for p in candidates:
        if p.exists():
            if p.suffix == ".parquet":
                df = pd.read_parquet(p)
            elif p.suffix == ".json":
                obj = json.loads(p.read_text())
                # admitimos lista de dicts o dict con lista
                if isinstance(obj, dict) and "features" in obj:
                    df = pd.DataFrame(obj["features"])
                else:
                    df = pd.DataFrame(obj)
            else:
                continue
            return df, p.name
    return None, None

def _build_cat_idx(feature_names, roles_df):

    if roles_df is None or len(roles_df) == 0:
        return []
    df = roles_df.copy()
    df.columns = [str(c).lower() for c in df.columns]

    if "feature" not in df.columns:
        if "name" in df.columns:
            df["feature"] = df["name"]
        else:
            return []

    cat_names = set()
    if "role" in df.columns:
        cat_names = set(
            df.loc[df["role"].astype(str).str.lower().isin(
                ["cat", "categorical", "bin", "binary", "ordinal"]
            ), "feature"]
        )
    elif "dtype" in df.columns:
        cat_names = set(
            df.loc[df["dtype"].astype(str).str.lower().isin(
                ["category", "categorical", "object", "bool"]
            ), "feature"]
        )
    elif "is_cat" in df.columns:
        cat_names = set(df.loc[df["is_cat"].astype(bool), "feature"])
    else:
        return []

    idx = [i for i, f in enumerate(feature_names) if f in cat_names]
    return sorted(idx)

# Detect categorical columns from optional metadata files in DATA_DIR.
# CAT_IDX_FULL is read by maybe_smote to choose between SMOTENC and SMOTE.
roles_df, meta_file = _read_feature_roles(DATA_DIR)
CAT_IDX_FULL = _build_cat_idx(feature_names, roles_df)

print(f"[META] Archivo detectado: {meta_file or 'N/D'} | columnas categóricas={len(CAT_IDX_FULL)}")
if CAT_IDX_FULL:
    # Show only the first ten indices to keep the output short.
    preview = CAT_IDX_FULL[:10]
    print(f"[META] Índices categóricos (primeros 10): {preview}{'...' if len(CAT_IDX_FULL) > 10 else ''}")
else:
    print("[META] No se detectaron columnas categóricas en metadatos. Caerá en SMOTE estándar.")

Shapes: (6000, 15) (2000, 15) (2000, 15)
y train/val/test: (6000,) (2000,) (2000,)
n features: 15
[META] Archivo detectado: N/D | columnas categóricas=0
[META] No se detectaron columnas categóricas en metadatos. Caerá en SMOTE estándar.


3 — Métricas, threshold y plots

In [149]:
def pr_auc(y_true, y_proba):
    """Average precision (area under the PR curve) as a plain Python float."""
    score = average_precision_score(y_true, y_proba)
    return float(score)

def roc_auc(y_true, y_proba):
    """Area under the ROC curve as a plain Python float."""
    score = roc_auc_score(y_true, y_proba)
    return float(score)

def find_best_threshold(y_true, y_proba, metric="f1"):
    """Scan 1001 evenly spaced thresholds in [0, 1] and keep the best-scoring one.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Positive-class scores; must support ``>=`` with a scalar and
            ``.astype`` on the result (e.g. a NumPy array).
        metric: ``"f1"`` or ``"recall"``.

    Returns:
        ``(threshold, score)`` as floats. On ties the lowest threshold wins,
        because only a strictly better score replaces the current best.

    Raises:
        ValueError: If ``metric`` is not supported.
    """
    scorers = {"f1": f1_score, "recall": recall_score}
    winner_thr, winner_score = 0.5, -1.0
    for candidate in np.linspace(0.0, 1.0, 1001):
        hard_labels = (y_proba >= candidate).astype(int)
        scorer = scorers.get(metric)
        if scorer is None:
            raise ValueError("metric no soportada")
        current = scorer(y_true, hard_labels, zero_division=0)
        if current > winner_score:
            winner_thr, winner_score = candidate, current
    return float(winner_thr), float(winner_score)

def compute_all_metrics(y_true, y_proba, thr):
    """Compute ranking and thresholded metrics for one set of predictions.

    ``pr_auc`` and ``roc_auc`` use the raw scores; precision, F1, recall and
    balanced accuracy use the hard labels ``y_proba >= thr``.

    Returns:
        Dict with keys ``pr_auc``, ``roc_auc``, ``precision``, ``f1``,
        ``recall`` and ``bal_acc``, in that order.
    """
    hard_labels = (y_proba >= thr).astype(int)

    metrics = {
        "pr_auc": pr_auc(y_true, y_proba),
        "roc_auc": roc_auc(y_true, y_proba),
    }
    # These three share the same call signature and zero-division policy.
    thresholded = (
        ("precision", precision_score),
        ("f1", f1_score),
        ("recall", recall_score),
    )
    for key, scorer in thresholded:
        metrics[key] = scorer(y_true, hard_labels, zero_division=0)
    metrics["bal_acc"] = balanced_accuracy_score(y_true, hard_labels)
    return metrics

4 — Helpers MI Top-K y balanceo in-memory

In [150]:
import xgboost as xgb

class _BoosterAdapter:
    def __init__(self, booster, params, best_iteration, feature_names=None):
        self._booster = booster
        self._params = dict(params)
        self.best_iteration = best_iteration
        self._feature_names = feature_names

    def predict_proba(self, X):
        d = xgb.DMatrix(X, feature_names=self._feature_names)
        if self.best_iteration is not None:
            pred = self._booster.predict(d, iteration_range=(0, int(self.best_iteration) + 1))
        else:
            pred = self._booster.predict(d)
        return np.column_stack([1.0 - pred, pred])

    def get_booster(self):
        return self._booster

    def get_params(self, deep=True):
        return dict(self._params)

def xgb_fit_with_es(
    sk_model, X_tr, y_tr, X_va, y_va,
    feature_names=None, rounds=200, verbose=False
):
    """Train a native Booster with early stopping from an estimator's parameters.

    The parameters come from ``sk_model.get_params()``; ``n_estimators``
    becomes the number of boosting rounds (1000 when missing or None),
    ``random_state`` becomes ``seed`` and ``n_jobs`` becomes ``nthread``.
    Both the training and validation sets are monitored, validation last.

    Args:
        sk_model: Object exposing ``get_params()``.
        X_tr, y_tr: Training data.
        X_va, y_va: Validation data.
        feature_names: Optional feature names attached to both DMatrix objects.
        rounds: Early-stopping patience passed to ``xgb.train``.
        verbose: Forwarded as ``verbose_eval``.

    Returns:
        ``(adapter, best_iteration)`` where ``adapter`` is a ``_BoosterAdapter``
        and ``best_iteration`` is None when the booster does not expose it.
    """
    booster_params = sk_model.get_params()

    requested_rounds = booster_params.pop("n_estimators", 1000)
    num_rounds = int(requested_rounds) if requested_rounds is not None else 1000

    # Prefer random_state; an explicit "seed" entry (or 42) is the fallback.
    legacy_seed = booster_params.pop("seed", 42)
    seed = booster_params.pop("random_state", legacy_seed)

    threads = booster_params.pop("n_jobs", None)
    if threads is not None:
        booster_params["nthread"] = threads  # classic alias

    fallbacks = (
        ("seed", seed),
        ("objective", "binary:logistic"),
        ("eval_metric", "aucpr"),
    )
    for key, value in fallbacks:
        booster_params.setdefault(key, value)

    dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=feature_names)
    dvalid = xgb.DMatrix(X_va, label=y_va, feature_names=feature_names)

    booster = xgb.train(
        params=booster_params,
        dtrain=dtrain,
        num_boost_round=num_rounds,
        evals=[(dtrain, "train"), (dvalid, "valid")],
        early_stopping_rounds=rounds,
        verbose_eval=verbose
    )

    best_iter = getattr(booster, "best_iteration", None)
    # Record the untouched estimator params plus the iteration found.
    recorded_params = {**sk_model.get_params(), "best_iteration": best_iter}
    adapter = _BoosterAdapter(
        booster=booster,
        params=recorded_params,
        best_iteration=best_iter,
        feature_names=feature_names
    )
    return adapter, best_iter

def _map_cat_idx_for_keep(keep_idx, cat_idx_full):
    if not cat_idx_full:
        return []
    if keep_idx is None:
        return sorted(cat_idx_full)
    pos = {old_i: j for j, old_i in enumerate(keep_idx)}
    return sorted([pos[i] for i in cat_idx_full if i in pos])

# Define apply_keep_idx only when no definition is already in scope, so an
# existing one (e.g. from an earlier cell or kernel session) is left untouched.
try:
    apply_keep_idx
except NameError:
    def apply_keep_idx(X, keep_idx):
        """Return the columns of the 2-D array ``X`` selected by ``keep_idx``."""
        return X[:, keep_idx]

def maybe_smote(X, y, keep_idx=None, random_state=RANDOM_STATE, k_neighbors=5):
    """Oversample the minority class with SMOTENC/SMOTE when it is possible and safe.

    SMOTENC is used when categorical columns (``CAT_IDX_FULL`` remapped through
    ``keep_idx``) are present in ``X``; otherwise plain SMOTE is used.
    Balancing is skipped, returning the inputs unchanged, when
    imbalanced-learn is unavailable, when ``y`` holds fewer than two classes
    (including an empty ``y``), or when the rarest class has fewer than two
    samples.

    Args:
        X: Feature matrix.
        y: Integer-like class labels.
        keep_idx: Column indices kept by feature selection, or None.
        random_state: Seed for the sampler.
        k_neighbors: Desired neighbours; capped at ``minority_count - 1``.

    Returns:
        ``(X_res, y_res)``; ``X_res`` is cast back to ``X``'s dtype when possible.
    """
    if not _HAS_IMBLEARN:
        print("[BAL] imbalanced-learn no disponible. Se omite balanceo.")
        return X, y

    y_int = np.asarray(y).astype(int)
    # Count only the classes actually present. Unlike np.bincount this works
    # for any integer labels and does not crash on an empty vector.
    classes, counts = np.unique(y_int, return_counts=True)
    if len(classes) < 2:
        print("[BAL] Solo 1 clase en y. Se omite balanceo.")
        return X, y
    if counts.min() < 2:
        print("[BAL] Minoría < 2 muestras. Se omite balanceo.")
        return X, y
    # SMOTE needs k_neighbors < minority size.
    k = int(max(1, min(k_neighbors, counts.min() - 1)))

    cat_idx = _map_cat_idx_for_keep(keep_idx, CAT_IDX_FULL)

    if cat_idx:
        sm = SMOTENC(categorical_features=cat_idx, k_neighbors=k, random_state=random_state)
        kind = "SMOTENC"
    else:
        sm = SMOTE(k_neighbors=k, random_state=random_state)
        kind = "SMOTE"

    X_res, y_res = sm.fit_resample(X, y)
    try:
        X_res = X_res.astype(X.dtype, copy=False)
    except (AttributeError, TypeError, ValueError):
        # Best effort only: containers without a single dtype (e.g. DataFrames)
        # keep whatever dtype the sampler produced.
        pass

    print(f"[BAL] {kind} aplicado | k_neighbors={k} | cat_cols={len(cat_idx)}")
    return X_res, y_res

5 — Hiperparámetros persistentes

In [151]:
# Tags that identify the experiment variant in file and study names.
VIEW_TAG = "REDUCED" if USE_REDUCED else "FULL"
# NOTE(review): the tag reads "SMOTENC" even when maybe_smote falls back to plain SMOTE.
BAL_TAG  = "SMOTENC" if USE_BALANCED_TRAIN else "IMB"
# JSON file where the tuning cell stores the best hyperparameters.
BEST_HP_FILE = OUT_PARAMS / f"BEST_XGB_{VIEW_TAG}_{BAL_TAG}.json"

def get_xgb_defaults(seed=RANDOM_STATE):
    """Return the parameter dict of an XGBClassifier built with this notebook's defaults.

    The estimator is created with ``random_state=seed``, all cores, the
    ``aucpr`` eval metric, the ``hist`` tree method and silent logging. The
    returned dict never has a ``verbose`` key and always has ``verbosity``.
    """
    defaults = XGBClassifier(
        random_state=seed,
        n_jobs=-1,
        eval_metric="aucpr",
        tree_method="hist",
        verbosity=0,
    ).get_params()

    # Normalise the logging key: drop "verbose", guarantee "verbosity".
    if "verbose" in defaults:
        del defaults["verbose"]
    if "verbosity" not in defaults:
        defaults["verbosity"] = 0
    return defaults

def load_best_or_default():
    """Return starting hyperparameters, preferring a previously saved BEST file.

    When ``BEST_HP_FILE`` exists and holds a JSON object, its entries are
    layered on top of ``get_xgb_defaults()``. If the file is missing, unreadable
    or not a JSON object, the plain defaults are returned.

    Returns:
        ``(params, loaded)`` where ``loaded`` is True only when the saved file
        was used.
    """
    if BEST_HP_FILE.exists():
        try:
            # The tuning cell writes this file as UTF-8, so read it the same
            # way instead of relying on the platform's locale encoding.
            best = json.loads(BEST_HP_FILE.read_text(encoding="utf-8"))
            if not isinstance(best, dict):
                raise ValueError("el archivo BEST no contiene un objeto JSON")
            print("[HP] Cargando mejores hiperparámetros previos:", BEST_HP_FILE.name)
            base = get_xgb_defaults()
            base.update(best)
            return base, True
        except Exception as e:
            # Deliberate best effort: a corrupt file must not block the run.
            print("[HP] Aviso: no se pudo leer BEST (uso defaults).", e)
    print("[HP] Usando hiperparámetros DEFAULT de XGB.")
    return get_xgb_defaults(), False

# Starting hyperparameters for the baseline; loaded_best_flag tells whether
# they came from a previously saved BEST file.
seed_params, loaded_best_flag = load_best_or_default()

[HP] Cargando mejores hiperparámetros previos: BEST_XGB_FULL_SMOTENC.json


6 — Entrenamiento BASELINE + umbral

In [152]:
from xgboost import XGBClassifier

# Work on a copy so re-running this cell never mutates the dict built earlier.
seed_params = dict(seed_params)

# Fill in anything a previously saved BEST file may not carry.
seed_params.setdefault("random_state", RANDOM_STATE)
seed_params.setdefault("n_jobs", -1)
seed_params.setdefault("eval_metric", "aucpr")
seed_params.setdefault("tree_method", "hist")
seed_params.setdefault("verbosity", 0)
seed_params.pop("verbose", None)

# Upper bound on boosting rounds; early stopping picks the effective number.
# `or 1000` already covers a missing key and an explicit None, so the former
# follow-up "is None" branch could never run and has been removed.
seed_params["n_estimators"] = seed_params.get("n_estimators") or 1000

# Feature view used for fitting: full matrix unless USE_REDUCED is on.
keep_idx_global = None
feature_names_used = feature_names
X_train_fit, X_val_fit, X_test_fit = X_train, X_val, X_test

if USE_REDUCED:
    # NOTE(review): fit_mi_selector is not defined in the cells reviewed here;
    # this branch needs it in scope — confirm where it comes from.
    keep_idx_global, _mi = fit_mi_selector(X_train, y_train, topk=MI_TOPK, seed=RANDOM_STATE)
    X_train_fit = apply_keep_idx(X_train, keep_idx_global)
    X_val_fit   = apply_keep_idx(X_val,   keep_idx_global)
    X_test_fit  = apply_keep_idx(X_test,  keep_idx_global)
    feature_names_used = [feature_names[i] for i in keep_idx_global]

# Only the training split is oversampled; validation and test stay untouched.
X_train_final, y_train_final = X_train_fit, y_train
if USE_BALANCED_TRAIN:
    X_train_final, y_train_final = maybe_smote(X_train_fit, y_train, keep_idx=keep_idx_global)

# xgb_fit_with_es returns an adapter around the trained Booster, so `model`
# is rebound from the sklearn estimator to that adapter.
model = XGBClassifier(**seed_params)
model, best_iter = xgb_fit_with_es(
    model,
    X_train_final, y_train_final,
    X_val_fit, y_val,
    feature_names=feature_names_used,
    rounds=200,
    verbose=False
)
print(f"[BASELINE] best_iteration: {best_iter}")

# The decision threshold is tuned on the same validation split used for early
# stopping, so the validation metrics below are optimistic.
proba_val = model.predict_proba(X_val_fit)[:, 1]
thr_val, best_f1_val = find_best_threshold(y_val, proba_val, metric="f1")
print(f"[BASELINE] Mejor umbral (val) por F1: {thr_val:.3f} | F1(val)={best_f1_val:.4f}")

val_metrics = compute_all_metrics(y_val, proba_val, thr_val)
print("[BASELINE] Métricas val:", {k: (round(v,4) if isinstance(v,float) else v) for k,v in val_metrics.items()})


# Keep handles to the baseline; tuned_model is filled in by the tuning cell.
baseline = model
base_best_it = best_iter
tuned_model = None

[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BASELINE] best_iteration: 1984
[BASELINE] Mejor umbral (val) por F1: 0.463 | F1(val)=0.6435
[BASELINE] Métricas val: {'pr_auc': 0.6963, 'roc_auc': 0.8596, 'precision': 0.6757, 'f1': 0.6435, 'recall': 0.6143, 'bal_acc': 0.7695}


7 — Optimización incremental (Optuna)

In [153]:
import optuna
from optuna.samplers import TPESampler

# Reset so a failed or interrupted study never leaves a stale tuned model behind.
tuned_model = None
N_TRIALS = 40
STUDY_NAME = f"XGB_{VIEW_TAG}_{BAL_TAG}_AP"
# Seeded TPE sampler; the study lives in memory only and maximises the objective.
SAMPLER = TPESampler(seed=RANDOM_STATE, multivariate=True, group=False)
study = optuna.create_study(direction="maximize", study_name=STUDY_NAME, sampler=SAMPLER)

# Hyperparameters that are tuned; the warm-start below copies only these keys.
SEARCH_KEYS = [
    "learning_rate", "n_estimators", "max_depth", "min_child_weight",
    "subsample", "colsample_bytree", "gamma", "reg_alpha", "reg_lambda"
]

def suggest_xgb_params(trial):
    """Draw one XGBoost configuration from the search space for ``trial``.

    Nine hyperparameters are sampled, always in the same order; the fixed
    settings (seed, threads, eval metric, tree method, verbosity) are then
    appended.

    Returns:
        Dict of keyword arguments for ``XGBClassifier``.
    """
    # (name, kind, (low, high), extra keyword arguments) — order is preserved.
    search_space = (
        ("learning_rate",    "float", (1e-3, 0.3),  {"log": True}),
        ("n_estimators",     "int",   (800, 3000),  {"step": 50}),
        ("max_depth",        "int",   (3, 10),      {}),
        ("min_child_weight", "float", (0.5, 20.0),  {"log": True}),
        ("subsample",        "float", (0.6, 1.0),   {}),
        ("colsample_bytree", "float", (0.6, 1.0),   {}),
        ("gamma",            "float", (1e-8, 5.0),  {"log": True}),
        ("reg_alpha",        "float", (1e-6, 10.0), {"log": True}),
        ("reg_lambda",       "float", (1e-6, 10.0), {"log": True}),
    )

    sampled = {}
    for name, kind, (low, high), extra in search_space:
        draw = trial.suggest_int if kind == "int" else trial.suggest_float
        sampled[name] = draw(name, low, high, **extra)

    # Fixed, non-tuned settings.
    sampled["random_state"] = RANDOM_STATE
    sampled["n_jobs"] = -1
    sampled["eval_metric"] = "aucpr"
    sampled["tree_method"] = "hist"
    sampled["verbosity"] = 0
    return sampled

# Warm start: queue the previously saved best configuration so it is
# evaluated as one of the study's trials.
if BEST_HP_FILE.exists():
    try:
        prev = json.loads(BEST_HP_FILE.read_text())
        # Only tuned keys are enqueued; fixed settings are added by suggest_xgb_params.
        warm = {k: prev[k] for k in SEARCH_KEYS if k in prev}
        if warm:
            print("[OPTUNA] Enqueuing previous BEST as a trial seed.")
            study.enqueue_trial(warm)
    except Exception as e:
        # Best effort: an unreadable file only disables the warm start.
        print("[OPTUNA] Aviso: no se pudo usar BEST para warm-start:", e)

def objective(trial):
    """Optuna objective: validation average precision of one sampled configuration.

    The sampled hyperparameters override ``seed_params``; the model is trained
    on the (possibly balanced) training set with early stopping on the
    validation split. The early-stopped iteration is stored on the trial as
    the ``best_iteration`` user attribute.
    """
    trial_params = {**seed_params, **suggest_xgb_params(trial)}
    fitted, stopped_at = xgb_fit_with_es(
        XGBClassifier(**trial_params),
        X_train_final, y_train_final,
        X_val_fit, y_val,
        feature_names=feature_names_used,
        rounds=200,
        verbose=False
    )
    val_scores = fitted.predict_proba(X_val_fit)[:, 1]
    val_ap = average_precision_score(y_val, val_scores)
    trial.set_user_attr("best_iteration", stopped_at)
    return val_ap

# Run the study; each trial trains one model with early stopping.
print(f"[OPTUNA] Iniciando estudio '{STUDY_NAME}' con {N_TRIALS} pruebas...")
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

best = study.best_trial
print(f"[OPTUNA] Mejor AP(val): {best.value:.6f}")
print(f"[OPTUNA] Params ganadores:", best.params)
print(f"[OPTUNA] best_iteration (del trial):", best.user_attrs.get("best_iteration"))

# Winning tuned values plus the fixed settings used by every model here.
# The saved n_estimators is the sampled upper bound, not the early-stopped
# iteration (that one is kept only as a trial user attribute).
best_params = dict(best.params)
best_params.update({
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
    "eval_metric": "aucpr",
    "tree_method": "hist",
    "verbosity": 0
})
# Persist so the next run can load these as seed params and as a warm start.
with open(BEST_HP_FILE, "w", encoding="utf-8") as f:
    json.dump(best_params, f, indent=2, ensure_ascii=False)
print("[OPTUNA] Guardado BEST en:", BEST_HP_FILE.name)

# Refit once with the winning configuration; tuned_model becomes the adapter
# returned by xgb_fit_with_es.
tuned_model = XGBClassifier(**best_params)
tuned_model, best_it = xgb_fit_with_es(
    tuned_model,
    X_train_final, y_train_final,
    X_val_fit, y_val,
    feature_names=feature_names_used,
    rounds=200,
    verbose=False
)
print("[OPTUNA] Reentreno final completado. best_iteration =", best_it)

[I 2025-12-09 18:18:50,347] A new study created in memory with name: XGB_FULL_SMOTENC_AP


[OPTUNA] Enqueuing previous BEST as a trial seed.
[OPTUNA] Iniciando estudio 'XGB_FULL_SMOTENC_AP' con 40 pruebas...


[I 2025-12-09 18:18:54,208] Trial 0 finished with value: 0.6962975223192452 and parameters: {'learning_rate': 0.024657174052728027, 'n_estimators': 2350, 'max_depth': 3, 'min_child_weight': 0.8535743139140601, 'subsample': 0.9369195277814497, 'colsample_bytree': 0.6938682402744633, 'gamma': 1.703067625690673e-06, 'reg_alpha': 0.0004414234817891946, 'reg_lambda': 2.355922278034951}. Best is trial 0 with value: 0.6962975223192452.
[I 2025-12-09 18:18:59,270] Trial 1 finished with value: 0.6896356488001449 and parameters: {'learning_rate': 0.008468008575248327, 'n_estimators': 2900, 'max_depth': 8, 'min_child_weight': 4.550475813202184, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'gamma': 3.200866785899844e-08, 'reg_alpha': 1.156732719914599, 'reg_lambda': 0.016136341713591334}. Best is trial 0 with value: 0.6962975223192452.
[I 2025-12-09 18:19:00,257] Trial 2 finished with value: 0.6804528020112258 and parameters: {'learning_rate': 0.05675206026988748, 'n_est

[OPTUNA] Mejor AP(val): 0.698259
[OPTUNA] Params ganadores: {'learning_rate': 0.013570972114707725, 'n_estimators': 2450, 'max_depth': 4, 'min_child_weight': 1.0024202070808959, 'subsample': 0.8895969662409436, 'colsample_bytree': 0.6179844014785916, 'gamma': 0.005567832464942176, 'reg_alpha': 5.0781078798822983e-05, 'reg_lambda': 0.24267702102911723}
[OPTUNA] best_iteration (del trial): 1962
[OPTUNA] Guardado BEST en: BEST_XGB_FULL_SMOTENC.json
[OPTUNA] Reentreno final completado. best_iteration = 1962


8 — Cross-Validation (OOF) para baseline y tuned

In [None]:
def run_oof_cv_xgb(model_params, X, y, k_folds=CV_FOLDS, seed=RANDOM_STATE, exp_suffix="BASELINE"):
    """Run stratified K-fold CV and save out-of-fold (OOF) predictions and a summary.

    For every fold: optionally select features (``USE_REDUCED``), optionally
    oversample the training part (``BALANCE_IN_CV and USE_BALANCED_TRAIN``),
    train with early stopping and score the held-out part. The held-out part is
    also the early-stopping set, so per-fold and OOF scores are optimistic.

    Writes ``cv_summary_<tag>.csv`` to ``OUT_RESULTS`` and ``oof_<tag>.parquet``
    to ``OUT_PREDS``, where ``<tag>`` is ``{EXP_NAME}_{exp_suffix}_CV{k_folds}``.

    Args:
        model_params: XGBClassifier keyword arguments; not mutated.
        X: Feature matrix (NumPy array indexable by fold indices).
        y: Binary target array.
        k_folds: Number of folds; also used in the output file names.
        seed: Seed for the fold split and the model/selector defaults.
        exp_suffix: Label appended to output names and log lines.

    Returns:
        Dict with ``oof_pr_auc``, ``oof_roc_auc``, ``thr``, ``oof_f1``,
        ``oof_recall`` and ``oof_bal_acc``.
    """
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    oof_proba = np.zeros_like(y, dtype=float)
    fold_rows = []

    # Normalise a private copy of the parameters.
    base = dict(model_params)
    base.pop("verbose", None)
    base.setdefault("verbosity", 0)
    base.setdefault("eval_metric", "aucpr")
    base.setdefault("tree_method", "hist")
    base.setdefault("random_state", seed)
    base.setdefault("n_jobs", -1)
    base["n_estimators"] = base.get("n_estimators") or 1000

    for f, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr0, X_va0 = X[tr_idx], X[va_idx]
        y_tr0, y_va0 = y[tr_idx], y[va_idx]

        # Feature selection is refit inside the fold to avoid leakage.
        keep_idx = None
        feat_names_fold = feature_names
        if USE_REDUCED:
            keep_idx, _ = fit_mi_selector(X_tr0, y_tr0, topk=MI_TOPK, seed=seed)
            X_tr0 = apply_keep_idx(X_tr0, keep_idx)
            X_va0 = apply_keep_idx(X_va0, keep_idx)
            feat_names_fold = [feature_names[i] for i in keep_idx]

        # Oversample the training part only; the held-out part keeps its distribution.
        if BALANCE_IN_CV and USE_BALANCED_TRAIN:
            X_tr, y_tr = maybe_smote(X_tr0, y_tr0, keep_idx=keep_idx)
        else:
            X_tr, y_tr = X_tr0, y_tr0

        mdl = XGBClassifier(**base)
        adapter, best_it = xgb_fit_with_es(
            mdl, X_tr, y_tr, X_va0, y_va0,
            feature_names=feat_names_fold,
            rounds=200, verbose=False
        )

        proba_va = adapter.predict_proba(X_va0)[:, 1]
        oof_proba[va_idx] = proba_va

        fold_rows.append({
            "fold": f,
            "pr_auc": average_precision_score(y_va0, proba_va),
            "roc_auc": roc_auc_score(y_va0, proba_va),
            "best_iteration": best_it if best_it is not None else np.nan
        })

    # Aggregate OOF metrics; the F1 threshold is chosen on the OOF scores themselves.
    oof_pr = average_precision_score(y, oof_proba)
    oof_roc = roc_auc_score(y, oof_proba)
    thr_oof, _ = find_best_threshold(y, oof_proba, metric="f1")
    y_oof_pred = (oof_proba >= thr_oof).astype(int)
    oof_f1  = f1_score(y, y_oof_pred, zero_division=0)
    oof_rec = recall_score(y, y_oof_pred, zero_division=0)
    oof_bal = balanced_accuracy_score(y, y_oof_pred)

    # Tag the outputs with the fold count actually used, not the global default.
    cv_tag = f"{EXP_NAME}_{exp_suffix}_CV{k_folds}"
    cv_csv = OUT_RESULTS / f"cv_summary_{cv_tag}.csv"
    folds_df = pd.DataFrame(fold_rows)
    agg_row = pd.DataFrame([{
        "fold": "OOF", "pr_auc": oof_pr, "roc_auc": oof_roc,
        "thr": thr_oof, "f1": oof_f1, "recall": oof_rec, "bal_acc": oof_bal
    }])
    pd.concat([folds_df, agg_row], ignore_index=True).to_csv(cv_csv, index=False)

    oof_path = OUT_PREDS / f"oof_{cv_tag}.parquet"
    pd.DataFrame({"oof_proba": oof_proba, "y_true": y}).to_parquet(oof_path, index=False)

    print(f"[CV-{exp_suffix}] Guardados: {cv_csv.name} | {oof_path.name}")
    return {"oof_pr_auc": oof_pr, "oof_roc_auc": oof_roc, "thr": thr_oof,
            "oof_f1": oof_f1, "oof_recall": oof_rec, "oof_bal_acc": oof_bal}


# CV summaries stay None when the corresponding toggle is off.
cv_baseline = None
cv_tuned = None

if DO_CV_BASELINE:
    cv_baseline = run_oof_cv_xgb(seed_params, X_train_fit, y_train, exp_suffix="BASELINE")

# The tuned CV only runs when the tuning cell produced a model.
if DO_CV_TUNED and tuned_model is not None:
   
    cv_tuned = run_oof_cv_xgb(best_params, X_train_fit, y_train, exp_suffix="TUNED")

# LightGBM export. None of the LGBM names used below (lgbm_tuned,
# OUT_PARAMS_L, OUT_RESULTS_L, EXP_NAME_LGB, thr_val_tl, proba_val_tl, bl)
# are defined in the cells reviewed here, so the block is looked up through
# globals() and simply skipped when the LGBM flow has not been run, instead
# of raising NameError on a fresh kernel.
if globals().get("lgbm_tuned") is not None:

    # Persist the fitted hyperparameters together with the early-stopped iteration.
    (OUT_PARAMS_L / f"{EXP_NAME_LGB}_TUNED_fitted_params.json").write_text(
        json.dumps(
            {
                "best_iteration": getattr(lgbm_tuned, "best_iteration", None),
                **(lgbm_best_params or {})
            },
            indent=2,
            ensure_ascii=False
        ),
        encoding="utf-8"
    )

    # The validation-selected threshold is reused unchanged on the test split.
    proba_test_tl = lgbm_tuned.predict_proba(X_test_fit)[:,1]
    y_pred_test_tl = (proba_test_tl >= thr_val_tl).astype(int)
    val_metrics_tl = compute_all_metrics(y_val,  proba_val_tl, thr_val_tl)
    test_metrics_tl= compute_all_metrics(y_test, proba_test_tl, thr_val_tl)

    row_tuned_lgb = {
        "model": f"{EXP_NAME_LGB}_TUNED",
        "thr_val": float(thr_val_tl),
        "thr_oof": float("nan"),
        "thr_used": float(thr_val_tl),
        "val_pr_auc": val_metrics_tl["pr_auc"],
        "val_roc_auc": val_metrics_tl["roc_auc"],
        "val_precision": val_metrics_tl["precision"],
        "val_f1": val_metrics_tl["f1"],
        "val_recall": val_metrics_tl["recall"],
        "val_bal_acc": val_metrics_tl["bal_acc"],
        "test_pr_auc": test_metrics_tl["pr_auc"],
        "test_roc_auc": test_metrics_tl["roc_auc"],
        "test_precision": test_metrics_tl["precision"],
        "test_f1": test_metrics_tl["f1"],
        "test_recall": test_metrics_tl["recall"],
        "test_bal_acc": test_metrics_tl["bal_acc"],
        "best_iteration": getattr(lgbm_tuned, "best_iteration", float("nan"))
    }
    # Append to the shared results table, writing the header only on creation.
    csv_l = OUT_RESULTS_L / "baselines.csv"
    pd.DataFrame([row_tuned_lgb]).to_csv(
        csv_l, mode=("a" if csv_l.exists() else "w"), index=False, header=not csv_l.exists()
    )

    # NOTE(review): `bl` is presumably the LGBM artifact directory — confirm it is defined.
    print(f"[OK][LGBM TUNED] Guardados en {bl}")

[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[CV-BASELINE] Guardados: cv_summary_XGB_FULL_SMOTENC_BASELINE_CV5.csv | oof_XGB_FULL_SMOTENC_BASELINE_CV5.parquet
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[CV-TUNED] Guardados: cv_summary_XGB_FULL_SMOTENC_TUNED_CV5.csv | oof_XGB_FULL_SMOTENC_TUNED_CV5.parquet


9 — Evaluación en test + guardados

In [None]:
def _ensure_probs():
    # XGB: usar TUNED si existe; si no, BASELINE
    p_val_xgb  = locals().get("proba_val_tx",  None)
    p_test_xgb = locals().get("proba_test_tx", None)
    if p_val_xgb is None or p_test_xgb is None:
        p_val_xgb  = locals().get("proba_val_xgb",  None)
        p_test_xgb = locals().get("proba_test_xgb", None)

    # LGBM: usar TUNED si existe; si no, BASELINE
    p_val_lgb  = locals().get("proba_val_tl",  None)
    p_test_lgb = locals().get("proba_test_tl", None)
    if p_val_lgb is None or p_test_lgb is None:
        p_val_lgb  = locals().get("proba_val_lgb",  None)
        p_test_lgb = locals().get("proba_test_lgb", None)

    if any(v is None for v in [p_val_xgb, p_test_xgb, p_val_lgb, p_test_lgb]):
        raise RuntimeError("No se encontraron todas las probabilidades requeridas (XGB y LGBM val/test).")

    return p_val_xgb, p_test_xgb, p_val_lgb, p_test_lgb

def _logit(p, eps=1e-9):
    p = np.clip(p, eps, 1 - eps)
    return np.log(p / (1 - p))

def _sigmoid(z):
    return 1. / (1. + np.exp(-z))

def ens_probs(p1, p2, w=0.5, method="logit"):
    """Blend two probability vectors with weight ``w`` on ``p1`` and ``1 - w`` on ``p2``.

    Args:
        p1, p2: Probabilities to combine.
        w: Weight given to ``p1``.
        method: ``"logit"`` averages in log-odds space and maps back through
            the sigmoid; ``"prob"`` averages the probabilities directly.

    Raises:
        ValueError: If ``method`` is neither ``"logit"`` nor ``"prob"``.
    """
    if method not in ("logit", "prob"):
        raise ValueError("method debe ser 'logit' o 'prob'.")
    if method == "prob":
        return w * p1 + (1 - w) * p2
    blended_logit = w * _logit(p1) + (1 - w) * _logit(p2)
    return _sigmoid(blended_logit)

# === Ensemble setup ===
# NOTE(review): DIRS, plot_pr_curve and plot_roc_curve are not defined in the
# cells reviewed here; this cell needs them in scope — confirm their origin.
# NOTE(review): the four output directories are used but not created here —
# confirm they already exist.
be = DIRS["ens"]
OUT_RESULTS_E = be / "results"
OUT_FIGS_E    = be / "figs"
OUT_PREDS_E   = be / "preds"
OUT_PARAMS_E  = be / "best_params"

EXP_NAME_ENS = f"ENS_{VIEW_TAG}_{BAL_TAG}"

p_val_xgb, p_test_xgb, p_val_lgb, p_test_lgb = _ensure_probs()

# === Search for the weight w (and blend method) maximising validation AP ===
# `best` is rebound here to a plain dict; the tuning cell used the same name
# for the winning Optuna trial.
best = {"w": None, "ap": -1.0, "method": None}
for method in ["logit", "prob"]:
    for w in np.linspace(0.0, 1.0, 41):
        p_val_ens = ens_probs(p_val_xgb, p_val_lgb, w=w, method=method)
        ap = average_precision_score(y_val, p_val_ens)
        if ap > best["ap"]:
            best = {"w": float(w), "ap": float(ap), "method": method}

# Apply the selected weight and method to validation and test
p_val_ens  = ens_probs(p_val_xgb,  p_val_lgb,  w=best["w"], method=best["method"])
p_test_ens = ens_probs(p_test_xgb, p_test_lgb, w=best["w"], method=best["method"])

# F1 threshold chosen on validation, then reused unchanged on test
thr_val_ens, _   = find_best_threshold(y_val, p_val_ens, metric="f1")
y_pred_test_ens  = (p_test_ens >= thr_val_ens).astype(int)
val_metrics_ens  = compute_all_metrics(y_val,  p_val_ens,  thr_val_ens)
test_metrics_ens = compute_all_metrics(y_test, p_test_ens, thr_val_ens)

# Persist the blend weights
with open(OUT_PARAMS_E / f"{EXP_NAME_ENS}_weights.json", "w", encoding="utf-8") as f:
    json.dump({"w_xgb": best["w"], "w_lgbm": 1 - best["w"], "method": best["method"], "val_ap": best["ap"]},
              f, indent=2, ensure_ascii=False)

# Plots
plot_pr_curve(y_val,  p_val_ens,  f"{EXP_NAME_ENS} — PR (val)",  OUT_FIGS_E / f"{EXP_NAME_ENS}_pr_val.png")
plot_pr_curve(y_test, p_test_ens, f"{EXP_NAME_ENS} — PR (test)", OUT_FIGS_E / f"{EXP_NAME_ENS}_pr_test.png")
plot_roc_curve(y_val,  p_val_ens,  f"{EXP_NAME_ENS} — ROC (val)",  OUT_FIGS_E / f"{EXP_NAME_ENS}_roc_val.png")
plot_roc_curve(y_test, p_test_ens, f"{EXP_NAME_ENS} — ROC (test)", OUT_FIGS_E / f"{EXP_NAME_ENS}_roc_test.png")

# Test predictions
pd.DataFrame({"proba": p_test_ens, "y_true": y_test}).to_parquet(
    OUT_PREDS_E / f"preds_test_{EXP_NAME_ENS}.parquet", index=False
)

# One summary row for the shared baselines table
row_ens = {
    "model": EXP_NAME_ENS,
    "thr_val": float(thr_val_ens),
    "thr_oof": float("nan"),
    "thr_used": float(thr_val_ens),
    "val_pr_auc": val_metrics_ens["pr_auc"],
    "val_roc_auc": val_metrics_ens["roc_auc"],
    "val_precision": val_metrics_ens["precision"],
    "val_f1": val_metrics_ens["f1"],
    "val_recall": val_metrics_ens["recall"],
    "val_bal_acc": val_metrics_ens["bal_acc"],
    "test_pr_auc": test_metrics_ens["pr_auc"],
    "test_roc_auc": test_metrics_ens["roc_auc"],
    "test_precision": test_metrics_ens["precision"],
    "test_f1": test_metrics_ens["f1"],
    "test_recall": test_metrics_ens["recall"],
    "test_bal_acc": test_metrics_ens["bal_acc"],
    "w_xgb": best["w"],
    "w_lgbm": 1 - best["w"],
    "comb_method": best["method"]
}
# Append to the CSV, writing the header only when the file is created.
csv_e = OUT_RESULTS_E / "baselines.csv"
pd.DataFrame([row_ens]).to_csv(csv_e, mode=("a" if csv_e.exists() else "w"),
                               index=False, header=not csv_e.exists())

print(f"[OK][ENS] w*XGB + (1-w)*LGBM con w={best['w']:.3f} (method={best['method']}) | AP(val)={best['ap']:.6f}")

[OK][BASE] Guardados: 
  - Seed HPs   : XGB_FULL_SMOTENC_BASE_seed_params.json 
  - Fitted HPs : XGB_FULL_SMOTENC_BASE_fitted_params.json 
  - Importancias: XGB_FULL_SMOTENC_feature_importances.csv 
  - Preds test  : preds_test_XGB_FULL_SMOTENC.parquet 
  - Baselines   : baselines.csv
[OK][TUNED] Guardados: 
  - Fitted HPs : XGB_FULL_SMOTENC_TUNED_fitted_params.json 
  - Importancias: XGB_FULL_SMOTENC_TUNED_feature_importances.csv 
  - Preds test  : preds_test_XGB_FULL_SMOTENC_TUNED.parquet 
  - Baselines   : baselines.csv


10 — Mejores resultados + resumen CV

In [None]:
def _get_params_for(model_key, tuned=True):
    if model_key == "xgb":
        if tuned and (locals().get("xgb_best_params") is not None):
            p = dict(xgb_best_params)
            p.update({"n_jobs": -1, "eval_metric": "aucpr", "tree_method": "hist", "verbosity": 0, "random_state": RANDOM_STATE})
            return p
        else:
            p = dict(xgb_params_seed)
            p.update({"n_jobs": -1, "eval_metric": "aucpr", "tree_method": "hist", "verbosity": 0, "random_state": RANDOM_STATE})
            return p
    elif model_key == "lgbm":
        if tuned and (locals().get("lgbm_best_params") is not None):
            p = dict(lgbm_best_params)
            p.update({"random_state": RANDOM_STATE})
            return p
        else:
            p = dict(lgbm_params_seed)
            p.update({"random_state": RANDOM_STATE})
            return p
    else:
        raise ValueError("model_key debe ser 'xgb' o 'lgbm'.")

def _fit_adapter(model_key, params, X_tr, y_tr, X_va, y_va):
    if model_key == "xgb":
        mdl = XGBClassifier(**params)
        return xgb_fit_with_es(mdl, X_tr, y_tr, X_va, y_va, feature_names=feature_names, rounds=200, verbose=False)
    else:
        mdl = LGBMClassifier(**params)
        return lgbm_fit_with_es(mdl, X_tr, y_tr, X_va, y_va, feature_names=feature_names, rounds=200, verbose=False)

def _cv_oof(model_key, tuned=True, n_splits=CV_FOLDS):
    """Run stratified K-fold CV on train+val, then refit and score on test.

    Steps:
      1. Stack `X_train`/`X_val` (and the labels) into a development pool and
         collect out-of-fold (OOF) probabilities with `StratifiedKFold`.
         The training part of each fold is passed through `maybe_smote` when
         both `BALANCE_IN_CV` and `USE_BALANCED_TRAIN` are set.
      2. Compute the OOF average precision and take the threshold returned by
         `find_best_threshold(..., metric="f1")` on the OOF probabilities.
      3. Refit on `X_train` (through `maybe_smote` if `USE_BALANCED_TRAIN`),
         using `X_val` as the early-stopping set, and evaluate on validation
         and test with the OOF threshold.
      4. Write the OOF predictions to parquet and append one row to
         `cv_summary.csv` under `DIRS[model_key]`.

    Parameters
    ----------
    model_key : str
        "xgb" or "lgbm" (validated by `_get_params_for`).
    tuned : bool, default True
        Forwarded to `_get_params_for`; also selects the TUNED/BASE tag used
        in file names, the summary row and log lines.
    n_splits : int
        Number of CV folds.

    Returns
    -------
    tuple
        `(oof, thr_oof, test_metrics)`.
    """
    variant = 'TUNED' if tuned else 'BASE'

    # Development pool = train + validation.
    X_dev = np.vstack([X_train, X_val])
    y_dev = np.concatenate([y_train, y_val])

    oof = np.zeros_like(y_dev, dtype=float)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    params = _get_params_for(model_key, tuned=tuned)

    log_prefix = f"[CV][{model_key.upper()}][{variant}]"
    resample_folds = BALANCE_IN_CV and USE_BALANCED_TRAIN

    for fold, (fit_idx, hold_idx) in enumerate(splitter.split(X_dev, y_dev), start=1):
        X_fit, y_fit = X_dev[fit_idx], y_dev[fit_idx]
        X_hold, y_hold = X_dev[hold_idx], y_dev[hold_idx]

        # Only the fitting part of the fold is resampled; the held-out part stays untouched.
        if resample_folds:
            X_fit, y_fit = maybe_smote(X_fit, y_fit, keep_idx=None)

        fold_model = _fit_adapter(model_key, params, X_fit, y_fit, X_hold, y_hold)
        oof[hold_idx] = fold_model.predict_proba(X_hold)[:, 1]
        print(f"{log_prefix} fold {fold}/{n_splits} listo.")

    ap_oof = average_precision_score(y_dev, oof)
    thr_oof, _ = find_best_threshold(y_dev, oof, metric="f1")

    # Final refit on the training split, early-stopped on the validation split.
    if USE_BALANCED_TRAIN:
        X_tr_final, y_tr_final = maybe_smote(X_train, y_train, keep_idx=None)
    else:
        X_tr_final, y_tr_final = X_train, y_train

    final_model = _fit_adapter(model_key, params, X_tr_final, y_tr_final, X_val, y_val)
    p_test = final_model.predict_proba(X_test)[:, 1]
    # Hard labels at the OOF threshold; computed here but not used further in this function.
    y_pred_test = (p_test >= thr_oof).astype(int)

    p_val = final_model.predict_proba(X_val)[:, 1]
    val_metrics = compute_all_metrics(y_val, p_val, thr_oof)
    test_metrics = compute_all_metrics(y_test, p_test, thr_oof)

    # Per-model output folders.
    model_dir = DIRS[model_key]
    preds_dir = model_dir / "preds"
    results_dir = model_dir / "results"

    # OOF predictions
    oof_frame = pd.DataFrame({"proba_oof": oof, "y_true": y_dev})
    oof_frame.to_parquet(preds_dir / f"oof_preds_{model_key}_{variant}.parquet", index=False)

    row = {
        "model": f"{model_key.upper()}_{VIEW_TAG}_{BAL_TAG}_{variant}_CV",
        "thr_val": float("nan"),
        "thr_oof": float(thr_oof),
        "thr_used": float(thr_oof),
        "oof_pr_auc": float(ap_oof),
        "val_pr_auc": val_metrics["pr_auc"],
        "val_f1": val_metrics["f1"],
        "test_pr_auc": test_metrics["pr_auc"],
        "test_f1": test_metrics["f1"]
    }

    # Append to the summary if it already exists; otherwise create it with a header.
    csv_cv = results_dir / "cv_summary.csv"
    appending = csv_cv.exists()
    pd.DataFrame([row]).to_csv(
        csv_cv,
        mode="a" if appending else "w",
        index=False,
        header=not appending,
    )

    print(f"{log_prefix} OOF AP={ap_oof:.6f} | thr_oof={thr_oof:.3f}")
    return oof, thr_oof, test_metrics

# Run the CV-OOF evaluation according to the experiment toggles.
if DO_CV_BASELINE:
    _ = _cv_oof("xgb", tuned=False)
    _ = _cv_oof("lgbm", tuned=False)

if DO_CV_TUNED:
    # The tuned-parameter variables may be None or may never have been
    # defined (tuning cell skipped). Looking them up through globals() skips
    # the tuned run in both cases instead of raising NameError.
    if globals().get("xgb_best_params") is not None:
        _ = _cv_oof("xgb", tuned=True)
    if globals().get("lgbm_best_params") is not None:
        _ = _cv_oof("lgbm", tuned=True)

=== MEJORES EN TEST (por métrica) ===
- test_pr_auc: XGB_FULL_SMOTENC | PR-AUC=0.7055 | ROC-AUC=0.8588 | F1=0.6108 | Recall=0.5823 | Precision=0.6423 | thr(val)=0.463 | best_iter=1984
- test_roc_auc: XGB_FULL_SMOTENC | PR-AUC=0.7055 | ROC-AUC=0.8588 | F1=0.6108 | Recall=0.5823 | Precision=0.6423 | thr(val)=0.463 | best_iter=1984
- test_recall: XGB_FULL_SMOTENC_TUNED | PR-AUC=0.7054 | ROC-AUC=0.8583 | F1=0.6075 | Recall=0.6560 | Precision=0.5657 | thr(val)=0.373 | best_iter=1962
- test_f1: XGB_FULL_SMOTENC | PR-AUC=0.7055 | ROC-AUC=0.8588 | F1=0.6108 | Recall=0.5823 | Precision=0.6423 | thr(val)=0.463 | best_iter=1984
- test_precision: XGB_FULL_SMOTENC | PR-AUC=0.7055 | ROC-AUC=0.8588 | F1=0.6108 | Recall=0.5823 | Precision=0.6423 | thr(val)=0.463 | best_iter=1984
=== RESUMEN CV-OOF (por experimento) ===
                          tag   pr_auc  roc_auc       f1   recall  bal_acc   thr
XGB_FULL_SMOTENC_BASELINE_CV5 0.692154 0.860440 0.619421 0.646770 0.766864 0.441
   XGB_FULL_SMOTENC_TUN

11 — Resumen consolidado

In [None]:
from glob import glob

def _read_csv_safepath(path):
    return pd.read_csv(path) if Path(path).exists() else pd.DataFrame()

frames = []

# Baseline results (one CSV per model family, including the ensemble).
for key in ["xgb","lgbm","ens"]:
    csv_path = DIRS[key] / "results" / "baselines.csv"
    if csv_path.exists():
        df = pd.read_csv(csv_path)
        df["where"] = key
        frames.append(df)

# Cross-validation summaries.
for key in ["xgb","lgbm"]:
    cv_path = DIRS[key] / "results" / "cv_summary.csv"
    if cv_path.exists():
        df = pd.read_csv(cv_path)
        df["where"] = f"{key}_cv"
        frames.append(df)

summary = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Keep only the report columns that are actually present, in this order.
summary_cols = [c for c in [
    "model","where",
    "thr_val","thr_oof","thr_used",
    "val_pr_auc","val_roc_auc","val_f1","val_precision","val_recall","val_bal_acc",
    "test_pr_auc","test_roc_auc","test_f1","test_precision","test_recall","test_bal_acc",
    "oof_pr_auc"
] if c in summary.columns]
summary = summary[summary_cols]

# Sort (descending) only by the ranking metrics that exist. Some inputs may be
# missing (e.g. no CV summary yet -> no "oof_pr_auc"), and sort_values raises
# KeyError for unknown column names.
sort_keys = [c for c in ["test_pr_auc","val_pr_auc","oof_pr_auc"] if c in summary.columns]
if sort_keys:
    summary = summary.sort_values(sort_keys, ascending=[False] * len(sort_keys))
summary = summary.reset_index(drop=True)

SUM_DIR = ARTIF_ROOT / "summary"
SUM_DIR.mkdir(parents=True, exist_ok=True)
summary_fp = SUM_DIR / f"summary_{VIEW_TAG}_{BAL_TAG}.csv"
summary.to_csv(summary_fp, index=False)

display(summary.head(20))
print(f"[OK][SUMMARY] Guardado en: {summary_fp}")

12 — Export para producción + loader utilitario

In [None]:
EXPORT_DIR = ARTIF_ROOT / "export"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Prefer the tuned models; fall back to the baseline ones when the tuned
# variable is missing OR is set to None (a plain dict.get default would not
# cover the second case).
_final_xgb = globals().get("xgb_tuned")
if _final_xgb is None:
    _final_xgb = globals().get("xgb_baseline_model")

_final_lgb = globals().get("lgbm_tuned")
if _final_lgb is None:
    _final_lgb = globals().get("lgbm_adapter")

# XGB export (best effort: a failure is reported but does not stop the cell).
try:
    if _final_xgb is not None:
        booster = _final_xgb.get_booster()
        booster.save_model(str(EXPORT_DIR / f"xgb_{VIEW_TAG}_{BAL_TAG}.json"))
        print("[EXPORT] XGB guardado como JSON.")
except Exception as e:
    print("[EXPORT][XGB] Aviso:", e)

# LGBM export (best effort as well).
try:
    if _final_lgb is not None and hasattr(_final_lgb, "booster_"):
        _final_lgb.booster_.save_model(str(EXPORT_DIR / f"lgbm_{VIEW_TAG}_{BAL_TAG}.txt"))
        print("[EXPORT] LGBM guardado como TXT.")
    elif _final_lgb is not None:
        # A model object exists but cannot be exported: say so instead of skipping silently.
        print("[EXPORT][LGBM] Aviso: el modelo no expone 'booster_'; no se exporta.")
except Exception as e:
    print("[EXPORT][LGBM] Aviso:", e)

# Ensemble weights: nothing is written here; this only reports the weights
# file if it already exists on disk.
ens_params = OUT_PARAMS_E / f"{EXP_NAME_ENS}_weights.json"
if ens_params.exists():
    print("[EXPORT] Ensamble (pesos) disponible en:", ens_params)

# === Loader utilitario ===
def load_xgb_lgbm(export_dir: Path = EXPORT_DIR):
    """Load the exported boosters and the ensemble configuration, if present.

    The model files are looked up in `export_dir` using the current
    `VIEW_TAG`/`BAL_TAG`; the ensemble weights file is looked up under
    `OUT_PARAMS_E`, independently of `export_dir`.

    Returns
    -------
    tuple
        `(xgb_booster, lgbm_booster, ens_cfg)`; each element is None when its
        file does not exist. `ens_cfg` is the parsed JSON content.
    """
    xgb_file = export_dir / f"xgb_{VIEW_TAG}_{BAL_TAG}.json"
    lgb_file = export_dir / f"lgbm_{VIEW_TAG}_{BAL_TAG}.txt"
    ens_file = OUT_PARAMS_E / f"{EXP_NAME_ENS}_weights.json"

    if xgb_file.exists():
        booster_xgb = xgb.Booster()
        booster_xgb.load_model(str(xgb_file))
    else:
        booster_xgb = None

    booster_lgb = lgb.Booster(model_file=str(lgb_file)) if lgb_file.exists() else None

    ens_cfg = json.loads(ens_file.read_text(encoding="utf-8")) if ens_file.exists() else None

    return booster_xgb, booster_lgb, ens_cfg

def predict_with_loaded(booster_xgb, booster_lgb, X, ens_cfg=None):
    """Predict with whichever loaded boosters are available.

    Parameters
    ----------
    booster_xgb, booster_lgb
        Loaded boosters, or None to skip that model.
    X
        Feature matrix; wrapped in an `xgb.DMatrix` (with the notebook-level
        `feature_names`) for XGBoost and passed as-is to LightGBM.
    ens_cfg : dict or None
        Ensemble configuration; read keys are "w_xgb" (default 0.5) and
        "method" (default "logit").

    Returns
    -------
    dict
        Keys "xgb" and/or "lgbm" with each booster's predictions, plus "ens"
        (from `ens_probs`) when `ens_cfg` is given and both models predicted.
    """
    preds = {}

    if booster_xgb is not None:
        dmat = xgb.DMatrix(X, feature_names=feature_names)
        preds["xgb"] = booster_xgb.predict(dmat)

    if booster_lgb is not None:
        preds["lgbm"] = booster_lgb.predict(X, num_iteration=booster_lgb.best_iteration)

    # Blending needs a config and both individual predictions.
    can_blend = ens_cfg is not None and "xgb" in preds and "lgbm" in preds
    if not can_blend:
        return preds

    weight = float(ens_cfg.get("w_xgb", 0.5))
    blend_method = ens_cfg.get("method", "logit")
    preds["ens"] = ens_probs(preds["xgb"], preds["lgbm"], w=weight, method=blend_method)
    return preds

# Final status line for this cell, printed after the export step and the loader helpers above.
print("[OK] Export y loader utilitario listos.")