1 — Imports, config y rutas

In [1]:
import json, os, warnings, time, re
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Métricas / CV / Modelos
from sklearn.metrics import (
    average_precision_score, precision_recall_curve, roc_auc_score, roc_curve,
    f1_score, recall_score, balanced_accuracy_score, confusion_matrix, precision_score
)
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# Balanceo
try:
    from imblearn.over_sampling import SMOTENC, SMOTE
    _HAS_IMBLEARN = True
except Exception:
    _HAS_IMBLEARN = False

# Modelos base
import xgboost as xgb
from xgboost import XGBClassifier

import lightgbm as lgb
from lightgbm import LGBMClassifier

# Optuna
import optuna
from optuna.samplers import TPESampler

USE_REDUCED = True
USE_BALANCED_TRAIN = True
BALANCE_IN_CV = True
RANDOM_STATE = 42
DO_TUNE_XGB = True
DO_TUNE_LGBM = True
DO_CV_BASELINE = True
DO_CV_TUNED = True
CV_FOLDS = 5
MI_TOPK = 30

# --- Localizador ---
def _auto_project_root():
    env_root = os.environ.get("PROJECT_ROOT")
    if env_root and (Path(env_root)/"preproc_datasets"/"full").exists():
        return Path(env_root)
    
    here = Path.cwd().resolve()
    candidates = [here, here.parent, here.parent.parent, here.parent.parent.parent]
    for base in candidates:
        if (base/"preproc_datasets"/"full"/"X_train_full.npy").exists():
            return base
        if (base/"preproc_datasets"/"full").exists():
            return base

    home_fallback = Path.home() / "Downloads" / "TFE Churn Bancario"
    if (home_fallback/"preproc_datasets"/"full").exists():
        return home_fallback

    return here

ROOT = _auto_project_root()

# === Rutas ===
DATA_DIR = ROOT / "preproc_datasets" / "full"
ARTIF_ROOT = ROOT / "artifacts"

# Tags
VIEW_TAG = "REDUCED" if USE_REDUCED else "FULL"
BAL_TAG  = "SMOTENC" if USE_BALANCED_TRAIN else "IMB"

# Carpetas por modelo y ensamble
DIRS = {
    "xgb":   ARTIF_ROOT / f"XGB_{VIEW_TAG}_{BAL_TAG}",
    "lgbm":  ARTIF_ROOT / f"LGBM_{VIEW_TAG}_{BAL_TAG}",
    "rf":    ARTIF_ROOT / f"RF_{VIEW_TAG}_{BAL_TAG}",
    "ens":   ARTIF_ROOT / f"ENS_{VIEW_TAG}_{BAL_TAG}",
}
for k, base in DIRS.items():
    for sub in ["results", "figs", "preds", "best_params", "export"]:
        (base / sub).mkdir(parents=True, exist_ok=True)

print("ROOT:     ", ROOT)
print("DATA_DIR: ", DATA_DIR)
print("ARTIF_DIR:", ARTIF_ROOT)
print({k: str(v) for k, v in DIRS.items()})

ROOT:      /Users/luistejada/Downloads/TFE Churn Bancario
DATA_DIR:  /Users/luistejada/Downloads/TFE Churn Bancario/preproc_datasets/full
ARTIF_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/artifacts
{'xgb': '/Users/luistejada/Downloads/TFE Churn Bancario/artifacts/XGB_REDUCED_SMOTENC', 'lgbm': '/Users/luistejada/Downloads/TFE Churn Bancario/artifacts/LGBM_REDUCED_SMOTENC', 'rf': '/Users/luistejada/Downloads/TFE Churn Bancario/artifacts/RF_REDUCED_SMOTENC', 'ens': '/Users/luistejada/Downloads/TFE Churn Bancario/artifacts/ENS_REDUCED_SMOTENC'}


2 — Carga de datos y metadatos

In [2]:
def load_xy_full(dir_full: Path):
    expected = [
        dir_full / "X_train_full.npy",
        dir_full / "X_val_full.npy",
        dir_full / "X_test_full.npy",
        dir_full / "y_train.parquet",
        dir_full / "y_val.parquet",
        dir_full / "y_test.parquet",
        dir_full / "feature_names_full.parquet",
    ]
    missing = [p.name for p in expected if not p.exists()]
    if missing:
        listing = [p.name for p in dir_full.glob("*")]
        raise FileNotFoundError(
            f"No se encuentran estos archivos en {dir_full} -> {missing}\n"
            f"Contenido detectado: {listing}\n"
            f"Sugerencia: verifica que ROOT (impreso arriba) apunte a la carpeta del proyecto."
        )

    X_train = np.load(dir_full / "X_train_full.npy")
    X_val   = np.load(dir_full / "X_val_full.npy")
    X_test  = np.load(dir_full / "X_test_full.npy")

    y_train = pd.read_parquet(dir_full / "y_train.parquet")["Exited"].to_numpy()
    y_val   = pd.read_parquet(dir_full / "y_val.parquet")["Exited"].to_numpy()
    y_test  = pd.read_parquet(dir_full / "y_test.parquet")["Exited"].to_numpy()

    feature_names = pd.read_parquet(dir_full / "feature_names_full.parquet")["feature"].tolist()
    return X_train, y_train, X_val, y_val, X_test, y_test, feature_names

# Cargar datos
X_train, y_train, X_val, y_val, X_test, y_test, feature_names = load_xy_full(DATA_DIR)

print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("y:", y_train.shape, y_val.shape, y_test.shape)
print("n features:", len(feature_names))

# --- Metadatos para SMOTENC ---
def _read_feature_roles(dir_full: Path):
    candidates = [
        dir_full / "feature_roles_full.parquet",
        dir_full / "feature_roles.parquet",
        dir_full / "feature_meta_full.parquet",
        dir_full / "feature_meta.parquet",
        dir_full / "feature_types_full.parquet",
        dir_full / "feature_types.parquet",
        dir_full / "feature_meta.json",
    ]
    for p in candidates:
        if p.exists():
            if p.suffix == ".parquet":
                df = pd.read_parquet(p)
            elif p.suffix == ".json":
                obj = json.loads(p.read_text())
                if isinstance(obj, dict) and "features" in obj:
                    df = pd.DataFrame(obj["features"])
                else:
                    df = pd.DataFrame(obj)
            else:
                continue
            return df, p.name
    return None, None

def _build_cat_idx(feature_names, roles_df):
    if roles_df is None or len(roles_df) == 0:
        return []
    df = roles_df.copy()
    df.columns = [str(c).lower() for c in df.columns]
    if "feature" not in df.columns:
        if "name" in df.columns:
            df["feature"] = df["name"]
        else:
            return []
    cat_names = set()
    if "role" in df.columns:
        cat_names = set(df.loc[df["role"].astype(str).str.lower().isin(
            ["cat","categorical","bin","binary","ordinal"]), "feature"])
    elif "dtype" in df.columns:
        cat_names = set(df.loc[df["dtype"].astype(str).str.lower().isin(
            ["category","categorical","object","bool"]), "feature"])
    elif "is_cat" in df.columns:
        cat_names = set(df.loc[df["is_cat"].astype(bool), "feature"])
    else:
        return []
    idx = [i for i, f in enumerate(feature_names) if f in cat_names]
    return sorted(idx)

roles_df, meta_file = _read_feature_roles(DATA_DIR)
CAT_IDX_FULL = _build_cat_idx(feature_names, roles_df)
print(f"[META] Archivo detectado: {meta_file or 'N/D'} | columnas categóricas={len(CAT_IDX_FULL)}")
if CAT_IDX_FULL:
    print("[META] Ejemplo de índices categóricos:", CAT_IDX_FULL[:10], "...")
else:
    print("[META] No hay categóricas => caerá en SMOTE estándar.")

# === L1: cargar máscara desde XGB_REDUCED_SMOTENC ===
XGB_L1_FS_DIR = ARTIF_ROOT / "XGB_REDUCED_SMOTENC" / "best_params" / "feature_selection"
keep_idx_path = XGB_L1_FS_DIR / "keep_idx_L1.npy"

if not keep_idx_path.exists():
    raise FileNotFoundError(
        f"[L1] No se encontró keep_idx_L1.npy en {keep_idx_path}\n"
        f"Asegúrate de haber corrido antes el experimento XGB_REDUCED_SMOTENC con selección L1."
    )

# Índices de columnas a conservar según L1 (en el espacio FULL)
keep_idx_global = np.load(keep_idx_path).astype(int).tolist()
print(f"[L1] Máscara L1 cargada con {len(keep_idx_global)} features.")

# Helper para aplicar máscara
def apply_keep_idx(X, keep_idx):
    return X[:, keep_idx]

# Aplicar la máscara a los datasets FULL
X_train_fit = apply_keep_idx(X_train, keep_idx_global)
X_val_fit   = apply_keep_idx(X_val,   keep_idx_global)
X_test_fit  = apply_keep_idx(X_test,  keep_idx_global)

feature_names_used = [feature_names[i] for i in keep_idx_global]

print("[L1] Shapes reducidos:",
      "train", X_train_fit.shape,
      "| val", X_val_fit.shape,
      "| test", X_test_fit.shape)
print("[L1] Ejemplo de features usados:", feature_names_used[:15])

Shapes: (6000, 15) (2000, 15) (2000, 15)
y: (6000,) (2000,) (2000,)
n features: 15
[META] Archivo detectado: N/D | columnas categóricas=0
[META] No hay categóricas => caerá en SMOTE estándar.
[L1] Máscara L1 cargada con 12 features.
[L1] Shapes reducidos: train (6000, 12) | val (2000, 12) | test (2000, 12)
[L1] Ejemplo de features usados: ['num__CreditScore', 'num__Age', 'num__Tenure', 'num__Balance', 'num__EstimatedSalary', 'Geography_1', 'Gender_1', 'HasCrCard_1', 'IsActiveMember_1', 'NumOfProducts_1', 'NumOfProducts_2', 'NumOfProducts_3']


3 — Métricas, umbral, y plots

In [3]:
import platform
from pathlib import Path

def pr_auc(y_true, y_proba): 
    return float(average_precision_score(y_true, y_proba))

def roc_auc(y_true, y_proba): 
    return float(roc_auc_score(y_true, y_proba))

def find_best_threshold(y_true, y_proba, metric="f1"):
    thr_grid = np.linspace(0.0, 1.0, 1001)
    best_thr, best_score = 0.5, -1.0
    for thr in thr_grid:
        y_pred = (y_proba >= thr).astype(int)
        if metric == "f1":
            score = f1_score(y_true, y_pred, zero_division=0)
        elif metric == "recall":
            score = recall_score(y_true, y_pred, zero_division=0)
        else:
            raise ValueError("metric no soportada")
        if score > best_score:
            best_score, best_thr = score, thr
    return float(best_thr), float(best_score)

def compute_all_metrics(y_true, y_proba, thr):
    y_pred = (y_proba >= thr).astype(int)
    return {
        "pr_auc": pr_auc(y_true, y_proba),
        "roc_auc": roc_auc(y_true, y_proba),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "bal_acc": balanced_accuracy_score(y_true, y_pred)
    }

def plot_pr_curve(y_true, y_proba, title, out_path):
    prec, rec, _ = precision_recall_curve(y_true, y_proba)
    ap = average_precision_score(y_true, y_proba)
    plt.figure(figsize=(6,5))
    plt.step(rec, prec, where='post')
    plt.xlabel('Recall'); plt.ylabel('Precision')
    plt.title(f'{title} (AP={ap:.4f})')
    plt.grid(True, linestyle='--', alpha=.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

def plot_roc_curve(y_true, y_proba, title, out_path):
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    auc = roc_auc_score(y_true, y_proba)
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, lw=2)
    plt.plot([0,1],[0,1], 'k--', lw=1)
    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
    plt.title(f'{title} (AUC={auc:.4f})')
    plt.grid(True, linestyle='--', alpha=.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

def plot_confusion(y_true, y_pred, title, out_path, normalize=False):
    norm = 'true' if normalize else None
    cm = confusion_matrix(y_true, y_pred, normalize=norm)
    plt.figure(figsize=(5,4))
    im = plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title(title)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    ticks = np.arange(2)
    plt.xticks(ticks, ['0','1']); plt.yticks(ticks, ['0','1'])
    thresh = cm.max()/2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            txt = f'{cm[i,j]:.2f}' if normalize else str(cm[i,j])
            plt.text(j, i, txt, ha='center', va='center',
                     color='white' if cm[i,j] > thresh else 'black')
    plt.ylabel('True label'); plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

4 — Helpers de balanceo

In [4]:
def _map_cat_idx_for_keep(keep_idx, cat_idx_full):
    if not cat_idx_full:
        return []
    if keep_idx is None:
        return sorted(cat_idx_full)
    pos = {old_i: j for j, old_i in enumerate(keep_idx)}
    return sorted([pos[i] for i in cat_idx_full if i in pos])

def apply_keep_idx(X, keep_idx):
    return X[:, keep_idx]

def maybe_smote(X, y, keep_idx=None, random_state=RANDOM_STATE, k_neighbors=5):
    if not _HAS_IMBLEARN:
        print("[BAL] imbalanced-learn no disponible. Se omite balanceo.")
        return X, y
    y_int = y.astype(int)
    if y_int.max() == 0:
        print("[BAL] Solo 1 clase en y. Se omite balanceo.")
        return X, y
    counts = np.bincount(y_int)
    if len(counts) < 2 or counts.min() < 2:
        print("[BAL] Minoría < 2 muestras. Se omite balanceo.")
        return X, y
    k = int(max(1, min(5, counts.min() - 1)))
    cat_idx = _map_cat_idx_for_keep(keep_idx, CAT_IDX_FULL)
    if cat_idx:
        sm = SMOTENC(categorical_features=cat_idx, k_neighbors=k, random_state=random_state)
        kind = "SMOTENC"
    else:
        sm = SMOTE(k_neighbors=k, random_state=random_state)
        kind = "SMOTE"
    X_res, y_res = sm.fit_resample(X, y)
    try:
        X_res = X_res.astype(X.dtype, copy=False)
    except Exception:
        pass
    print(f"[BAL] {kind} aplicado | k_neighbors={k} | cat_cols={len(cat_idx)}")
    return X_res, y_res


def _json_dump(path, obj):
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)

def _floatify_metrics(d):
    return {k: float(v) for k, v in d.items()}

def write_minimal_loader(out_dir: Path):

    out_dir = Path(out_dir)
    code = r'''# -*- coding: utf-8 -*-
import json
from pathlib import Path
import numpy as np

def load_xgb_from_manifest(manifest_path):
    import xgboost as xgb
    mp = Path(manifest_path)
    m = json.loads(mp.read_text(encoding="utf-8"))
    mdl_file = mp.parent / m["files"]["model_json"]
    booster = xgb.Booster()
    booster.load_model(str(mdl_file))

    best_it = m.get("training", {}).get("best_iteration", None)
    feat_names = m["features"]["names"]

    def predict_proba(X):
        d = xgb.DMatrix(X, feature_names=feat_names)
        if best_it is not None and isinstance(best_it, int):
            p = booster.predict(d, iteration_range=(0, best_it+1))
        else:
            p = booster.predict(d)
        return np.column_stack([1.0 - p, p])

    return m, predict_proba

def load_lgbm_from_manifest(manifest_path):
    import lightgbm as lgb
    mp = Path(manifest_path)
    m = json.loads(mp.read_text(encoding="utf-8"))
    mdl_file = mp.parent / m["files"]["model_txt"]
    booster = lgb.Booster(model_file=str(mdl_file))

    feat_names = m["features"]["names"]
    best_it = m.get("training", {}).get("best_iteration", None)

    def predict_proba(X):
        p1 = booster.predict(X, num_iteration=best_it)
        return np.column_stack([1.0 - p1, p1])

    return m, predict_proba
'''
    (out_dir / "loader_example.py").write_text(code, encoding="utf-8")

def export_xgb(adapter, out_dir, exp_name, feature_names, threshold, val_metrics, test_metrics):

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    booster = adapter.get_booster()
    json_path = out_dir / f"{exp_name}.xgb.json"
    ubj_path  = out_dir / f"{exp_name}.xgb.ubj"

    booster.save_model(str(json_path))

    files = {"model_json": json_path.name}
    try:
        booster.save_model(str(ubj_path))
        files["model_ubj"] = ubj_path.name
    except Exception:
        pass

    try:
        import joblib
        joblib.dump(
            {"params": adapter.get_params(), "best_iteration": adapter.best_iteration},
            out_dir / f"{exp_name}.sk_params.joblib"
        )
        files["sk_params_joblib"] = f"{exp_name}.sk_params.joblib"
    except Exception:
        pass

    manifest = {
        "export_schema": "v1",
        "model_key": exp_name,
        "kind": "xgboost_binary_classifier",
        "framework": "xgboost",
        "framework_version": getattr(xgb, "__version__", "unknown"),
        "python_version": platform.python_version(),
        "files": files,
        "features": {
            "names": list(map(str, feature_names)),
            "dtype": "float32"
        },
        "inference": {
            "class_labels": [0, 1],
            "threshold": float(threshold)
        },
        "training": {
            "best_iteration": int(adapter.best_iteration) if adapter.best_iteration is not None else None
        },
        "metrics": {
            "validation": _floatify_metrics(val_metrics),
            "test": _floatify_metrics(test_metrics)
        }
    }
    _json_dump(out_dir / f"{exp_name}_MANIFEST.json", manifest)
    write_minimal_loader(out_dir)
    print(f"[EXPORT][XGB] {exp_name} -> {out_dir}")
    return manifest

def export_lgbm(adapter, out_dir, exp_name, feature_names, threshold, val_metrics, test_metrics):
    """
    Exporta:
      - Modelo LightGBM en TXT (formato de texto oficial).
      - Manifiesto JSON con nombres de features, umbral y métricas.
      - Loader de ejemplo.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    booster = adapter.booster_
    txt_path = out_dir / f"{exp_name}.lgbm.txt"
    booster.save_model(str(txt_path))

    manifest = {
        "export_schema": "v1",
        "model_key": exp_name,
        "kind": "lightgbm_binary_classifier",
        "framework": "lightgbm",
        "framework_version": getattr(lgb, "__version__", "unknown"),
        "python_version": platform.python_version(),
        "files": {
            "model_txt": txt_path.name
        },
        "features": {
            "names": list(map(str, feature_names)),
            "dtype": "float32"
        },
        "inference": {
            "class_labels": [0, 1],
            "threshold": float(threshold)
        },
        "training": {
            "best_iteration": int(adapter.best_iteration) if getattr(adapter, "best_iteration", None) is not None else None
        },
        "metrics": {
            "validation": _floatify_metrics(val_metrics),
            "test": _floatify_metrics(test_metrics)
        }
    }
    _json_dump(out_dir / f"{exp_name}_MANIFEST.json", manifest)
    write_minimal_loader(out_dir)
    print(f"[EXPORT][LGBM] {exp_name} -> {out_dir}")
    return manifest

5 — Adaptadores de entrenamiento con Early-Stopping (XGB & LGBM)

In [5]:
# --- XGBoost ---
class XGBAdapter:
    def __init__(self, booster, params, best_iteration, feature_names=None):
        self._booster = booster
        self._params = dict(params)
        self.best_iteration = best_iteration
        self._feature_names = feature_names

    def predict_proba(self, X):
        d = xgb.DMatrix(X, feature_names=self._feature_names)
        if self.best_iteration is not None:
            pred = self._booster.predict(d, iteration_range=(0, int(self.best_iteration)+1))
        else:
            pred = self._booster.predict(d)
        return np.column_stack([1.0 - pred, pred])

    def get_booster(self):
        return self._booster

    def get_params(self, deep=True):
        return dict(self._params)


def xgb_fit_with_es(sk_model, X_tr, y_tr, X_va, y_va, feature_names=None, rounds=200, verbose=False):
    p = sk_model.get_params()
    n_estimators = int(p.pop("n_estimators", 1000))
    seed = p.pop("random_state", p.pop("seed", RANDOM_STATE))
    nthread = p.pop("n_jobs", None)
    if nthread is not None:
        p["nthread"] = nthread
    p.setdefault("seed", seed)
    p.setdefault("objective", "binary:logistic")
    p.setdefault("eval_metric", "aucpr")
    dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=feature_names)
    dvalid = xgb.DMatrix(X_va, label=y_va, feature_names=feature_names)
    evals = [(dtrain, "train"), (dvalid, "valid")]
    booster = xgb.train(
        params=p, dtrain=dtrain, num_boost_round=n_estimators,
        evals=evals, early_stopping_rounds=rounds, verbose_eval=verbose
    )
    best_iter = getattr(booster, "best_iteration", None)
    return XGBAdapter(booster, {**sk_model.get_params(), "best_iteration": best_iter}, best_iter, feature_names)


def xgb_gain_importances(booster, feature_names):
    gain_dict = booster.get_score(importance_type="gain")
    name_to_idx = {n: i for i, n in enumerate(feature_names)}
    imp_gain = np.zeros(len(feature_names), dtype=float)
    for k, v in gain_dict.items():
        if k.startswith("f") and k[1:].isdigit():
            idx = int(k[1:])
        else:
            idx = name_to_idx.get(k, None)
        if idx is not None and 0 <= idx < len(imp_gain):
            imp_gain[idx] = v
    return imp_gain


# --- LightGBM ---
class LGBMAdapter:
    """Adapter consistente con .booster_ como atributo y .best_iteration."""
    def __init__(self, booster, params, best_iteration, feature_names=None):
        self.booster_ = booster
        self._params = dict(params)
        self.best_iteration = best_iteration
        self._feature_names = feature_names

    def predict_proba(self, X):
        p1 = self.booster_.predict(X, num_iteration=self.best_iteration)
        return np.column_stack([1.0 - p1, p1])

    def get_params(self, deep=True):
        return dict(self._params)


def lgbm_fit_with_es(sk_model, X_tr, y_tr, X_va, y_va, feature_names=None, rounds=200, verbose=False):
    p = dict(sk_model.get_params())
    num_boost_round = int(p.pop("n_estimators", 1000))

    # Defaults coherentes con AP
    lgb_params = {
        "objective": "binary",
        "metric": p.pop("metric", "average_precision"),
        "verbose": -1,
        "seed": p.pop("random_state", RANDOM_STATE),
    }

    # Mapear nombres scikit->lightgbm
    if "colsample_bytree" in p:
        lgb_params["feature_fraction"] = p.pop("colsample_bytree")
    if "subsample" in p:
        lgb_params["bagging_fraction"] = p.pop("subsample")
        lgb_params["bagging_freq"] = 1

    # Resto de hiperparámetros
    lgb_params.update(p)

    dtrain = lgb.Dataset(X_tr, label=y_tr, feature_name=feature_names, free_raw_data=True)
    dvalid = lgb.Dataset(X_va, label=y_va, feature_name=feature_names, reference=dtrain, free_raw_data=True)

    booster = lgb.train(
        params=lgb_params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        callbacks=[lgb.early_stopping(stopping_rounds=rounds, verbose=verbose)]
    )
    best_iter = booster.best_iteration
    return LGBMAdapter(booster, {**sk_model.get_params(), "best_iteration": best_iter}, best_iter, feature_names)

6 — Carga/seed de hiperparámetros y defaults

In [6]:
def get_xgb_defaults(seed=RANDOM_STATE):
    mdl = XGBClassifier(
        random_state=seed, n_jobs=-1, eval_metric="aucpr",
        tree_method="hist", verbosity=0
    )
    p = mdl.get_params()
    p.pop("verbose", None)
    p.setdefault("verbosity", 0)
    p.setdefault("n_estimators", 1000)
    return p

def get_lgbm_defaults(seed=RANDOM_STATE):
    mdl = LGBMClassifier(
        random_state=seed, n_estimators=1000, metric="average_precision"
    )
    return mdl.get_params()

def _best_file(model_key):
    return DIRS[model_key] / "best_params" / f"BEST_{model_key.upper()}_{VIEW_TAG}_{BAL_TAG}.json"

def load_best_or_default(model_key):
    best_fp = _best_file(model_key)
    if best_fp.exists():
        try:
            best = json.loads(best_fp.read_text())
            print(f"[HP][{model_key}] Cargando BEST previo:", best_fp.name)
            base = get_xgb_defaults() if model_key=="xgb" else get_lgbm_defaults()
            base.update(best)
            return base, True
        except Exception as e:
            print(f"[HP][{model_key}] No se pudo leer BEST. Uso defaults. {e}")
    print(f"[HP][{model_key}] Usando defaults.")
    return (get_xgb_defaults() if model_key=="xgb" else get_lgbm_defaults()), False

xgb_params_seed, _ = load_best_or_default("xgb")
lgbm_params_seed, _ = load_best_or_default("lgbm")

[HP][xgb] Cargando BEST previo: BEST_XGB_REDUCED_SMOTENC.json
[HP][lgbm] Cargando BEST previo: BEST_LGBM_REDUCED_SMOTENC.json


7 — Entrenamiento BASELINE (XGB y LGBM) + umbral + guardados

In [7]:
if "X_train_fit" not in globals():
    X_train_fit, X_val_fit, X_test_fit = X_train, X_val, X_test

if "feature_names_used" not in globals():
    feature_names_used = feature_names

if "keep_idx_global" not in globals():
    keep_idx_global = None  # sin L1

# Balanceo global para baseline
X_train_final, y_train_final = X_train_fit, y_train
if USE_BALANCED_TRAIN:
    X_train_final, y_train_final = maybe_smote(
        X_train_fit,
        y_train,
        keep_idx=keep_idx_global
    )

# ---- XGB BASELINE ----
xgb_seed = dict(xgb_params_seed)
xgb_seed.setdefault("n_estimators", xgb_seed.get("n_estimators", 1000))
xgb_seed.setdefault("random_state", RANDOM_STATE)
xgb_seed.setdefault("n_jobs", -1)
xgb_seed.setdefault("eval_metric", "aucpr")
xgb_seed.setdefault("tree_method", "hist")
xgb_seed.setdefault("verbosity", 0)
xgb_seed.pop("verbose", None)

xgb_baseline_model = XGBClassifier(**xgb_seed)
xgb_baseline_model = xgb_fit_with_es(
    xgb_baseline_model, X_train_final, y_train_final,
    X_val_fit, y_val, feature_names=feature_names_used,
    rounds=200, verbose=False
)

# Validación
proba_val_xgb = xgb_baseline_model.predict_proba(X_val_fit)[:,1]
thr_val_xgb, best_f1_val_xgb = find_best_threshold(y_val, proba_val_xgb, metric="f1")
val_metrics_xgb = compute_all_metrics(y_val, proba_val_xgb, thr_val_xgb)

# Test
proba_test_xgb = xgb_baseline_model.predict_proba(X_test_fit)[:,1]
y_pred_test_xgb = (proba_test_xgb >= thr_val_xgb).astype(int)
test_metrics_xgb = compute_all_metrics(y_test, proba_test_xgb, thr_val_xgb)

# === Guardados ===
EXP_NAME_XGB = f"XGB_{VIEW_TAG}_{BAL_TAG}"
bx = DIRS["xgb"]
OUT_RESULTS_X = bx / "results"
OUT_FIGS_X    = bx / "figs"
OUT_PREDS_X   = bx / "preds"
OUT_PARAMS_X  = bx / "best_params"
OUT_EXPORT_X  = bx / "export"

# HP seed y "fitted"
with open(OUT_PARAMS_X / f"{EXP_NAME_XGB}_BASE_seed_params.json", "w", encoding="utf-8") as f:
    json.dump(xgb_seed, f, indent=2, ensure_ascii=False)

with open(OUT_PARAMS_X / f"{EXP_NAME_XGB}_BASE_fitted_params.json", "w", encoding="utf-8") as f:
    json.dump(xgb_baseline_model.get_params(), f, indent=2, ensure_ascii=False)

# Plots
plot_pr_curve(y_val,  proba_val_xgb,  f"{EXP_NAME_XGB} — PR (val)",  OUT_FIGS_X / f"{EXP_NAME_XGB}_pr_val.png")
plot_pr_curve(y_test, proba_test_xgb, f"{EXP_NAME_XGB} — PR (test)", OUT_FIGS_X / f"{EXP_NAME_XGB}_pr_test.png")
plot_roc_curve(y_val,  proba_val_xgb,  f"{EXP_NAME_XGB} — ROC (val)",  OUT_FIGS_X / f"{EXP_NAME_XGB}_roc_val.png")
plot_roc_curve(y_test, proba_test_xgb, f"{EXP_NAME_XGB} — ROC (test)", OUT_FIGS_X / f"{EXP_NAME_XGB}_roc_test.png")
plot_confusion(y_test, y_pred_test_xgb,
               f"{EXP_NAME_XGB} — Confusion (test @thr={thr_val_xgb:.3f})",
               OUT_FIGS_X / f"{EXP_NAME_XGB}_cm_test.png")

# Importancias en el espacio L1-reducido
try:
    booster = xgb_baseline_model.get_booster()
    imp_gain_x = xgb_gain_importances(booster, feature_names_used)
except Exception:
    imp_gain_x = getattr(xgb_baseline_model, "feature_importances_", np.zeros(len(feature_names_used)))

pd.DataFrame(
    {"feature": feature_names_used[:len(imp_gain_x)], "importance_gain": imp_gain_x}
).sort_values("importance_gain", ascending=False)\
 .to_csv(OUT_RESULTS_X / f"{EXP_NAME_XGB}_feature_importances.csv", index=False)

# Preds test
pd.DataFrame({"proba": proba_test_xgb, "y_true": y_test}).to_parquet(
    OUT_PREDS_X / f"preds_test_{EXP_NAME_XGB}.parquet", index=False
)

# Registro baselines.csv
best_iter_base_xgb = getattr(xgb_baseline_model, "best_iteration", None)
row_base_xgb = {
    "model": EXP_NAME_XGB,
    "thr_val": thr_val_xgb,
    "val_pr_auc": val_metrics_xgb["pr_auc"],
    "val_roc_auc": val_metrics_xgb["roc_auc"],
    "val_precision": val_metrics_xgb["precision"],
    "val_f1": val_metrics_xgb["f1"],
    "val_recall": val_metrics_xgb["recall"],
    "val_bal_acc": val_metrics_xgb["bal_acc"],
    "test_pr_auc": test_metrics_xgb["pr_auc"],
    "test_roc_auc": test_metrics_xgb["roc_auc"],
    "test_precision": test_metrics_xgb["precision"],
    "test_f1": test_metrics_xgb["f1"],
    "test_recall": test_metrics_xgb["recall"],
    "test_bal_acc": test_metrics_xgb["bal_acc"],
    "best_iteration": best_iter_base_xgb if best_iter_base_xgb is not None else np.nan
}
csv_x = OUT_RESULTS_X / "baselines.csv"
pd.DataFrame([row_base_xgb]).to_csv(csv_x, mode=("a" if csv_x.exists() else "w"),
                                    index=False, header=not csv_x.exists())

# === EXPORT: XGB BASE ===
export_xgb(
    adapter=xgb_baseline_model,
    out_dir=OUT_EXPORT_X,
    exp_name=f"{EXP_NAME_XGB}_BASE",
    feature_names=feature_names_used,
    threshold=thr_val_xgb,
    val_metrics=val_metrics_xgb,
    test_metrics=test_metrics_xgb
)
print("[OK][XGB BASE] Guardados en", bx)

# ---- LGBM BASELINE ----
lgbm_seed = dict(lgbm_params_seed)
lgbm_seed.setdefault("n_estimators", lgbm_seed.get("n_estimators", 1000))
lgbm_seed.setdefault("random_state", RANDOM_STATE)
lgbm_seed.setdefault("objective", "binary")
if ("metric" not in lgbm_seed) or (lgbm_seed["metric"] in (None, "", [], "None")):
    lgbm_seed["metric"] = "average_precision"
lgbm_seed.setdefault("n_jobs", -1)
lgbm_seed.setdefault("verbosity", -1)

lgbm_baseline_model = LGBMClassifier(**lgbm_seed)
lgbm_adapter = lgbm_fit_with_es(
    lgbm_baseline_model, X_train_final, y_train_final,
    X_val_fit, y_val, feature_names=feature_names_used,
    rounds=200, verbose=False
)

# Validación
proba_val_lgb = lgbm_adapter.predict_proba(X_val_fit)[:,1]
thr_val_lgb, best_f1_val_lgb = find_best_threshold(y_val, proba_val_lgb, metric="f1")
val_metrics_lgb = compute_all_metrics(y_val, proba_val_lgb, thr_val_lgb)

# Test
proba_test_lgb = lgbm_adapter.predict_proba(X_test_fit)[:,1]
y_pred_test_lgb = (proba_test_lgb >= thr_val_lgb).astype(int)
test_metrics_lgb = compute_all_metrics(y_test, proba_test_lgb, thr_val_lgb)

# === Guardados ===
EXP_NAME_LGB = f"LGBM_{VIEW_TAG}_{BAL_TAG}"
bl = DIRS["lgbm"]
OUT_RESULTS_L = bl / "results"
OUT_FIGS_L    = bl / "figs"
OUT_PREDS_L   = bl / "preds"
OUT_PARAMS_L  = bl / "best_params"
OUT_EXPORT_L  = bl / "export"

with open(OUT_PARAMS_L / f"{EXP_NAME_LGB}_BASE_seed_params.json", "w", encoding="utf-8") as f:
    json.dump(lgbm_seed, f, indent=2, ensure_ascii=False)
with open(OUT_PARAMS_L / f"{EXP_NAME_LGB}_BASE_fitted_params.json", "w", encoding="utf-8") as f:
    json.dump(lgbm_baseline_model.get_params(), f, indent=2, ensure_ascii=False)

# Plots
plot_pr_curve(y_val,  proba_val_lgb,  f"{EXP_NAME_LGB} — PR (val)",  OUT_FIGS_L / f"{EXP_NAME_LGB}_pr_val.png")
plot_pr_curve(y_test, proba_test_lgb, f"{EXP_NAME_LGB} — PR (test)", OUT_FIGS_L / f"{EXP_NAME_LGB}_pr_test.png")
plot_roc_curve(y_val,  proba_val_lgb,  f"{EXP_NAME_LGB} — ROC (val)",  OUT_FIGS_L / f"{EXP_NAME_LGB}_roc_val.png")
plot_roc_curve(y_test, proba_test_lgb, f"{EXP_NAME_LGB} — ROC (test)", OUT_FIGS_L / f"{EXP_NAME_LGB}_roc_test.png")
plot_confusion(y_test, y_pred_test_lgb,
               f"{EXP_NAME_LGB} — Confusion (test @thr_used={thr_val_lgb:.3f})",
               OUT_FIGS_L / f"{EXP_NAME_LGB}_cm_test.png")

# Importancias LGBM (gain) en el espacio L1-reducido
try:
    imp_gain_l = lgbm_adapter.booster_.feature_importance(importance_type="gain")
except Exception:
    imp_gain_l = np.zeros(len(feature_names_used))

pd.DataFrame(
    {"feature": feature_names_used[:len(imp_gain_l)], "importance_gain": imp_gain_l}
).sort_values("importance_gain", ascending=False)\
 .to_csv(OUT_RESULTS_L / f"{EXP_NAME_LGB}_feature_importances.csv", index=False)

# Preds test
pd.DataFrame({"proba": proba_test_lgb, "y_true": y_test})\
  .to_parquet(OUT_PREDS_L / f"preds_test_{EXP_NAME_LGB}.parquet", index=False)

# Registro baselines.csv (LGBM)
row_base_lgb = {
    "model": EXP_NAME_LGB,
    "thr_val": thr_val_lgb,
    "thr_oof": np.nan,
    "thr_used": thr_val_lgb,
    "val_pr_auc": val_metrics_lgb["pr_auc"],
    "val_roc_auc": val_metrics_lgb["roc_auc"],
    "val_precision": val_metrics_lgb["precision"],
    "val_f1": val_metrics_lgb["f1"],
    "val_recall": val_metrics_lgb["recall"],
    "val_bal_acc": val_metrics_lgb["bal_acc"],
    "test_pr_auc": test_metrics_lgb["pr_auc"],
    "test_roc_auc": test_metrics_lgb["roc_auc"],
    "test_precision": test_metrics_lgb["precision"],
    "test_f1": test_metrics_lgb["f1"],
    "test_recall": test_metrics_lgb["recall"],
    "test_bal_acc": test_metrics_lgb["bal_acc"],
    "best_iteration": lgbm_adapter.best_iteration if hasattr(lgbm_adapter, "best_iteration") else np.nan
}
csv_l = OUT_RESULTS_L / "baselines.csv"
pd.DataFrame([row_base_lgb]).to_csv(
    csv_l,
    mode=("a" if csv_l.exists() else "w"),
    index=False,
    header=not csv_l.exists()
)

# === EXPORT: LGBM BASE ===
export_lgbm(
    adapter=lgbm_adapter,
    out_dir=OUT_EXPORT_L,
    exp_name=f"{EXP_NAME_LGB}_BASE",
    feature_names=feature_names_used,
    threshold=thr_val_lgb,
    val_metrics=val_metrics_lgb,
    test_metrics=test_metrics_lgb
)
print("[OK][LGBM BASE] Guardados en", bl)

[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[EXPORT][XGB] XGB_REDUCED_SMOTENC_BASE -> /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/XGB_REDUCED_SMOTENC/export
[OK][XGB BASE] Guardados en /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/XGB_REDUCED_SMOTENC
[EXPORT][LGBM] LGBM_REDUCED_SMOTENC_BASE -> /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/LGBM_REDUCED_SMOTENC/export
[OK][LGBM BASE] Guardados en /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/LGBM_REDUCED_SMOTENC


8 — Optuna incremental (XGB y LGBM), re-entreno TUNED + guardados

In [8]:
def tune_xgb_with_optuna(seed_params, X_train_final, y_train_final, X_val_fit, y_val, feature_names_used):
    """
    Tuning de XGBoost maximizando AP(val) con Optuna.
    Devuelve:
      - adapter entrenado (XGBAdapter)
      - best_params (dict con hiperparámetros finales)
    """
    N_TRIALS = 40
    STUDY_NAME = f"XGB_{VIEW_TAG}_{BAL_TAG}_AP"
    sampler = TPESampler(seed=RANDOM_STATE, multivariate=True, group=False)
    study = optuna.create_study(direction="maximize",
                                study_name=STUDY_NAME,
                                sampler=sampler)

    SEARCH_KEYS = [
        "learning_rate", "n_estimators", "max_depth", "min_child_weight",
        "subsample", "colsample_bytree", "gamma", "reg_alpha", "reg_lambda"
    ]

    best_fp = _best_file("xgb")
    if best_fp.exists():
        try:
            prev = json.loads(best_fp.read_text())
            warm = {k: prev[k] for k in SEARCH_KEYS if k in prev}
            if warm:
                print("[OPTUNA][XGB] Enqueue BEST anterior.")
                study.enqueue_trial(warm)
        except Exception as e:
            print("[OPTUNA][XGB] Aviso warm-start:", e)

    def suggest(trial):
        p = {}
        p["learning_rate"]    = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
        p["n_estimators"]     = trial.suggest_int("n_estimators", 800, 3000, step=50)
        p["max_depth"]        = trial.suggest_int("max_depth", 3, 10)
        p["min_child_weight"] = trial.suggest_float("min_child_weight", 0.5, 20.0, log=True)
        p["subsample"]        = trial.suggest_float("subsample", 0.6, 1.0)
        p["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 1.0)
        p["gamma"]            = trial.suggest_float("gamma", 1e-8, 5.0, log=True)
        p["reg_alpha"]        = trial.suggest_float("reg_alpha", 1e-6, 10.0, log=True)
        p["reg_lambda"]       = trial.suggest_float("reg_lambda", 1e-6, 10.0, log=True)
        # Fijos
        p["random_state"]     = RANDOM_STATE
        p["n_jobs"]           = -1
        p["eval_metric"]      = "aucpr"
        p["tree_method"]      = "hist"
        p["verbosity"]        = 0
        return p

    def objective(trial):
        hp = suggest(trial)
        mdl = XGBClassifier(**{**seed_params, **hp})
        mdl = xgb_fit_with_es(
            mdl,
            X_train_final, y_train_final,
            X_val_fit, y_val,
            feature_names=feature_names_used,
            rounds=200,
            verbose=False
        )
        proba_val = mdl.predict_proba(X_val_fit)[:, 1]
        ap = average_precision_score(y_val, proba_val)
        trial.set_user_attr("best_iteration", getattr(mdl, "best_iteration", None))
        return ap

    print(f"[OPTUNA][XGB] Iniciando {N_TRIALS} pruebas...")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

    best = study.best_trial
    print(f"[OPTUNA][XGB] Mejor AP(val): {best.value:.6f}")

    best_params = dict(best.params)
    best_params.update({
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        "eval_metric": "aucpr",
        "tree_method": "hist",
        "verbosity": 0,
    })

    # Guardamos BEST global
    with open(best_fp, "w", encoding="utf-8") as f:
        json.dump(best_params, f, indent=2, ensure_ascii=False)

    # Re-entrenamos con los mejores hiperparámetros
    tuned_clf = XGBClassifier(**{**seed_params, **best_params})
    tuned_adapter = xgb_fit_with_es(
        tuned_clf,
        X_train_final, y_train_final,
        X_val_fit, y_val,
        feature_names=feature_names_used,
        rounds=200,
        verbose=False
    )
    return tuned_adapter, best_params


def tune_lgbm_with_optuna(seed_params, X_train_final, y_train_final, X_val_fit, y_val, feature_names_used):
    N_TRIALS = 40
    STUDY_NAME = f"LGBM_{VIEW_TAG}_{BAL_TAG}_AP"
    sampler = TPESampler(seed=RANDOM_STATE, multivariate=True, group=False)
    study = optuna.create_study(direction="maximize",
                                study_name=STUDY_NAME,
                                sampler=sampler)

    SEARCH_KEYS = [
        "learning_rate", "n_estimators", "num_leaves", "max_depth",
        "min_child_samples", "subsample", "colsample_bytree",
        "reg_alpha", "reg_lambda"
    ]

    best_fp = _best_file("lgbm")
    if best_fp.exists():
        try:
            prev = json.loads(best_fp.read_text())
            warm = {k: prev[k] for k in SEARCH_KEYS if k in prev}
            if warm:
                print("[OPTUNA][LGBM] Enqueue BEST anterior.")
                study.enqueue_trial(warm)
        except Exception as e:
            print("[OPTUNA][LGBM] Aviso warm-start:", e)

    def suggest(trial):
        p = {}
        p["learning_rate"]     = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
        p["n_estimators"]      = trial.suggest_int("n_estimators", 800, 3000, step=50)
        p["num_leaves"]        = trial.suggest_int("num_leaves", 16, 256)
        p["max_depth"]         = trial.suggest_int("max_depth", -1, 12)
        p["min_child_samples"] = trial.suggest_int("min_child_samples", 5, 200)
        p["subsample"]         = trial.suggest_float("subsample", 0.6, 1.0)
        p["colsample_bytree"]  = trial.suggest_float("colsample_bytree", 0.6, 1.0)
        p["reg_alpha"]         = trial.suggest_float("reg_alpha", 1e-6, 10.0, log=True)
        p["reg_lambda"]        = trial.suggest_float("reg_lambda", 1e-6, 10.0, log=True)
        return p

    def objective(trial):
        hp = suggest(trial)
        params = {**seed_params, **hp}
        params.setdefault("objective", "binary")
        params.setdefault("metric", "average_precision")
        params.setdefault("random_state", RANDOM_STATE)
        params.setdefault("n_jobs", -1)

        base_clf = LGBMClassifier(**params)
        adapter = lgbm_fit_with_es(
            base_clf,
            X_train_final, y_train_final,
            X_val_fit, y_val,
            feature_names=feature_names_used,
            rounds=200,
            verbose=False
        )
        proba_val = adapter.predict_proba(X_val_fit)[:, 1]
        ap = average_precision_score(y_val, proba_val)
        trial.set_user_attr("best_iteration", adapter.best_iteration)
        return ap

    print(f"[OPTUNA][LGBM] Iniciando {N_TRIALS} pruebas...")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

    best = study.best_trial
    print(f"[OPTUNA][LGBM] Mejor AP(val): {best.value:.6f}")

    best_params = dict(best.params)
    best_params.update({
        "objective": "binary",
        "metric": "average_precision",
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
    })

    # Guardamos BEST global
    with open(best_fp, "w", encoding="utf-8") as f:
        json.dump(best_params, f, indent=2, ensure_ascii=False)

    tuned_base = LGBMClassifier(**{**seed_params, **best_params})
    tuned_adapter = lgbm_fit_with_es(
        tuned_base,
        X_train_final, y_train_final,
        X_val_fit, y_val,
        feature_names=feature_names_used,
        rounds=200,
        verbose=False
    )
    return tuned_adapter, best_params


# ============================================================
#   EJECUCIÓN DEL TUNING + RE-ENTRENO + GUARDADO DE preds_val_*
# ============================================================

# Por si acaso, definimos nombres base
EXP_NAME_XGB = f"XGB_{VIEW_TAG}_{BAL_TAG}"
EXP_NAME_LGB = f"LGBM_{VIEW_TAG}_{BAL_TAG}"

# Directorios para XGB
bx = DIRS["xgb"]
OUT_RESULTS_X = bx / "results"
OUT_FIGS_X    = bx / "figs"
OUT_PREDS_X   = bx / "preds"
OUT_PARAMS_X  = bx / "best_params"
OUT_EXPORT_X  = bx / "export"

# Directorios para LGBM
bl = DIRS["lgbm"]
OUT_RESULTS_L = bl / "results"
OUT_FIGS_L    = bl / "figs"
OUT_PREDS_L   = bl / "preds"
OUT_PARAMS_L  = bl / "best_params"
OUT_EXPORT_L  = bl / "export"

# === XGB TUNED ===
if DO_TUNE_XGB:
    xgb_tuned_adapter, xgb_best_params = tune_xgb_with_optuna(
        xgb_params_seed,
        X_train_final, y_train_final,
        X_val_fit, y_val,
        feature_names_used
    )
    xgb_tuned_model = xgb_tuned_adapter  

    # Predicciones y métricas en VAL
    proba_val_xgb_tuned = xgb_tuned_adapter.predict_proba(X_val_fit)[:, 1]
    thr_val_xgb_tuned, _ = find_best_threshold(y_val, proba_val_xgb_tuned, metric="f1")
    val_metrics_xgb_tuned = compute_all_metrics(y_val, proba_val_xgb_tuned, thr_val_xgb_tuned)

    # Predicciones y métricas en TEST
    proba_test_xgb_tuned = xgb_tuned_adapter.predict_proba(X_test_fit)[:, 1]
    test_metrics_xgb_tuned = compute_all_metrics(y_test, proba_test_xgb_tuned, thr_val_xgb_tuned)
    y_pred_test_xgb_tuned = (proba_test_xgb_tuned >= thr_val_xgb_tuned).astype(int)

    # --- GUARDAMOS preds_val_* y preds_test_* ---
    EXP_NAME_XGB_TUNED = f"{EXP_NAME_XGB}_TUNED"

    pd.DataFrame({"proba": proba_val_xgb_tuned, "y_true": y_val}).to_parquet(
        OUT_PREDS_X / f"preds_val_{EXP_NAME_XGB_TUNED}.parquet",
        index=False
    )
    pd.DataFrame({"proba": proba_test_xgb_tuned, "y_true": y_test}).to_parquet(
        OUT_PREDS_X / f"preds_test_{EXP_NAME_XGB_TUNED}.parquet",
        index=False
    )

    # Guardamos best_params específicos de TUNED
    with open(OUT_PARAMS_X / f"{EXP_NAME_XGB_TUNED}_best_params.json", "w", encoding="utf-8") as f:
        json.dump(xgb_best_params, f, indent=2, ensure_ascii=False)

    # Plots TUNED
    plot_pr_curve(y_val,  proba_val_xgb_tuned,
                  f"{EXP_NAME_XGB_TUNED} — PR (val)",
                  OUT_FIGS_X / f"{EXP_NAME_XGB_TUNED}_pr_val.png")
    plot_pr_curve(y_test, proba_test_xgb_tuned,
                  f"{EXP_NAME_XGB_TUNED} — PR (test)",
                  OUT_FIGS_X / f"{EXP_NAME_XGB_TUNED}_pr_test.png")
    plot_roc_curve(y_val,  proba_val_xgb_tuned,
                   f"{EXP_NAME_XGB_TUNED} — ROC (val)",
                   OUT_FIGS_X / f"{EXP_NAME_XGB_TUNED}_roc_val.png")
    plot_roc_curve(y_test, proba_test_xgb_tuned,
                   f"{EXP_NAME_XGB_TUNED} — ROC (test)",
                   OUT_FIGS_X / f"{EXP_NAME_XGB_TUNED}_roc_test.png")
    plot_confusion(
        y_test,
        y_pred_test_xgb_tuned,
        f"{EXP_NAME_XGB_TUNED} — Confusion (test @thr={thr_val_xgb_tuned:.3f})",
        OUT_FIGS_X / f"{EXP_NAME_XGB_TUNED}_cm_test.png"
    )

    # Registro tuned.csv
    best_iter_tuned_xgb = getattr(xgb_tuned_adapter, "best_iteration", None)
    row_tuned_xgb = {
        "model": EXP_NAME_XGB_TUNED,
        "thr_val": thr_val_xgb_tuned,
        "val_pr_auc": val_metrics_xgb_tuned["pr_auc"],
        "val_roc_auc": val_metrics_xgb_tuned["roc_auc"],
        "val_precision": val_metrics_xgb_tuned["precision"],
        "val_f1": val_metrics_xgb_tuned["f1"],
        "val_recall": val_metrics_xgb_tuned["recall"],
        "val_bal_acc": val_metrics_xgb_tuned["bal_acc"],
        "test_pr_auc": test_metrics_xgb_tuned["pr_auc"],
        "test_roc_auc": test_metrics_xgb_tuned["roc_auc"],
        "test_precision": test_metrics_xgb_tuned["precision"],
        "test_f1": test_metrics_xgb_tuned["f1"],
        "test_recall": test_metrics_xgb_tuned["recall"],
        "test_bal_acc": test_metrics_xgb_tuned["bal_acc"],
        "best_iteration": best_iter_tuned_xgb if best_iter_tuned_xgb is not None else np.nan,
    }
    csv_x_tuned = OUT_RESULTS_X / "tuned.csv"
    pd.DataFrame([row_tuned_xgb]).to_csv(
        csv_x_tuned,
        mode=("a" if csv_x_tuned.exists() else "w"),
        index=False,
        header=not csv_x_tuned.exists()
    )

    # Export XGB TUNED
    export_xgb(
        adapter=xgb_tuned_adapter,
        out_dir=OUT_EXPORT_X,
        exp_name=EXP_NAME_XGB_TUNED,
        feature_names=feature_names_used,
        threshold=thr_val_xgb_tuned,
        val_metrics=val_metrics_xgb_tuned,
        test_metrics=test_metrics_xgb_tuned,
    )


# === LGBM TUNED ===
if DO_TUNE_LGBM:
    lgbm_tuned_adapter, lgbm_best_params = tune_lgbm_with_optuna(
        lgbm_params_seed,
        X_train_final, y_train_final,
        X_val_fit, y_val,
        feature_names_used
    )
    # Alias de compatibilidad
    lgbm_tuned_model = lgbm_tuned_adapter  

    # Predicciones y métricas en VAL
    proba_val_lgb_tuned = lgbm_tuned_adapter.predict_proba(X_val_fit)[:, 1]
    thr_val_lgb_tuned, _ = find_best_threshold(y_val, proba_val_lgb_tuned, metric="f1")
    val_metrics_lgb_tuned = compute_all_metrics(y_val, proba_val_lgb_tuned, thr_val_lgb_tuned)

    # Predicciones y métricas en TEST
    proba_test_lgb_tuned = lgbm_tuned_adapter.predict_proba(X_test_fit)[:, 1]
    test_metrics_lgb_tuned = compute_all_metrics(y_test, proba_test_lgb_tuned, thr_val_lgb_tuned)
    y_pred_test_lgb_tuned = (proba_test_lgb_tuned >= thr_val_lgb_tuned).astype(int)

    EXP_NAME_LGB_TUNED = f"{EXP_NAME_LGB}_TUNED"

    pd.DataFrame({"proba": proba_val_lgb_tuned, "y_true": y_val}).to_parquet(
        OUT_PREDS_L / f"preds_val_{EXP_NAME_LGB_TUNED}.parquet",
        index=False
    )
    pd.DataFrame({"proba": proba_test_lgb_tuned, "y_true": y_test}).to_parquet(
        OUT_PREDS_L / f"preds_test_{EXP_NAME_LGB_TUNED}.parquet",
        index=False
    )

    with open(OUT_PARAMS_L / f"{EXP_NAME_LGB_TUNED}_best_params.json", "w", encoding="utf-8") as f:
        json.dump(lgbm_best_params, f, indent=2, ensure_ascii=False)

    # Plots TUNED
    plot_pr_curve(y_val,  proba_val_lgb_tuned,
                  f"{EXP_NAME_LGB_TUNED} — PR (val)",
                  OUT_FIGS_L / f"{EXP_NAME_LGB_TUNED}_pr_val.png")
    plot_pr_curve(y_test, proba_test_lgb_tuned,
                  f"{EXP_NAME_LGB_TUNED} — PR (test)",
                  OUT_FIGS_L / f"{EXP_NAME_LGB_TUNED}_pr_test.png")
    plot_roc_curve(y_val,  proba_val_lgb_tuned,
                   f"{EXP_NAME_LGB_TUNED} — ROC (val)",
                   OUT_FIGS_L / f"{EXP_NAME_LGB_TUNED}_roc_val.png")
    plot_roc_curve(y_test, proba_test_lgb_tuned,
                   f"{EXP_NAME_LGB_TUNED} — ROC (test)",
                   OUT_FIGS_L / f"{EXP_NAME_LGB_TUNED}_roc_test.png")
    plot_confusion(
        y_test,
        y_pred_test_lgb_tuned,
        f"{EXP_NAME_LGB_TUNED} — Confusion (test @thr={thr_val_lgb_tuned:.3f})",
        OUT_FIGS_L / f"{EXP_NAME_LGB_TUNED}_cm_test.png"
    )

    # Registro tuned.csv
    row_tuned_lgb = {
        "model": EXP_NAME_LGB_TUNED,
        "thr_val": thr_val_lgb_tuned,
        "val_pr_auc": val_metrics_lgb_tuned["pr_auc"],
        "val_roc_auc": val_metrics_lgb_tuned["roc_auc"],
        "val_precision": val_metrics_lgb_tuned["precision"],
        "val_f1": val_metrics_lgb_tuned["f1"],
        "val_recall": val_metrics_lgb_tuned["recall"],
        "val_bal_acc": val_metrics_lgb_tuned["bal_acc"],
        "test_pr_auc": test_metrics_lgb_tuned["pr_auc"],
        "test_roc_auc": test_metrics_lgb_tuned["roc_auc"],
        "test_precision": test_metrics_lgb_tuned["precision"],
        "test_f1": test_metrics_lgb_tuned["f1"],
        "test_recall": test_metrics_lgb_tuned["recall"],
        "test_bal_acc": test_metrics_lgb_tuned["bal_acc"],
        "best_iteration": lgbm_tuned_adapter.best_iteration if hasattr(lgbm_tuned_adapter, "best_iteration") else np.nan,
    }
    csv_l_tuned = OUT_RESULTS_L / "tuned.csv"
    pd.DataFrame([row_tuned_lgb]).to_csv(
        csv_l_tuned,
        mode=("a" if csv_l_tuned.exists() else "w"),
        index=False,
        header=not csv_l_tuned.exists()
    )

    # Export LGBM TUNED
    export_lgbm(
        adapter=lgbm_tuned_adapter,
        out_dir=OUT_EXPORT_L,
        exp_name=EXP_NAME_LGB_TUNED,
        feature_names=feature_names_used,
        threshold=thr_val_lgb_tuned,
        val_metrics=val_metrics_lgb_tuned,
        test_metrics=test_metrics_lgb_tuned,
    )

[I 2025-12-12 23:59:43,799] A new study created in memory with name: XGB_REDUCED_SMOTENC_AP


[OPTUNA][XGB] Enqueue BEST anterior.
[OPTUNA][XGB] Iniciando 40 pruebas...


[I 2025-12-12 23:59:44,959] Trial 0 finished with value: 0.6985261084210692 and parameters: {'learning_rate': 0.11606982623590888, 'n_estimators': 1900, 'max_depth': 3, 'min_child_weight': 0.9277474205244098, 'subsample': 0.7326900127737387, 'colsample_bytree': 0.8024358862026536, 'gamma': 4.187670208393132e-06, 'reg_alpha': 3.373539017033962e-06, 'reg_lambda': 2.261756498442988e-06}. Best is trial 0 with value: 0.6985261084210692.
[I 2025-12-12 23:59:49,028] Trial 1 finished with value: 0.6870151419929501 and parameters: {'learning_rate': 0.008468008575248327, 'n_estimators': 2900, 'max_depth': 8, 'min_child_weight': 4.550475813202184, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'gamma': 3.200866785899844e-08, 'reg_alpha': 1.156732719914599, 'reg_lambda': 0.016136341713591334}. Best is trial 0 with value: 0.6985261084210692.
[I 2025-12-12 23:59:50,048] Trial 2 finished with value: 0.6815970793795767 and parameters: {'learning_rate': 0.05675206026988748, 'n_

[OPTUNA][XGB] Mejor AP(val): 0.698526


[I 2025-12-13 00:01:11,909] A new study created in memory with name: LGBM_REDUCED_SMOTENC_AP


[EXPORT][XGB] XGB_REDUCED_SMOTENC_TUNED -> /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/XGB_REDUCED_SMOTENC/export
[OPTUNA][LGBM] Enqueue BEST anterior.
[OPTUNA][LGBM] Iniciando 40 pruebas...


[I 2025-12-13 00:01:13,295] Trial 0 finished with value: 0.6880474586286291 and parameters: {'learning_rate': 0.04163472304640577, 'n_estimators': 900, 'num_leaves': 20, 'max_depth': 4, 'min_child_samples': 33, 'subsample': 0.8509340279497106, 'colsample_bytree': 0.802267100716837, 'reg_alpha': 0.5255105942888963, 'reg_lambda': 0.003706120281152843}. Best is trial 0 with value: 0.6880474586286291.
[I 2025-12-13 00:01:18,635] Trial 1 finished with value: 0.686645800041543 and parameters: {'learning_rate': 0.008468008575248327, 'n_estimators': 2900, 'num_leaves': 192, 'max_depth': 7, 'min_child_samples': 35, 'subsample': 0.662397808134481, 'colsample_bytree': 0.6232334448672797, 'reg_alpha': 1.156732719914599, 'reg_lambda': 0.016136341713591334}. Best is trial 0 with value: 0.6880474586286291.
[I 2025-12-13 00:01:20,871] Trial 2 finished with value: 0.6797808136383815 and parameters: {'learning_rate': 0.05675206026988748, 'n_estimators': 800, 'num_leaves': 249, 'max_depth': 10, 'min_chil

[OPTUNA][LGBM] Mejor AP(val): 0.693821
[EXPORT][LGBM] LGBM_REDUCED_SMOTENC_TUNED -> /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/LGBM_REDUCED_SMOTENC/export


9 — Ensemble híbrido (stacking + soft voting XGB + LGBM + RF)

In [9]:
from sklearn.linear_model import LogisticRegression

EXP_NAME_ENS = f"ENS_{VIEW_TAG}_{BAL_TAG}"
be = DIRS["ens"]
OUT_RESULTS_E = be / "results"
OUT_FIGS_E    = be / "figs"
OUT_PREDS_E   = be / "preds"
OUT_PARAMS_E  = be / "best_params"

OUT_RESULTS_E.mkdir(parents=True, exist_ok=True)
OUT_FIGS_E.mkdir(parents=True, exist_ok=True)
OUT_PREDS_E.mkdir(parents=True, exist_ok=True)
OUT_PARAMS_E.mkdir(parents=True, exist_ok=True)

# --- Probabilidades XGB y LGBM (TUNED) ---
proba_val_xgb_tuned  = xgb_tuned_adapter.predict_proba(X_val_fit)[:, 1]
proba_test_xgb_tuned = xgb_tuned_adapter.predict_proba(X_test_fit)[:, 1]

proba_val_lgb_tuned  = lgbm_tuned_adapter.predict_proba(X_val_fit)[:, 1]
proba_test_lgb_tuned = lgbm_tuned_adapter.predict_proba(X_test_fit)[:, 1]

# --- Cargar preds de Random Forest REDUCED SMOTENC ---
EXP_NAME_RF = f"RF_{VIEW_TAG}_{BAL_TAG}"
br = DIRS["rf"]
rf_preds_val_path  = br / "preds" / f"preds_val_{EXP_NAME_RF}_TUNED.parquet"
rf_preds_test_path = br / "preds" / f"preds_test_{EXP_NAME_RF}_TUNED.parquet"

df_rf_val  = pd.read_parquet(rf_preds_val_path)
df_rf_test = pd.read_parquet(rf_preds_test_path)

proba_val_rf  = df_rf_val["proba"].to_numpy()
proba_test_rf = df_rf_test["proba"].to_numpy()

assert len(proba_val_rf)  == len(y_val),  "[RF] Longitud de preds val != y_val"
assert len(proba_test_rf) == len(y_test), "[RF] Longitud de preds test != y_test"

# --- SOFT VOTING XGB + LGBM + RF ---
proba_val_soft = (proba_val_xgb_tuned + proba_val_lgb_tuned + proba_val_rf) / 3.0
thr_val_soft, f1_val_soft = find_best_threshold(y_val, proba_val_soft, metric="f1")
metrics_val_soft  = compute_all_metrics(y_val,  proba_val_soft,  thr_val_soft)

proba_test_soft = (proba_test_xgb_tuned + proba_test_lgb_tuned + proba_test_rf) / 3.0
metrics_test_soft = compute_all_metrics(y_test, proba_test_soft, thr_val_soft)
y_pred_test_soft  = (proba_test_soft >= thr_val_soft).astype(int)

# --- STACKING ---
Z_val  = np.column_stack([proba_val_xgb_tuned,  proba_val_lgb_tuned,  proba_val_rf])
Z_test = np.column_stack([proba_test_xgb_tuned, proba_test_lgb_tuned, proba_test_rf])

stack_clf = LogisticRegression(
    penalty="l2",
    C=1.0,
    solver="liblinear",
    random_state=RANDOM_STATE
)
stack_clf.fit(Z_val, y_val)

proba_val_stack = stack_clf.predict_proba(Z_val)[:, 1]
thr_val_stack, f1_val_stack = find_best_threshold(y_val, proba_val_stack, metric="f1")
metrics_val_stack  = compute_all_metrics(y_val,  proba_val_stack,  thr_val_stack)

proba_test_stack = stack_clf.predict_proba(Z_test)[:, 1]
metrics_test_stack = compute_all_metrics(y_test, proba_test_stack, thr_val_stack)
y_pred_test_stack  = (proba_test_stack >= thr_val_stack).astype(int)

# --- Plots ---
plot_pr_curve(y_val,  proba_val_soft,
              f"{EXP_NAME_ENS}_SOFT — PR (val)",
              OUT_FIGS_E / f"{EXP_NAME_ENS}_SOFT_pr_val.png")
plot_pr_curve(y_test, proba_test_soft,
              f"{EXP_NAME_ENS}_SOFT — PR (test)",
              OUT_FIGS_E / f"{EXP_NAME_ENS}_SOFT_pr_test.png")

plot_pr_curve(y_val,  proba_val_stack,
              f"{EXP_NAME_ENS}_STACK — PR (val)",
              OUT_FIGS_E / f"{EXP_NAME_ENS}_STACK_pr_val.png")
plot_pr_curve(y_test, proba_test_stack,
              f"{EXP_NAME_ENS}_STACK — PR (test)",
              OUT_FIGS_E / f"{EXP_NAME_ENS}_STACK_pr_test.png")

plot_roc_curve(y_val,  proba_val_soft,
               f"{EXP_NAME_ENS}_SOFT — ROC (val)",
               OUT_FIGS_E / f"{EXP_NAME_ENS}_SOFT_roc_val.png")
plot_roc_curve(y_test, proba_test_soft,
               f"{EXP_NAME_ENS}_SOFT — ROC (test)",
               OUT_FIGS_E / f"{EXP_NAME_ENS}_SOFT_roc_test.png")

plot_roc_curve(y_val,  proba_val_stack,
               f"{EXP_NAME_ENS}_STACK — ROC (val)",
               OUT_FIGS_E / f"{EXP_NAME_ENS}_STACK_roc_val.png")
plot_roc_curve(y_test, proba_test_stack,
               f"{EXP_NAME_ENS}_STACK — ROC (test)",
               OUT_FIGS_E / f"{EXP_NAME_ENS}_STACK_roc_test.png")

plot_confusion(y_test, y_pred_test_soft,
               f"{EXP_NAME_ENS}_SOFT — Confusion (test @thr={thr_val_soft:.3f})",
               OUT_FIGS_E / f"{EXP_NAME_ENS}_SOFT_cm_test.png")

plot_confusion(y_test, y_pred_test_stack,
               f"{EXP_NAME_ENS}_STACK — Confusion (test @thr={thr_val_stack:.3f})",
               OUT_FIGS_E / f"{EXP_NAME_ENS}_STACK_cm_test.png")

# --- Resumen en JSON ---
ens_summary = {
    "model": EXP_NAME_ENS,
    "stack": {
        "thr_val": float(thr_val_stack),
        "metrics_val":  _floatify_metrics(metrics_val_stack),
        "metrics_test": _floatify_metrics(metrics_test_stack),
    },
    "soft_voting": {
        "thr_val": float(thr_val_soft),
        "metrics_val":  _floatify_metrics(metrics_val_soft),
        "metrics_test": _floatify_metrics(metrics_test_soft),
    }
}
_json_dump(OUT_RESULTS_E / f"{EXP_NAME_ENS}_metrics.json", ens_summary)

# --- CSV ensembles.csv ---
row_stack = {
    "model": f"{EXP_NAME_ENS}_STACK",
    "thr_val":        thr_val_stack,
    "val_pr_auc":     metrics_val_stack["pr_auc"],
    "val_roc_auc":    metrics_val_stack["roc_auc"],
    "val_precision":  metrics_val_stack["precision"],
    "val_f1":         metrics_val_stack["f1"],
    "val_recall":     metrics_val_stack["recall"],
    "val_bal_acc":    metrics_val_stack["bal_acc"],
    "test_pr_auc":    metrics_test_stack["pr_auc"],
    "test_roc_auc":   metrics_test_stack["roc_auc"],
    "test_precision": metrics_test_stack["precision"],
    "test_f1":        metrics_test_stack["f1"],
    "test_recall":    metrics_test_stack["recall"],
    "test_bal_acc":   metrics_test_stack["bal_acc"],
}

row_soft = {
    "model": f"{EXP_NAME_ENS}_SOFT",
    "thr_val":        thr_val_soft,
    "val_pr_auc":     metrics_val_soft["pr_auc"],
    "val_roc_auc":    metrics_val_soft["roc_auc"],
    "val_precision":  metrics_val_soft["precision"],
    "val_f1":         metrics_val_soft["f1"],
    "val_recall":     metrics_val_soft["recall"],
    "val_bal_acc":    metrics_val_soft["bal_acc"],
    "test_pr_auc":    metrics_test_soft["pr_auc"],
    "test_roc_auc":   metrics_test_soft["roc_auc"],
    "test_precision": metrics_test_soft["precision"],
    "test_f1":        metrics_test_soft["f1"],
    "test_recall":    metrics_test_soft["recall"],
    "test_bal_acc":   metrics_test_soft["bal_acc"],
}

csv_e = OUT_RESULTS_E / "ensembles.csv"
pd.DataFrame([row_stack, row_soft]).to_csv(
    csv_e,
    mode=("a" if csv_e.exists() else "w"),
    index=False,
    header=not csv_e.exists()
)

# --- Parámetros del meta-learner ---
stack_params = {
    "type": "LogisticRegression",
    "penalty": stack_clf.penalty,
    "C": float(stack_clf.C),
    "solver": stack_clf.solver,
    "feature_names_level1": ["xgb_tuned_proba", "lgbm_tuned_proba", "rf_proba"],
    "coef_": stack_clf.coef_[0].tolist(),
    "intercept_": float(stack_clf.intercept_[0]),
}
_json_dump(OUT_PARAMS_E / f"BEST_{EXP_NAME_ENS}.json", stack_params)

# --- Guardar preds ---
pd.DataFrame({
    "p_xgb":   proba_val_xgb_tuned,
    "p_lgbm":  proba_val_lgb_tuned,
    "p_rf":    proba_val_rf,
    "p_soft":  proba_val_soft,
    "p_stack": proba_val_stack,
    "y_true":  y_val,
}).to_parquet(OUT_PREDS_E / f"preds_val_{EXP_NAME_ENS}.parquet", index=False)

pd.DataFrame({
    "p_xgb":   proba_test_xgb_tuned,
    "p_lgbm":  proba_test_lgb_tuned,
    "p_rf":    proba_test_rf,
    "p_soft":  proba_test_soft,
    "p_stack": proba_test_stack,
    "y_true":  y_test,
}).to_parquet(OUT_PREDS_E / f"preds_test_{EXP_NAME_ENS}.parquet", index=False)

print("\n=== SOFT VOTING (val) ===")
print(metrics_val_soft)
print("\n=== SOFT VOTING (test) ===")
print(metrics_test_soft)

print("\n=== STACKING (val) ===")
print(metrics_val_stack)
print("\n=== STACKING (test) ===")
print(metrics_test_stack)

print("\n[OK][ENSEMBLE] Métricas, preds y pesos del modelo híbrido (XGB+LGBM+RF) guardados en:")
print("  RESULTS:", OUT_RESULTS_E)
print("  PREDS:  ", OUT_PREDS_E)
print("  PARAMS: ", OUT_PARAMS_E)


=== SOFT VOTING (val) ===
{'pr_auc': 0.6932527718875239, 'roc_auc': 0.8630433206704395, 'precision': 0.6631578947368421, 'f1': 0.6404066073697586, 'recall': 0.6191646191646192, 'bal_acc': 0.7694065405929813}

=== SOFT VOTING (test) ===
{'pr_auc': 0.7063962571891518, 'roc_auc': 0.8637913722659485, 'precision': 0.6349614395886889, 'f1': 0.6206030150753769, 'recall': 0.6068796068796068, 'bal_acc': 0.7588698097172673}

=== STACKING (val) ===
{'pr_auc': 0.6967103094287437, 'roc_auc': 0.8628304729999646, 'precision': 0.6929577464788732, 'f1': 0.6456692913385826, 'recall': 0.6044226044226044, 'bal_acc': 0.7679991239313273}

=== STACKING (test) ===
{'pr_auc': 0.7086679281606992, 'roc_auc': 0.8642263218534405, 'precision': 0.6539509536784741, 'f1': 0.6201550387596899, 'recall': 0.5896805896805897, 'bal_acc': 0.7549783990461956}

[OK][ENSEMBLE] Métricas, preds y pesos del modelo híbrido (XGB+LGBM+RF) guardados en:
  RESULTS: /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/ENS_REDUCED_S

10 - Stacking con OOF (CV=5)

In [10]:
DO_FULL_STACKING_OOF = True 

def _find_one(pattern, base=None):
    
    base = Path(base or ARTIF_ROOT)
    matches = list(base.rglob(pattern))
    if not matches:
        return None
    if len(matches) > 1:
        print(f"[WARN][_find_one] múltiples matches para patrón '{pattern}':")
        for m in matches:
            print("   -", m)
    return matches[0]

if DO_FULL_STACKING_OOF:
    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    def oof_preds_tree(model_builder, X, y, name):
        oof = np.zeros(len(y), dtype=float)
        models = []
        for fold, (tr, va) in enumerate(skf.split(X, y), 1):
            X_tr, y_tr = X[tr], y[tr]
            X_va, y_va = X[va], y[va]
            if USE_BALANCED_TRAIN:
                X_tr, y_tr = maybe_smote(X_tr, y_tr, keep_idx=keep_idx_global)
            mdl = model_builder()
            if name == "xgb":
                mdl = xgb_fit_with_es(
                    mdl,
                    X_tr, y_tr,
                    X_va, y_va,
                    feature_names=feature_names_used,
                    rounds=200,
                    verbose=False
                )
            else:
                mdl = lgbm_fit_with_es(
                    mdl,
                    X_tr, y_tr,
                    X_va, y_va,
                    feature_names=feature_names_used,
                    rounds=200,
                    verbose=False
                )
            oof[va] = mdl.predict_proba(X_va)[:, 1]
            models.append(mdl)
        return oof, models

    # --- Builders de los modelos base ---
    xgb_hp = xgb_best_params if ('xgb_best_params' in locals() and xgb_best_params is not None) else xgb_params_seed
    def build_xgb():
        return XGBClassifier(
            **{
                **xgb_hp,
                "n_jobs": -1,
                "eval_metric": "aucpr",
                "tree_method": "hist",
                "verbosity": 0,
            }
        )

    lgbm_hp = lgbm_best_params if ('lgbm_best_params' in locals() and lgbm_best_params is not None) else lgbm_params_seed
    def build_lgb():
        return LGBMClassifier(
            **{
                **lgbm_hp,
                "metric": "average_precision",
            }
        )

    # --- OOF XGB / LGBM sobre TRAIN+VAL (L1) ---
    X_all = np.vstack([X_train_fit, X_val_fit])
    y_all = np.concatenate([y_train, y_val])

    oof_xgb, xgb_models = oof_preds_tree(build_xgb, X_all, y_all, "xgb")
    oof_lgb, lgb_models = oof_preds_tree(build_lgb, X_all, y_all, "lgbm")

    # --- OOF RF ---
    rf_oof_fp = _find_one("oof_RF_*_TUNED_CV5.parquet") or _find_one("oof_RF_*_CV5.parquet")
    if rf_oof_fp is None:
        raise FileNotFoundError("[RF] No se encontró oof del RF (oof_RF_*_CV5.parquet).")

    df_oof_rf = pd.read_parquet(rf_oof_fp)
    print("[RF OOF] Columnas encontradas en el parquet:", df_oof_rf.columns)

    candidate_cols = [c for c in df_oof_rf.columns if c.lower() not in ("y_true", "y", "target", "label")]
    if len(candidate_cols) != 1:
        raise ValueError(f"[RF] No puedo identificar de forma única la columna de probas. Columnas: {df_oof_rf.columns}")
    proba_col_rf = candidate_cols[0]
    print(f"[RF OOF] Usando columna '{proba_col_rf}' como probabilidad.")

    oof_rf = df_oof_rf[proba_col_rf].to_numpy()

    # --- Alinear longitudes ---
    n_rf    = len(oof_rf)
    n_all   = len(y_all)
    n_train = len(y_train)

    if n_rf == n_all:
        oof_xgb_meta = oof_xgb
        oof_lgb_meta = oof_lgb
        y_meta = y_all
        print("[STACK OOF] Usando TRAIN+VAL para meta-learner (len =", n_rf, ").")
    elif n_rf == n_train:
        oof_xgb_meta = oof_xgb[:n_rf]
        oof_lgb_meta = oof_lgb[:n_rf]
        y_meta = y_train
        print("[STACK OOF] Usando SOLO TRAIN para meta-learner (len =", n_rf, ").")
    else:
        raise ValueError(
            f"[RF] Longitud OOF RF {n_rf} no coincide ni con TRAIN ({n_train}) ni con TRAIN+VAL ({n_all})."
        )

    X_meta_oof = np.column_stack([oof_xgb_meta, oof_lgb_meta, oof_rf])
    meta_oof = LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        C=1.0,
        solver="liblinear"
    )

    best_c, best_ap = None, -1.0
    for c in [0.01, 0.1, 1.0, 3.0, 10.0]:
        tmp = LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            C=c,
            solver="liblinear"
        )
        tmp.fit(X_meta_oof, y_meta)
        ap = average_precision_score(y_meta, tmp.predict_proba(X_meta_oof)[:, 1])
        if ap > best_ap:
            best_ap, best_c = ap, c

    meta_oof.set_params(C=best_c)
    meta_oof.fit(X_meta_oof, y_meta)
    print(f"[STACK OOF] Mejor C para meta-learner: {best_c:.4f} | AP OOF: {best_ap:.5f}")

    def fit_full_and_pred(models_builder, name):
        mdl = models_builder()
        if name == "xgb":
            mdl = xgb_fit_with_es(
                mdl,
                X_all, y_all,
                X_val_fit, y_val,
                feature_names=feature_names_used,
                rounds=200,
                verbose=False
            )
        else:
            mdl = lgbm_fit_with_es(
                mdl,
                X_all, y_all,
                X_val_fit, y_val,
                feature_names=feature_names_used,
                rounds=200,
                verbose=False
            )
        return mdl.predict_proba(X_test_fit)[:, 1]

    ptest_xgb_full = fit_full_and_pred(build_xgb, "xgb")
    ptest_lgb_full = fit_full_and_pred(build_lgb, "lgbm")

    # --- RF TEST ---
    if proba_test_rf is None:
        raise FileNotFoundError("[RF] Falta preds TEST del RF para completar stacking canónico.")

    X_meta_test = np.column_stack([ptest_xgb_full, ptest_lgb_full, proba_test_rf])

    X_meta_val = np.column_stack([proba_val_xgb_tuned, proba_val_lgb_tuned, proba_val_rf])
    p_val_stack_oof = meta_oof.predict_proba(X_meta_val)[:, 1]

    thr_stack_oof, _ = find_best_threshold(y_val, p_val_stack_oof, metric="f1")
    metrics_val_stack_oof = compute_all_metrics(y_val, p_val_stack_oof, thr_stack_oof)

    # --- PREDICCIONES STACK EN TEST ---
    p_test_stack_oof = meta_oof.predict_proba(X_meta_test)[:, 1]
    metrics_test_stack_oof = compute_all_metrics(y_test, p_test_stack_oof, thr_stack_oof)

    # --- GUARDAR PREDS VAL Y TEST ---
    pd.DataFrame({"proba": p_val_stack_oof, "y_true": y_val}).to_parquet(
        OUT_PREDS_E / "preds_val_ENS_STACK_OOF.parquet",
        index=False
    )

    pd.DataFrame({"proba": p_test_stack_oof, "y_true": y_test}).to_parquet(
        OUT_PREDS_E / "preds_test_ENS_STACK_OOF.parquet",
        index=False
    )

    with open(OUT_RESULTS_E / "ensemble_stack_oof_summary.json", "w") as f:
        json.dump(
            {
                "thr": float(thr_stack_oof),
                "val": {k: float(v) for k, v in metrics_val_stack_oof.items()},
                "test": {k: float(v) for k, v in metrics_test_stack_oof.items()},
            },
            f,
            indent=2,
        )

    print("[STACK OOF] OK — resultados de VAL y TEST guardados.")

[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[BAL] SMOTE aplicado | k_neighbors=5 | cat_cols=0
[WARN][_find_one] múltiples matches para patrón 'oof_RF_*_TUNED_CV5.parquet':
   - /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/RF_FULL_SMOTENC/preds/oof_RF_FULL_SMOTENC_TUNED_CV5.parquet
   - /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/RF_REDUCED_SMOTENC/preds/oof_RF_REDUCED_SMOTENC_TUNED_CV5.parquet
[RF OOF] Columnas encontradas en el parquet: Index(['oof_proba', 'y_true'], dtype='object')
[RF OOF] Usando columna 'oof_proba' como probabilidad.
[STACK OOF] Usando SOLO T