1 — Imports, configuración y rutas

In [23]:
import json, os, warnings, time, re, glob
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    average_precision_score, precision_recall_curve, roc_auc_score, roc_curve,
    f1_score, recall_score, balanced_accuracy_score, confusion_matrix, precision_score
)
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import mutual_info_classif

# Balanceo
try:
    from imblearn.over_sampling import SMOTENC, SMOTE
    _HAS_IMBLEARN = True
except Exception:
    _HAS_IMBLEARN = False

# XGBoost
from xgboost import XGBClassifier

# === Toggles de experimento ===
USE_REDUCED = True                
SELECTION_MODE = "L1"             
USE_BALANCED_TRAIN = True         
BALANCE_IN_CV = True              
RANDOM_STATE = 42
DO_TUNE = True
DO_CV_BASELINE = True
DO_CV_TUNED = True
CV_FOLDS = 5

# Para MI
MI_TOPK = 30

# Hiperparámetros L1 estable
L1_C_GRID  = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10]
L1_KFOLDS  = 5
L1_P_KEEP  = 0.8                
L1_GROUP_COHERENCE = False         

# Calcular máscara L1
FORCE_REFIT_L1 = True

# === Nombres y rutas ===
ROOT = Path.cwd().parent
EXP_NAME = f"XGB_{'REDUCED' if (USE_REDUCED and SELECTION_MODE!='NONE') else 'FULL'}_{'SMOTENC' if USE_BALANCED_TRAIN else 'IMB'}"
ARTIF_DIR = ROOT / "artifacts" / EXP_NAME
OUT_RESULTS = ARTIF_DIR / "results"
OUT_FIGS    = ARTIF_DIR / "figs"
OUT_PREDS   = ARTIF_DIR / "preds"
OUT_PARAMS  = ARTIF_DIR / "best_params"
for p in [OUT_RESULTS, OUT_FIGS, OUT_PREDS, OUT_PARAMS]:
    p.mkdir(parents=True, exist_ok=True)

# Carpeta para artefactos de selección de features
SEL_DIR = OUT_PARAMS / "feature_selection"
SEL_DIR.mkdir(parents=True, exist_ok=True)

# Dataset preprocesado
DATA_DIR = ROOT / "preproc_datasets" / "full"

print("Exp:", EXP_NAME)
print("DATA_DIR:", DATA_DIR)
print("ARTIF_DIR:", ARTIF_DIR)

Exp: XGB_REDUCED_SMOTENC
DATA_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/preproc_datasets/full
ARTIF_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/XGB_REDUCED_SMOTENC


2 — Carga de artefactos (X, y, features)

In [24]:
def load_xy_full(dir_full: Path):
    X_train = np.load(dir_full / "X_train_full.npy")
    X_val   = np.load(dir_full / "X_val_full.npy")
    X_test  = np.load(dir_full / "X_test_full.npy")

    y_train = pd.read_parquet(dir_full / "y_train.parquet")["Exited"].to_numpy()
    y_val   = pd.read_parquet(dir_full / "y_val.parquet")["Exited"].to_numpy()
    y_test  = pd.read_parquet(dir_full / "y_test.parquet")["Exited"].to_numpy()

    feat = pd.read_parquet(dir_full / "feature_names_full.parquet")["feature"].tolist()
    return X_train, y_train, X_val, y_val, X_test, y_test, feat

X_train, y_train, X_val, y_val, X_test, y_test, feature_names = load_xy_full(DATA_DIR)
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("y train/val/test:", y_train.shape, y_val.shape, y_test.shape)
print("n features:", len(feature_names))

Shapes: (6000, 15) (2000, 15) (2000, 15)
y train/val/test: (6000,) (2000,) (2000,)
n features: 15


3 — Métricas, threshold y plots

In [25]:
def pr_auc(y_true, y_proba): 
    return float(average_precision_score(y_true, y_proba))

def roc_auc(y_true, y_proba): 
    return float(roc_auc_score(y_true, y_proba))

def find_best_threshold(y_true, y_proba, metric="f1"):
    thr_grid = np.linspace(0.0, 1.0, 1001)
    best_thr, best_score = 0.5, -1.0
    for thr in thr_grid:
        y_pred = (y_proba >= thr).astype(int)
        if metric == "f1":
            score = f1_score(y_true, y_pred, zero_division=0)
        elif metric == "recall":
            score = recall_score(y_true, y_pred, zero_division=0)
        else:
            raise ValueError("metric no soportada")
        if score > best_score:
            best_score, best_thr = score, thr
    return float(best_thr), float(best_score)

def compute_all_metrics(y_true, y_proba, thr):
    y_pred = (y_proba >= thr).astype(int)
    return {
        "pr_auc": pr_auc(y_true, y_proba),
        "roc_auc": roc_auc(y_true, y_proba),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "bal_acc": balanced_accuracy_score(y_true, y_pred)
    }

4 — Helpers MI Top-K y balanceo in-memory

In [26]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

def assert_X_names_aligned(X, names, context=""):
    if X.shape[1] != len(names):
        raise ValueError(
            f"[{context}] Desalineado: X.shape[1]={X.shape[1]} vs len(feature_names)={len(names)}. "
            f"Asegura pasar los nombres que correspondan a las columnas actuales de X."
        )

class _BoosterAdapter:
    def __init__(self, booster, params, best_iteration, feature_names=None):
        self._booster = booster
        self._params = dict(params)
        self.best_iteration = best_iteration
        self._feature_names = feature_names

    def predict_proba(self, X):
        d = xgb.DMatrix(X, feature_names=self._feature_names)
        if self.best_iteration is not None:
            pred = self._booster.predict(d, iteration_range=(0, int(self.best_iteration) + 1))
        else:
            pred = self._booster.predict(d)
        return np.column_stack([1.0 - pred, pred])

    def get_booster(self):
        return self._booster

    def get_params(self, deep=True):
        return dict(self._params)

def xgb_fit_with_es(
    sk_model, X_tr, y_tr, X_va, y_va,
    feature_names=None, rounds=200, verbose=False
):
    p = sk_model.get_params()
    n_estimators = p.pop("n_estimators", 1000)
    n_estimators = 1000 if n_estimators is None else int(n_estimators)

    seed = p.pop("random_state", p.pop("seed", 42))
    nthread = p.pop("n_jobs", None)
    if nthread is not None:
        p["nthread"] = nthread

    p.setdefault("seed", seed)
    p.setdefault("objective", "binary:logistic")
    p.setdefault("eval_metric", "aucpr")

    dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=feature_names)
    dvalid = xgb.DMatrix(X_va, label=y_va, feature_names=feature_names)

    evals = [(dtrain, "train"), (dvalid, "valid")]
    booster = xgb.train(
        params=p,
        dtrain=dtrain,
        num_boost_round=n_estimators,
        evals=evals,
        early_stopping_rounds=rounds,
        verbose_eval=verbose
    )

    best_iter = getattr(booster, "best_iteration", None)
    adapter = _BoosterAdapter(
        booster=booster,
        params={**sk_model.get_params(), "best_iteration": best_iter},
        best_iteration=best_iter,
        feature_names=feature_names
    )
    return adapter, best_iter


def apply_keep_idx(X, keep_idx):
    return X[:, np.array(keep_idx, dtype=int)]

def _group_from_name(feat_name: str) -> str:
    if feat_name.startswith("num__"):
        return feat_name
    if "_" in feat_name:
        return feat_name.rsplit("_", 1)[0]
    return feat_name

def _groups_indices(feature_names):
    groups = {}
    for i, f in enumerate(feature_names):
        g = _group_from_name(f)
        groups.setdefault(g, []).append(i)
    return groups

def fit_l1_selector(
    X, y, feature_names, C_grid=L1_C_GRID, kfolds=L1_KFOLDS,
    p_keep=L1_P_KEEP, group_coherence=L1_GROUP_COHERENCE, seed=42
):
    assert_X_names_aligned(X, feature_names, context="fit_l1_selector(INPUT)")

    kf = StratifiedKFold(n_splits=kfolds, shuffle=True, random_state=seed)
    n = X.shape[1]
    select_counts = np.zeros(n, dtype=int)
    rows = []

    for f, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        best_c, best_ap, best_coef = None, -1.0, None
        for C in C_grid:
            mdl = LogisticRegression(
                penalty="l1", solver="liblinear", class_weight="balanced",
                max_iter=5000, C=C, random_state=seed
            )
            mdl.fit(X_tr, y_tr)
            proba = mdl.predict_proba(X_va)[:, 1]
            ap = average_precision_score(y_va, proba)
            if ap > best_ap:
                best_ap, best_c, best_coef = ap, C, mdl.coef_.ravel()

        mask = (np.abs(best_coef) > 1e-12)
        select_counts += mask.astype(int)
        rows.append({"fold": f, "best_C": best_c, "ap_val": best_ap, "n_selected": int(mask.sum())})

    thr = int(np.ceil(kfolds * p_keep))
    keep_idx = np.where(select_counts >= thr)[0].tolist()

    if group_coherence and keep_idx:
        groups = _groups_indices(feature_names)
        keep_set = set(keep_idx)
        for g, idxs in groups.items():
            if keep_set.intersection(idxs):
                keep_set.update(idxs)
        keep_idx = sorted(list(keep_set))

    if len(keep_idx) == 0:
        N = max(1, min(10, max(1, n // 3)))
        order = np.argsort(select_counts)[::-1]
        prelim = order[:N].tolist()
        if select_counts[prelim[0]] == 0:
            try:
                mi = mutual_info_classif(X, y, random_state=seed)
                prelim = np.argsort(mi)[::-1][:N].tolist()
            except Exception:
                prelim = list(range(N))
        keep_idx = sorted(prelim)
        if group_coherence and keep_idx:
            groups = _groups_indices(feature_names)
            keep_set = set(keep_idx)
            for g, idxs in groups.items():
                if keep_set.intersection(idxs):
                    keep_set.update(idxs)
            keep_idx = sorted(list(keep_set))

    report = pd.DataFrame({
        "feature": feature_names,
        "selected_in_folds": select_counts,
        "p_keep": select_counts / kfolds
    }).sort_values(["p_keep","feature"], ascending=[False, True])

    folds_log = pd.DataFrame(rows)
    return keep_idx, report, folds_log

def save_selection_artifacts(mode_tag: str, keep_idx: list, feature_names: list, report_df: pd.DataFrame, folds_df: pd.DataFrame):
    np.save(SEL_DIR / f"keep_idx_{mode_tag}.npy", np.array(keep_idx, dtype=int))
    pd.DataFrame({"feature": [feature_names[i] for i in keep_idx]}).to_csv(SEL_DIR / f"kept_features_{mode_tag}.csv", index=False)
    report_df.to_csv(SEL_DIR / f"{mode_tag}_report_features.csv", index=False)
    folds_df.to_csv(SEL_DIR / f"{mode_tag}_cv_log.csv", index=False)

def load_keep_idx_if_exists(mode_tag: str):
    path = SEL_DIR / f"keep_idx_{mode_tag}.npy"
    return np.load(path) if path.exists() else None

def infer_categorical_indices(feat_names):

    return [i for i, n in enumerate(feat_names) if not str(n).startswith("num__")]

def maybe_smotenc(X, y, feat_names):

    if not _HAS_IMBLEARN:
        return X, y
    cat_idx = infer_categorical_indices(feat_names)
    try:
        if len(cat_idx) > 0:
            sampler = SMOTENC(categorical_features=cat_idx, random_state=RANDOM_STATE)
        else:
            sampler = SMOTE(random_state=RANDOM_STATE)
        X_res, y_res = sampler.fit_resample(X, y)
        return X_res, y_res
    except Exception:
        # Fallback robusto
        try:
            sampler = SMOTE(random_state=RANDOM_STATE)
            return sampler.fit_resample(X, y)
        except Exception:
            return X, y

5 — Hiperparámetros persistentes

In [27]:
VIEW_TAG = "REDUCED" if USE_REDUCED else "FULL"
BAL_TAG  = "SMOTENC" if USE_BALANCED_TRAIN else "IMB"
BEST_HP_FILE = OUT_PARAMS / f"BEST_XGB_{VIEW_TAG}_{BAL_TAG}.json"

def get_xgb_defaults(seed=RANDOM_STATE):
    mdl = XGBClassifier(
        random_state=seed,
        n_jobs=-1,
        eval_metric="aucpr",
        tree_method="hist",
        verbosity=0,
    )
    params = mdl.get_params()
    # normalizamos la clave de verbosity
    params.pop("verbose", None)
    params.setdefault("verbosity", 0)
    return params

def load_best_or_default():
    if BEST_HP_FILE.exists():
        try:
            best = json.loads(BEST_HP_FILE.read_text())
            print("[HP] Cargando mejores hiperparámetros previos:", BEST_HP_FILE.name)
            base = get_xgb_defaults()
            base.update(best)
            return base, True
        except Exception as e:
            print("[HP] Aviso: no se pudo leer BEST (uso defaults).", e)
    print("[HP] Usando hiperparámetros DEFAULT de XGB.")
    return get_xgb_defaults(), False

seed_params, loaded_best_flag = load_best_or_default()

[HP] Cargando mejores hiperparámetros previos: BEST_XGB_REDUCED_SMOTENC.json


6 — Entrenamiento BASELINE + umbral

In [28]:
from xgboost import XGBClassifier

seed_params = dict(seed_params)
seed_params.setdefault("random_state", RANDOM_STATE)
seed_params.setdefault("n_jobs", -1)
seed_params.setdefault("eval_metric", "aucpr")
seed_params.setdefault("tree_method", "hist")
seed_params.setdefault("verbosity", 0)
seed_params.pop("verbose", None)
seed_params["n_estimators"] = seed_params.get("n_estimators") or 1000
if seed_params.get("n_estimators") is None:
    seed_params.pop("n_estimators", None)

# --- Selección global ---
keep_idx_global = None
feature_names_used = feature_names
X_train_fit, X_val_fit, X_test_fit = X_train, X_val, X_test

if USE_REDUCED and SELECTION_MODE != "NONE":
    if SELECTION_MODE == "MI":
        keep_idx_global, _mi = fit_mi_selector(X_train, y_train, topk=MI_TOPK, seed=RANDOM_STATE)
        mode_tag = f"MI_top{MI_TOPK}"
        save_selection_artifacts(mode_tag, keep_idx_global, feature_names,
                                 pd.DataFrame({"_":"MI"}), pd.DataFrame({"_":"MI"}))
    elif SELECTION_MODE == "L1":
        mode_tag = "L1"
        prev = None if FORCE_REFIT_L1 else load_keep_idx_if_exists(mode_tag)
        if prev is not None and prev.size > 0:
            keep_idx_global = prev.astype(int).tolist()
            l1_report = pd.DataFrame({"note":["loaded_existing_mask"]})
            l1_folds  = pd.DataFrame({"note":["loaded_existing_mask"]})
        else:
            keep_idx_global, l1_report, l1_folds = fit_l1_selector(
                X_train, y_train, feature_names, seed=RANDOM_STATE
            )
            save_selection_artifacts(mode_tag, keep_idx_global, feature_names, l1_report, l1_folds)
        # Debug útil:
        kept_tmp = [feature_names[i] for i in keep_idx_global]
        dropped_tmp = [feature_names[i] for i in sorted(set(range(len(feature_names))) - set(keep_idx_global))]
        print(f"[SELECCIÓN {SELECTION_MODE}] Kept={len(kept_tmp)} | Dropped={len(dropped_tmp)}")
        print("[Kept]:", kept_tmp)
        print("[Dropped]:", dropped_tmp)
        try:
            print("\n[SELECCIÓN L1] Top por p_keep:")
            print(l1_report.sort_values("p_keep", ascending=False).head(15).to_string(index=False))
        except Exception:
            pass
    else:
        raise ValueError("SELECTION_MODE inválido")

    X_train_fit = apply_keep_idx(X_train, keep_idx_global)
    X_val_fit   = apply_keep_idx(X_val,   keep_idx_global)
    X_test_fit  = apply_keep_idx(X_test,  keep_idx_global)
    feature_names_used = [feature_names[i] for i in keep_idx_global]

# Checks de alineación
assert_X_names_aligned(X_train_fit, feature_names_used, "BASELINE(train)")
assert_X_names_aligned(X_val_fit,   feature_names_used, "BASELINE(val)")
assert_X_names_aligned(X_test_fit,  feature_names_used, "BASELINE(test)")

# --- Balanceo con SMOTENC ---
X_train_final, y_train_final = X_train_fit, y_train
if USE_BALANCED_TRAIN:
    X_train_final, y_train_final = maybe_smotenc(X_train_fit, y_train, feature_names_used)

# --- Entrenamiento con early stopping ---
model = XGBClassifier(**seed_params)
model, best_iter = xgb_fit_with_es(
    model,
    X_train_final, y_train_final,
    X_val_fit, y_val,
    feature_names=feature_names_used,
    rounds=200,
    verbose=False
)
print(f"[BASELINE] best_iteration: {best_iter}")

# --- Umbral óptimo por F1 en val ---
proba_val = model.predict_proba(X_val_fit)[:, 1]
thr_val, best_f1_val = find_best_threshold(y_val, proba_val, metric="f1")
print(f"[BASELINE] Mejor umbral (val) por F1: {thr_val:.3f} | F1(val)={best_f1_val:.4f}")

val_metrics = compute_all_metrics(y_val, proba_val, thr_val)
print("[BASELINE] Métricas val:", {k: (round(v,4) if isinstance(v,float) else v) for k,v in val_metrics.items()})

baseline = model
base_best_it = best_iter
tuned_model = None

# DEBUG — Resumen simple por feature
df_tr = pd.DataFrame(X_train, columns=feature_names)
y_ser = pd.Series(y_train, name="Exited")

print(f"[DEBUG] X_train shape: {df_tr.shape}")
for col in df_tr.columns:
    vc = df_tr[col].nunique()
    if vc <= 3 or set(np.unique(df_tr[col])).issubset({0,1}):
        counts = df_tr[col].value_counts(dropna=False).sort_index()
        pos_rate = y_ser[df_tr[col] == 1].mean() if 1 in df_tr[col].unique() else np.nan
        print(f"\n{col}: binaria/low-card (nunique={vc})")
        print(counts.to_string())
        print(f"  -> tasa de Exited cuando {col}=1: {pos_rate:.4f}" if not np.isnan(pos_rate) else "  -> sin 1s")
    else:
        print(f"\n{col}: numérica (nunique={vc})")
        desc = df_tr[col].describe(percentiles=[.05,.25,.5,.75,.95]).round(3)
        print(desc.to_string())

[SELECCIÓN L1] Kept=12 | Dropped=3
[Kept]: ['num__CreditScore', 'num__Age', 'num__Tenure', 'num__Balance', 'num__EstimatedSalary', 'Geography_1', 'Gender_1', 'HasCrCard_1', 'IsActiveMember_1', 'NumOfProducts_1', 'NumOfProducts_2', 'NumOfProducts_3']
[Dropped]: ['Geography_0', 'Geography_2', 'NumOfProducts_0']

[SELECCIÓN L1] Top por p_keep:
             feature  selected_in_folds  p_keep
            Gender_1                  5     1.0
         Geography_1                  5     1.0
         HasCrCard_1                  5     1.0
    IsActiveMember_1                  5     1.0
     NumOfProducts_1                  5     1.0
     NumOfProducts_2                  5     1.0
     NumOfProducts_3                  5     1.0
            num__Age                  5     1.0
        num__Balance                  5     1.0
    num__CreditScore                  5     1.0
num__EstimatedSalary                  5     1.0
         num__Tenure                  5     1.0
         Geography_2             

7 — Optimización incremental (Optuna)

In [29]:
import optuna
from optuna.samplers import TPESampler

tuned_model = None
N_TRIALS = 40
STUDY_NAME = f"XGB_{VIEW_TAG}_{BAL_TAG}_AP"
SAMPLER = TPESampler(seed=RANDOM_STATE, multivariate=True, group=False)
study = optuna.create_study(direction="maximize", study_name=STUDY_NAME, sampler=SAMPLER)

SEARCH_KEYS = [
    "learning_rate", "n_estimators", "max_depth", "min_child_weight",
    "subsample", "colsample_bytree", "gamma", "reg_alpha", "reg_lambda"
]

def suggest_xgb_params(trial):
    p = {}
    p["learning_rate"]    = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
    p["n_estimators"]     = trial.suggest_int("n_estimators", 800, 3000, step=50)
    p["max_depth"]        = trial.suggest_int("max_depth", 3, 10)
    p["min_child_weight"] = trial.suggest_float("min_child_weight", 0.5, 20.0, log=True)
    p["subsample"]        = trial.suggest_float("subsample", 0.6, 1.0)
    p["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    p["gamma"]            = trial.suggest_float("gamma", 1e-8, 5.0, log=True)
    p["reg_alpha"]        = trial.suggest_float("reg_alpha", 1e-6, 10.0, log=True)
    p["reg_lambda"]       = trial.suggest_float("reg_lambda", 1e-6, 10.0, log=True)
    p["random_state"]     = RANDOM_STATE
    p["n_jobs"]           = -1
    p["eval_metric"]      = "aucpr"
    p["tree_method"]      = "hist"
    p["verbosity"]        = 0
    return p

# Warm-start
if BEST_HP_FILE.exists():
    try:
        prev = json.loads(BEST_HP_FILE.read_text())
        warm = {k: prev[k] for k in SEARCH_KEYS if k in prev}
        if warm:
            print("[OPTUNA] Enqueuing previous BEST as a trial seed.")
            study.enqueue_trial(warm)
    except Exception as e:
        print("[OPTUNA] Aviso: no se pudo usar BEST para warm-start:", e)

def objective(trial):
    hp = suggest_xgb_params(trial)
    mdl = XGBClassifier(**{**seed_params, **hp})
    mdl, best_it = xgb_fit_with_es(
        mdl,
        X_train_final, y_train_final,
        X_val_fit, y_val,
        feature_names=feature_names_used,
        rounds=200,
        verbose=False
    )
    proba_val_t = mdl.predict_proba(X_val_fit)[:, 1]
    ap = average_precision_score(y_val, proba_val_t)
    trial.set_user_attr("best_iteration", best_it)
    return ap

print(f"[OPTUNA] Iniciando estudio '{STUDY_NAME}' con {N_TRIALS} pruebas...")
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

best = study.best_trial
print(f"[OPTUNA] Mejor AP(val): {best.value:.6f}")
print(f"[OPTUNA] Params ganadores:", best.params)
print(f"[OPTUNA] best_iteration (del trial):", best.user_attrs.get("best_iteration"))

best_params = dict(best.params)
best_params.update({
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
    "eval_metric": "aucpr",
    "tree_method": "hist",
    "verbosity": 0
})
with open(BEST_HP_FILE, "w", encoding="utf-8") as f:
    json.dump(best_params, f, indent=2, ensure_ascii=False)
print("[OPTUNA] Guardado BEST en:", BEST_HP_FILE.name)

tuned_model = XGBClassifier(**best_params)
tuned_model, best_it = xgb_fit_with_es(
    tuned_model,
    X_train_final, y_train_final,
    X_val_fit, y_val,
    feature_names=feature_names_used,
    rounds=200,
    verbose=False
)
print("[OPTUNA] Reentreno final completado. best_iteration =", best_it)

[I 2025-12-09 19:38:48,161] A new study created in memory with name: XGB_REDUCED_SMOTENC_AP


[OPTUNA] Enqueuing previous BEST as a trial seed.
[OPTUNA] Iniciando estudio 'XGB_REDUCED_SMOTENC_AP' con 40 pruebas...


[I 2025-12-09 19:38:50,824] Trial 0 finished with value: 0.6925914819343173 and parameters: {'learning_rate': 0.019853113568201724, 'n_estimators': 3000, 'max_depth': 4, 'min_child_weight': 0.7335742765363596, 'subsample': 0.7518096793246529, 'colsample_bytree': 0.6322009411676793, 'gamma': 0.0004329271635598477, 'reg_alpha': 1.4272322165807652e-05, 'reg_lambda': 0.0005801398193153851}. Best is trial 0 with value: 0.6925914819343173.
[I 2025-12-09 19:38:54,080] Trial 1 finished with value: 0.6782713853508637 and parameters: {'learning_rate': 0.008468008575248327, 'n_estimators': 2900, 'max_depth': 8, 'min_child_weight': 4.550475813202184, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'gamma': 3.200866785899844e-08, 'reg_alpha': 1.156732719914599, 'reg_lambda': 0.016136341713591334}. Best is trial 0 with value: 0.6925914819343173.
[I 2025-12-09 19:38:55,109] Trial 2 finished with value: 0.6680379232385867 and parameters: {'learning_rate': 0.05675206026988748, '

[OPTUNA] Mejor AP(val): 0.693817
[OPTUNA] Params ganadores: {'learning_rate': 0.03306272205342358, 'n_estimators': 1750, 'max_depth': 3, 'min_child_weight': 0.8056681762980753, 'subsample': 0.8493794437375066, 'colsample_bytree': 0.8266138222376377, 'gamma': 1.1473724321013295e-06, 'reg_alpha': 5.6264972100784e-06, 'reg_lambda': 5.631463295646268e-06}
[OPTUNA] best_iteration (del trial): 1081
[OPTUNA] Guardado BEST en: BEST_XGB_REDUCED_SMOTENC.json
[OPTUNA] Reentreno final completado. best_iteration = 1081


8 — Cross-Validation (OOF) para baseline y tuned

In [30]:
def run_oof_cv_xgb(model_params, X, y, feature_names_in, k_folds=CV_FOLDS, seed=RANDOM_STATE, exp_suffix="BASELINE"):
    assert_X_names_aligned(X, feature_names_in, f"CV(INPUT)-{exp_suffix}")

    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    oof_proba = np.zeros_like(y, dtype=float)
    fold_rows = []

    base = dict(model_params)
    base.pop("verbose", None)
    base.setdefault("verbosity", 0)
    base.setdefault("eval_metric", "aucpr")
    base.setdefault("tree_method", "hist")
    base.setdefault("random_state", seed)
    base.setdefault("n_jobs", -1)
    base["n_estimators"] = base.get("n_estimators") or 1000

    for f, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr0, X_va0 = X[tr_idx], X[va_idx]
        y_tr0, y_va0 = y[tr_idx], y[va_idx]

        feat_names_fold = list(feature_names_in)

        # --- Selección por fold ---
        if USE_REDUCED and SELECTION_MODE != "NONE":
            if SELECTION_MODE == "MI":
                keep_idx, _ = fit_mi_selector(X_tr0, y_tr0, topk=MI_TOPK, seed=seed)
            elif SELECTION_MODE == "L1":
                keep_idx, _rep, _log = fit_l1_selector(X_tr0, y_tr0, feat_names_fold, seed=seed)
            else:
                raise ValueError("SELECTION_MODE inválido")

            X_tr0 = apply_keep_idx(X_tr0, keep_idx)
            X_va0 = apply_keep_idx(X_va0, keep_idx)
            feat_names_fold = [feat_names_fold[i] for i in keep_idx]

        # --- Balanceo por fold con SMOTENC ---
        if BALANCE_IN_CV and USE_BALANCED_TRAIN:
            X_tr, y_tr = maybe_smotenc(X_tr0, y_tr0, feat_names_fold)
        else:
            X_tr, y_tr = X_tr0, y_tr0

        # Sanity checks
        assert_X_names_aligned(X_tr, feat_names_fold, f"CV(fold={f})-train")
        assert_X_names_aligned(X_va0, feat_names_fold, f"CV(fold={f})-val")

        mdl = XGBClassifier(**base)
        adapter, best_it = xgb_fit_with_es(
            mdl, X_tr, y_tr, X_va0, y_va0,
            feature_names=feat_names_fold,
            rounds=200, verbose=False
        )

        proba_va = adapter.predict_proba(X_va0)[:, 1]
        oof_proba[va_idx] = proba_va

        fold_rows.append({
            "fold": f,
            "pr_auc": average_precision_score(y_va0, proba_va),
            "roc_auc": roc_auc_score(y_va0, proba_va),
            "best_iteration": best_it if best_it is not None else np.nan
        })

    oof_pr = average_precision_score(y, oof_proba)
    oof_roc = roc_auc_score(y, oof_proba)
    thr_oof, _ = find_best_threshold(y, oof_proba, metric="f1")
    y_oof_pred = (oof_proba >= thr_oof).astype(int)
    oof_f1  = f1_score(y, y_oof_pred, zero_division=0)
    oof_rec = recall_score(y, y_oof_pred, zero_division=0)
    oof_bal = balanced_accuracy_score(y, y_oof_pred)

    cv_tag = f"{EXP_NAME}_{exp_suffix}_CV{CV_FOLDS}"
    cv_csv = OUT_RESULTS / f"cv_summary_{cv_tag}.csv"
    folds_df = pd.DataFrame(fold_rows)
    agg_row = pd.DataFrame([{
        "fold": "OOF", "pr_auc": oof_pr, "roc_auc": oof_roc,
        "thr": thr_oof, "f1": oof_f1, "recall": oof_rec, "bal_acc": oof_bal
    }])
    pd.concat([folds_df, agg_row], ignore_index=True).to_csv(cv_csv, index=False)

    oof_path = OUT_PREDS / f"oof_{cv_tag}.parquet"
    pd.DataFrame({"oof_proba": oof_proba, "y_true": y}).to_parquet(oof_path, index=False)

    print(f"[CV-{exp_suffix}] Guardados: {cv_csv.name} | {oof_path.name}")
    return {"oof_pr_auc": oof_pr, "oof_roc_auc": oof_roc, "thr": thr_oof,
            "oof_f1": oof_f1, "oof_recall": oof_rec, "oof_bal_acc": oof_bal}

cv_baseline = None
cv_tuned = None

if DO_CV_BASELINE:
    cv_baseline = run_oof_cv_xgb(seed_params, X_train_fit, y_train, feature_names_used, exp_suffix="BASELINE")

if DO_CV_TUNED and tuned_model is not None:
    cv_tuned = run_oof_cv_xgb(best_params, X_train_fit, y_train, feature_names_used, exp_suffix="TUNED")

[CV-BASELINE] Guardados: cv_summary_XGB_REDUCED_SMOTENC_BASELINE_CV5.csv | oof_XGB_REDUCED_SMOTENC_BASELINE_CV5.parquet
[CV-TUNED] Guardados: cv_summary_XGB_REDUCED_SMOTENC_TUNED_CV5.csv | oof_XGB_REDUCED_SMOTENC_TUNED_CV5.parquet


9 — Evaluación en test + guardados

In [31]:
def plot_pr_curve(y_true, y_proba, title, out_path):
    prec, rec, _ = precision_recall_curve(y_true, y_proba)
    ap = average_precision_score(y_true, y_proba)
    plt.figure(figsize=(6,5))
    plt.step(rec, prec, where='post')
    plt.xlabel('Recall'); plt.ylabel('Precision')
    plt.title(f'{title} (AP={ap:.4f})')
    plt.grid(True, linestyle='--', alpha=.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

def plot_roc_curve(y_true, y_proba, title, out_path):
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    auc = roc_auc_score(y_true, y_proba)
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, lw=2)
    plt.plot([0,1],[0,1], 'k--', lw=1)
    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
    plt.title(f'{title} (AUC={auc:.4f})')
    plt.grid(True, linestyle='--', alpha=.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

def plot_confusion(y_true, y_pred, title, out_path, normalize=False):
    norm = 'true' if normalize else None
    cm = confusion_matrix(y_true, y_pred, normalize=norm)
    plt.figure(figsize=(5,4))
    im = plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title(title)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    ticks = np.arange(2)
    plt.xticks(ticks, ['0','1']); plt.yticks(ticks, ['0','1'])
    thresh = cm.max()/2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            txt = f'{cm[i,j]:.2f}' if normalize else str(cm[i,j])
            plt.text(j, i, txt, ha='center', va='center',
                     color='white' if cm[i,j] > thresh else 'black')
    plt.ylabel('True label'); plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

def xgb_gain_importances(booster, feature_names):
    """Devuelve array de importancias 'gain' alineado a feature_names."""
    gain_dict = booster.get_score(importance_type="gain")
    name_to_idx = {n: i for i, n in enumerate(feature_names)}
    imp_gain = np.zeros(len(feature_names), dtype=float)
    for k, v in gain_dict.items():
        # k puede ser "f12" o el nombre real de la feature
        if k.startswith("f") and k[1:].isdigit():
            idx = int(k[1:])
        else:
            idx = name_to_idx.get(k, None)
        if idx is not None and 0 <= idx < len(imp_gain):
            imp_gain[idx] = v
    return imp_gain

# ——— Evaluación y guardados ———
base = EXP_NAME

# BASELINE
proba_test = model.predict_proba(X_test_fit)[:, 1]
y_pred_test = (proba_test >= thr_val).astype(int)
test_metrics = compute_all_metrics(y_test, proba_test, thr_val)

# Guardar HP baseline (seed y fitted)
params_seed_path = OUT_PARAMS / f"{base}_BASE_seed_params.json"
with open(params_seed_path, "w", encoding="utf-8") as f:
    json.dump(seed_params, f, indent=2, ensure_ascii=False)

params_fitted_path = OUT_PARAMS / f"{base}_BASE_fitted_params.json"
with open(params_fitted_path, "w", encoding="utf-8") as f:
    json.dump(model.get_params(), f, indent=2, ensure_ascii=False)

# Figuras baseline
plot_pr_curve(y_val,  proba_val,  f"{base} — PR (val)",  OUT_FIGS / f"{base}_pr_val.png")
plot_pr_curve(y_test, proba_test, f"{base} — PR (test)", OUT_FIGS / f"{base}_pr_test.png")
plot_roc_curve(y_val,  proba_val,  f"{base} — ROC (val)",  OUT_FIGS / f"{base}_roc_val.png")
plot_roc_curve(y_test, proba_test, f"{base} — ROC (test)", OUT_FIGS / f"{base}_roc_test.png")
plot_confusion(y_test, y_pred_test, f"{base} — Confusion (test @thr={thr_val:.3f})", OUT_FIGS / f"{base}_cm_test.png")

# Importancias baseline
try:
    booster = model.get_booster()
    imp_gain = xgb_gain_importances(booster, feature_names_used)
except Exception:
    imp_gain = np.zeros(len(feature_names_used))

imp_df = pd.DataFrame({
    "feature": feature_names_used[:len(imp_gain)],
    "importance_gain": imp_gain
}).sort_values("importance_gain", ascending=False)
imp_path = OUT_RESULTS / f"{base}_feature_importances.csv"
imp_df.to_csv(imp_path, index=False)

# Preds test baseline
preds_path = OUT_PREDS / f"preds_test_{base}.parquet"
pd.DataFrame({"proba": proba_test, "y_true": y_test}).to_parquet(preds_path, index=False)

best_iter_base = getattr(model, "best_iteration", getattr(model, "best_ntree_limit", None))
row_base = {
    "model": base,
    "thr_val": thr_val,
    "val_pr_auc": val_metrics["pr_auc"],
    "val_roc_auc": val_metrics["roc_auc"],
    "val_precision": val_metrics["precision"],
    "val_f1": val_metrics["f1"],
    "val_recall": val_metrics["recall"],
    "val_bal_acc": val_metrics["bal_acc"],
    "test_pr_auc": test_metrics["pr_auc"],
    "test_roc_auc": test_metrics["roc_auc"],
    "test_precision": test_metrics["precision"],
    "test_f1": test_metrics["f1"],
    "test_recall": test_metrics["recall"],
    "test_bal_acc": test_metrics["bal_acc"],
    "best_iteration": best_iter_base if best_iter_base is not None else np.nan
}
res_csv = OUT_RESULTS / "baselines.csv"
pd.DataFrame([row_base]).to_csv(res_csv, mode=("a" if res_csv.exists() else "w"),
                                index=False, header=not res_csv.exists())

print("[OK][BASE] Guardados:",
      "\n  - Seed HPs   :", params_seed_path.name,
      "\n  - Fitted HPs :", params_fitted_path.name,
      "\n  - Importancias:", imp_path.name,
      "\n  - Preds test  :", preds_path.name,
      "\n  - Baselines   :", res_csv.name)

# TUNED
if tuned_model is not None:
    proba_val_tuned = tuned_model.predict_proba(X_val_fit)[:, 1]
    thr_val_tuned, _ = find_best_threshold(y_val, proba_val_tuned, metric="f1")
    val_metrics_tuned = compute_all_metrics(y_val, proba_val_tuned, thr_val_tuned)

    proba_test_tuned = tuned_model.predict_proba(X_test_fit)[:, 1]
    y_pred_test_tuned = (proba_test_tuned >= thr_val_tuned).astype(int)
    test_metrics_tuned = compute_all_metrics(y_test, proba_test_tuned, thr_val_tuned)

    tuned_fitted_path = OUT_PARAMS / f"{base}_TUNED_fitted_params.json"
    with open(tuned_fitted_path, "w", encoding="utf-8") as f:
        json.dump(tuned_model.get_params(), f, indent=2, ensure_ascii=False)

    base_t = base + "_TUNED"
    plot_pr_curve(y_val,  proba_val_tuned,  f"{base_t} — PR (val)",  OUT_FIGS / f"{base_t}_pr_val.png")
    plot_pr_curve(y_test, proba_test_tuned, f"{base_t} — PR (test)", OUT_FIGS / f"{base_t}_pr_test.png")
    plot_roc_curve(y_val,  proba_val_tuned,  f"{base_t} — ROC (val)",  OUT_FIGS / f"{base_t}_roc_val.png")
    plot_roc_curve(y_test, proba_test_tuned, f"{base_t} — ROC (test)", OUT_FIGS / f"{base_t}_roc_test.png")
    plot_confusion(y_test, y_pred_test_tuned, f"{base_t} — Confusion (test @thr={thr_val_tuned:.3f})", OUT_FIGS / f"{base_t}_cm_test.png")

    try:
        booster_t = tuned_model.get_booster()
        imp_gain_t = xgb_gain_importances(booster_t, feature_names_used)
    except Exception:
        imp_gain_t = np.zeros(len(feature_names_used))

    imp_t_path = OUT_RESULTS / f"{base_t}_feature_importances.csv"
    pd.DataFrame({
        "feature": feature_names_used[:len(imp_gain_t)],
        "importance_gain": imp_gain_t
    }).sort_values("importance_gain", ascending=False).to_csv(imp_t_path, index=False)
    preds_t_path = OUT_PREDS / f"preds_test_{base_t}.parquet"
    pd.DataFrame({"proba": proba_test_tuned, "y_true": y_test}).to_parquet(preds_t_path, index=False)

    best_iter_tuned = getattr(tuned_model, "best_iteration", getattr(tuned_model, "best_ntree_limit", None))
    row_t = {
        "model": base_t,
        "thr_val": thr_val_tuned,
        "val_pr_auc": val_metrics_tuned["pr_auc"],
        "val_roc_auc": val_metrics_tuned["roc_auc"],
        "val_precision": val_metrics_tuned["precision"],
        "val_f1": val_metrics_tuned["f1"],
        "val_recall": val_metrics_tuned["recall"],
        "val_bal_acc": val_metrics_tuned["bal_acc"],
        "test_pr_auc": test_metrics_tuned["pr_auc"],
        "test_roc_auc": test_metrics_tuned["roc_auc"],
        "test_precision": test_metrics_tuned["precision"],
        "test_f1": test_metrics_tuned["f1"],
        "test_recall": test_metrics_tuned["recall"],
        "test_bal_acc": test_metrics_tuned["bal_acc"],
        "best_iteration": best_iter_tuned
    }
    pd.DataFrame([row_t]).to_csv(res_csv, mode="a", index=False, header=False)

    print("[OK][TUNED] Guardados:",
          "\n  - Fitted HPs :", tuned_fitted_path.name,
          "\n  - Importancias:", imp_t_path.name,
          "\n  - Preds test  :", preds_t_path.name,
          "\n  - Baselines   :", res_csv.name)

[OK][BASE] Guardados: 
  - Seed HPs   : XGB_REDUCED_SMOTENC_BASE_seed_params.json 
  - Fitted HPs : XGB_REDUCED_SMOTENC_BASE_fitted_params.json 
  - Importancias: XGB_REDUCED_SMOTENC_feature_importances.csv 
  - Preds test  : preds_test_XGB_REDUCED_SMOTENC.parquet 
  - Baselines   : baselines.csv
[OK][TUNED] Guardados: 
  - Fitted HPs : XGB_REDUCED_SMOTENC_TUNED_fitted_params.json 
  - Importancias: XGB_REDUCED_SMOTENC_TUNED_feature_importances.csv 
  - Preds test  : preds_test_XGB_REDUCED_SMOTENC_TUNED.parquet 
  - Baselines   : baselines.csv


10 — Mejores resultados + resumen CV

In [32]:
AGGREGATE_ALL_RUNS = False

def safe(v, fmt=".4f"):
    try:
        return f"{float(v):{fmt}}"
    except Exception:
        return "NA"

base_csv = OUT_RESULTS / "baselines.csv"
if not base_csv.exists():
    raise FileNotFoundError(f"No existe {base_csv}")

df = pd.read_csv(base_csv)

needed = [
    "model","thr_val",
    "val_pr_auc","val_roc_auc","val_precision","val_f1","val_recall","val_bal_acc",
    "test_pr_auc","test_roc_auc","test_precision","test_f1","test_recall","test_bal_acc",
    "best_iteration"
]
for c in needed:
    if c not in df.columns:
        df[c] = pd.NA 


df = df[needed].copy()


num_cols = [c for c in needed if c not in ("model",)]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")


if AGGREGATE_ALL_RUNS:
    root_art = ARTIF_DIR.parent
    for p in (root_art).glob("XGB_*/results/baselines.csv"):
        if p == base_csv:
            continue
        try:
            d2 = pd.read_csv(p)
            for c in needed:
                if c not in d2.columns:
                    d2[c] = pd.NA
            d2 = d2[needed]
            for c in num_cols:
                d2[c] = pd.to_numeric(d2[c], errors="coerce")
            df = pd.concat([df, d2], ignore_index=True)
        except Exception:
            pass

if df.empty:
    raise ValueError("El dataframe de resultados está vacío.")

df = df.drop_duplicates(subset=["model"], keep="last").copy()

def best_by(metric):
    if metric not in df.columns or df[metric].dropna().empty:
        return None
    r = df.loc[df[metric].idxmax()]
    print(
        f"- {metric}: {r['model']} | "
        f"PR-AUC={safe(r['test_pr_auc'])} | "
        f"ROC-AUC={safe(r['test_roc_auc'])} | "
        f"F1={safe(r['test_f1'])} | "
        f"Recall={safe(r['test_recall'])} | "
        f"Precision={safe(r['test_precision'])} | "
        f"thr(val)={safe(r['thr_val'], '.3f')} | "
        f"best_iter={int(r['best_iteration']) if pd.notna(r['best_iteration']) else 'NA'}"
    )
    return r

print("=== MEJORES EN TEST (por métrica) ===")
winners = {}
for m in ["test_pr_auc","test_roc_auc","test_recall","test_f1","test_precision"]:
    w = best_by(m)
    if w is not None:
        winners[m] = w


cv_files = list(OUT_RESULTS.glob("cv_summary_*_CV*.csv"))
if cv_files:
    print("=== RESUMEN CV-OOF (por experimento) ===")
    rows = []
    for f in cv_files:
        tag = re.sub(r"^cv_summary_|\.csv$", "", f.name)
        cv = pd.read_csv(f)
        oof = cv.loc[cv["fold"] == "OOF"]
        if not oof.empty:
            r = oof.iloc[0]
            rows.append({
                "tag": tag,
                "pr_auc": r.get("pr_auc"),
                "roc_auc": r.get("roc_auc"),
                "f1": r.get("f1"),
                "recall": r.get("recall"),
                "bal_acc": r.get("bal_acc"),
                "thr": r.get("thr"),
            })
    if rows:
        print(pd.DataFrame(rows).sort_values(["pr_auc","roc_auc"], ascending=False).to_string(index=False))
else:
    print("(No se hallaron archivos de CV para este experimento)")

SOTA_XGB = {
    "AUC": 0.8512,        
    "Recall": None,       
    "Precision": None,    
    "source": "Shukla (2021), ICSCC — Kaggle Bank Churn (10k)"
}

if "test_roc_auc" in winners and winners["test_roc_auc"] is not None:
    bt = winners["test_roc_auc"]
    d_auc = float(bt["test_roc_auc"]) - SOTA_XGB["AUC"]
    print("=== COMPARACIÓN SOTA XGBOOST vs. MEJOR TEST ===")
    print(f"Paper XGBoost: AUC={SOTA_XGB['AUC']:.4f} | Recall={SOTA_XGB['Recall'] or 'N/R'} | Precision={SOTA_XGB['Precision'] or 'N/R'}")
    print(f"Tu mejor   : AUC={safe(bt['test_roc_auc'])} | Recall={safe(bt['test_recall'])} | Precision={safe(bt['test_precision'])}")
    print(f"Deltas     : ΔAUC={d_auc:+.4f}")
    print(f"Fuente SOTA: {SOTA_XGB['source']}")
else:
    print("No se pudo localizar el ganador por ROC-AUC para comparar contra SOTA.")

backup = OUT_RESULTS / "baselines_legacy_backup.csv"
base_csv.replace(backup)
df.to_csv(base_csv, index=False)
print("[OK] Normalizado. Backup:", backup.name)

=== MEJORES EN TEST (por métrica) ===
- test_pr_auc: XGB_REDUCED_SMOTENC_TUNED | PR-AUC=0.7090 | ROC-AUC=0.8597 | F1=0.6111 | Recall=0.6216 | Precision=0.6010 | thr(val)=0.481 | best_iter=1081
- test_roc_auc: XGB_REDUCED_SMOTENC | PR-AUC=0.7069 | ROC-AUC=0.8604 | F1=0.6145 | Recall=0.6462 | Precision=0.5857 | thr(val)=0.452 | best_iter=1182
- test_recall: XGB_REDUCED_SMOTENC | PR-AUC=0.7069 | ROC-AUC=0.8604 | F1=0.6145 | Recall=0.6462 | Precision=0.5857 | thr(val)=0.452 | best_iter=1182
- test_f1: XGB_REDUCED_SMOTENC | PR-AUC=0.7069 | ROC-AUC=0.8604 | F1=0.6145 | Recall=0.6462 | Precision=0.5857 | thr(val)=0.452 | best_iter=1182
- test_precision: XGB_REDUCED_SMOTENC_TUNED | PR-AUC=0.7090 | ROC-AUC=0.8597 | F1=0.6111 | Recall=0.6216 | Precision=0.6010 | thr(val)=0.481 | best_iter=1081
=== RESUMEN CV-OOF (por experimento) ===
                             tag   pr_auc  roc_auc       f1   recall  bal_acc   thr
   XGB_REDUCED_SMOTENC_TUNED_CV5 0.686903 0.857152 0.614623 0.659853 0.767544 0.

11 - Mejores resultados + resumen CV

In [33]:
print("\n=== REPORTE FINAL DE SELECCIÓN DE FEATURES ===")
if not USE_REDUCED or SELECTION_MODE == "NONE":
    print("Selección: DESACTIVADA (se usaron todas las features).")
else:
    mode_tag = ("MI_top" + str(MI_TOPK)) if SELECTION_MODE=="MI" else "L1"
    mask_path = SEL_DIR / f"keep_idx_{mode_tag}.npy"
    if not mask_path.exists():
        print(f"Aviso: no se encontró máscara en {mask_path}. ¿Ejecutaste la sección de selección?")
    else:
        keep_idx = np.load(mask_path).astype(int).tolist()
        kept_features = [feature_names[i] for i in keep_idx]
        dropped_idx = sorted(set(range(len(feature_names))) - set(keep_idx))
        dropped_features = [feature_names[i] for i in dropped_idx]

        # Guardados informativos
        pd.DataFrame({"feature": kept_features}).to_csv(SEL_DIR / f"final_kept_{mode_tag}.csv", index=False)
        pd.DataFrame({"feature": dropped_features}).to_csv(SEL_DIR / f"final_dropped_{mode_tag}.csv", index=False)

        print(f"[{mode_tag}] Total features originales: {len(feature_names)}")
        print(f"[{mode_tag}] Features conservadas: {len(kept_features)}")
        print(f"[{mode_tag}] Features eliminadas: {len(dropped_features)}")
        print("\nConservadas:")
        print(kept_features)
        print("\nEliminadas:")
        print(dropped_features)


=== REPORTE FINAL DE SELECCIÓN DE FEATURES ===
[L1] Total features originales: 15
[L1] Features conservadas: 12
[L1] Features eliminadas: 3

Conservadas:
['num__CreditScore', 'num__Age', 'num__Tenure', 'num__Balance', 'num__EstimatedSalary', 'Geography_1', 'Gender_1', 'HasCrCard_1', 'IsActiveMember_1', 'NumOfProducts_1', 'NumOfProducts_2', 'NumOfProducts_3']

Eliminadas:
['Geography_0', 'Geography_2', 'NumOfProducts_0']
