1 — Imports, configuración y rutas

In [18]:
import json, os, warnings, time, re, glob
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    average_precision_score, precision_recall_curve, roc_auc_score, roc_curve,
    f1_score, recall_score, balanced_accuracy_score, confusion_matrix, precision_score
)
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

# Balanceo (SMOTENC)
try:
    from imblearn.over_sampling import SMOTENC
    _HAS_IMBLEARN = True
except Exception:
    _HAS_IMBLEARN = False

# === Experiment toggles ===
USE_REDUCED = False              # True -> keep only the MI top-K features (see MI_TOPK)
USE_BALANCED_TRAIN = True        # True -> oversample the training split with SMOTENC
BALANCE_IN_CV = True             # True -> also apply SMOTENC inside each CV fold (only if USE_BALANCED_TRAIN)
RANDOM_STATE = 42                # seed shared by the model, CV splitter, MI selector, SMOTENC and Optuna sampler
DO_TUNE = True                   # NOTE(review): presumably gates the Optuna search — verify where it is read
DO_CV_BASELINE = True            # run out-of-fold CV with the seed/baseline hyperparameters
DO_CV_TUNED = True               # run out-of-fold CV with the tuned hyperparameters (if a tuned model exists)
CV_FOLDS = 5
MI_TOPK = 30                     # number of features kept when USE_REDUCED is True

# === Names and paths ===
# The project root is taken to be the parent of the current working directory.
ROOT = Path.cwd().parent
EXP_NAME = f"RF_{'REDUCED' if USE_REDUCED else 'FULL'}_{'SMOTENC' if USE_BALANCED_TRAIN else 'IMB'}"
ARTIF_DIR = ROOT / "artifacts" / EXP_NAME
OUT_RESULTS = ARTIF_DIR / "results"
OUT_FIGS    = ARTIF_DIR / "figs"
OUT_PREDS   = ARTIF_DIR / "preds"
OUT_PARAMS  = ARTIF_DIR / "best_params"
# Create every output folder up front so later cells can write without checking.
for p in [OUT_RESULTS, OUT_FIGS, OUT_PREDS, OUT_PARAMS]:
    p.mkdir(parents=True, exist_ok=True)

# Preprocessed dataset
DATA_DIR = ROOT / "preproc_datasets" / "full"

print("Exp:", EXP_NAME)
print("DATA_DIR:", DATA_DIR)
print("ARTIF_DIR:", ARTIF_DIR)

Exp: RF_FULL_SMOTENC
DATA_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/preproc_datasets/full
ARTIF_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/RF_FULL_SMOTENC


2 — Carga de artefactos (X, y, features)

In [19]:
def load_xy_full(dir_full: Path):
    """Load the preprocessed train/val/test splits and the feature names.

    Parameters
    ----------
    dir_full : Path
        Folder holding the ``X_*_full.npy`` matrices, the ``y_*.parquet``
        targets (column ``Exited``) and ``feature_names_full.parquet``
        (column ``feature``).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test, feat)`` where the
        ``X_*``/``y_*`` items are NumPy arrays and ``feat`` is a list.
    """
    matrix_files = ("X_train_full.npy", "X_val_full.npy", "X_test_full.npy")
    target_files = ("y_train.parquet", "y_val.parquet", "y_test.parquet")

    # Same read order as the file lists: matrices first, then targets.
    X_train, X_val, X_test = [np.load(dir_full / fname) for fname in matrix_files]
    y_train, y_val, y_test = [
        pd.read_parquet(dir_full / fname)["Exited"].to_numpy() for fname in target_files
    ]

    names_frame = pd.read_parquet(dir_full / "feature_names_full.parquet")
    feat = names_frame["feature"].tolist()
    return X_train, y_train, X_val, y_val, X_test, y_test, feat

# Load all splits once; later cells reuse these module-level arrays.
X_train, y_train, X_val, y_val, X_test, y_test, feature_names = load_xy_full(DATA_DIR)
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("y train/val/test:", y_train.shape, y_val.shape, y_test.shape)
print("n features:", len(feature_names))

Shapes: (6000, 15) (2000, 15) (2000, 15)
y train/val/test: (6000,) (2000,) (2000,)
n features: 15


3 — Métricas, threshold y utilidades

In [20]:
def pr_auc(y_true, y_proba):
    """Return scikit-learn's average precision score as a built-in ``float``."""
    score = average_precision_score(y_true, y_proba)
    return float(score)

def roc_auc(y_true, y_proba):
    """Return scikit-learn's ROC AUC score as a built-in ``float``."""
    score = roc_auc_score(y_true, y_proba)
    return float(score)

def find_best_threshold(y_true, y_proba, metric="f1"):
    """Grid-search the decision threshold that maximises ``metric``.

    Thresholds are scanned over 1001 evenly spaced values in [0, 1]; a sample
    is predicted positive when ``y_proba >= threshold``. Ties keep the lowest
    threshold, because only a strictly better score replaces the incumbent.

    Parameters
    ----------
    y_true : array-like of binary labels.
    y_proba : NumPy array of positive-class scores.
    metric : ``"f1"`` or ``"recall"``.

    Returns
    -------
    (float, float)
        Best threshold and the score reached at it.

    Raises
    ------
    ValueError
        If ``metric`` is neither ``"f1"`` nor ``"recall"``.
    """
    winner_thr, winner_score = 0.5, -1.0
    for candidate in np.linspace(0.0, 1.0, 1001):
        hard_labels = (y_proba >= candidate).astype(int)
        if metric == "f1":
            scorer = f1_score
        elif metric == "recall":
            scorer = recall_score
        else:
            raise ValueError("metric no soportada")
        current = scorer(y_true, hard_labels, zero_division=0)
        if current > winner_score:
            winner_thr, winner_score = candidate, current
    return float(winner_thr), float(winner_score)

def compute_all_metrics(y_true, y_proba, thr):
    """Compute ranking and thresholded metrics for one set of predictions.

    ``pr_auc`` and ``roc_auc`` use the raw scores; precision, F1, recall and
    balanced accuracy use hard labels obtained with ``y_proba >= thr``.

    Returns a dict with keys ``pr_auc``, ``roc_auc``, ``precision``, ``f1``,
    ``recall`` and ``bal_acc`` (in that order).
    """
    hard_labels = (y_proba >= thr).astype(int)

    report = {
        "pr_auc": pr_auc(y_true, y_proba),
        "roc_auc": roc_auc(y_true, y_proba),
    }
    report["precision"] = precision_score(y_true, hard_labels, zero_division=0)
    report["f1"] = f1_score(y_true, hard_labels, zero_division=0)
    report["recall"] = recall_score(y_true, hard_labels, zero_division=0)
    report["bal_acc"] = balanced_accuracy_score(y_true, hard_labels)
    return report

4 — Helpers: MI Top-K (opcional) y balanceo SMOTENC

In [21]:
def fit_mi_selector(X, y, topk=30, seed=RANDOM_STATE):
    """Rank features by mutual information with the target.

    All columns are passed to ``mutual_info_classif`` with
    ``discrete_features=False``.

    Returns
    -------
    (ndarray, ndarray)
        Indices of the ``topk`` highest-MI columns (best first) and the full
        vector of MI scores.
    """
    mi = mutual_info_classif(X, y, discrete_features=False, random_state=seed)
    ranked_desc = np.argsort(mi)[::-1]
    return ranked_desc[:topk], mi

def apply_keep_idx(X, keep_idx):
    """Return the columns of ``X`` selected by ``keep_idx``, in that order."""
    selected = X[:, keep_idx]
    return selected

def _infer_categorical_idx_from_names(feature_names_in):

    cats = []
    for i, n in enumerate(map(str, feature_names_in)):
        nlow = n.lower()
        if nlow.startswith("num__"):
            continue
        if nlow.startswith(("cat__", "ohe__", "bin__")) or not nlow.startswith("num__"):
            cats.append(i)
    return cats

def _load_categorical_names_from_metadata():

    # 1) Parquet
    p_parquet = DATA_DIR / "feature_metadata.parquet"
    if p_parquet.exists():
        try:
            meta = pd.read_parquet(p_parquet)
            cols = {c.lower() for c in meta.columns}
    
            name_col = "feature" if "feature" in cols else ("name" if "name" in cols else None)
            flag_col = "is_categorical" if "is_categorical" in cols else ("categorical" if "categorical" in cols else None)
            if name_col is None or flag_col is None:

                if "dtype" in cols:
                    meta.columns = [c.lower() for c in meta.columns]
                    cat_names = set(meta.loc[meta["dtype"].astype(str).str.contains("cat", case=False), "feature"])
                    return {str(x) for x in cat_names}
                return None
            # normaliza nombres de columnas
            meta.columns = [c.lower() for c in meta.columns]
            cat_names = set(meta.loc[meta[flag_col].astype(bool), "feature"])
            return {str(x) for x in cat_names}
        except Exception:
            pass

    # 2) JSON
    p_json = DATA_DIR / "feature_metadata.json"
    if p_json.exists():
        try:
            obj = json.loads(p_json.read_text())
            if isinstance(obj, dict):
                if "categorical_features" in obj and isinstance(obj["categorical_features"], (list, tuple)):
                    return {str(x) for x in obj["categorical_features"]}
                if "categorical_mask" in obj and isinstance(obj["categorical_mask"], (list, tuple)):
                    mask = list(map(bool, obj["categorical_mask"]))

                    p_full = DATA_DIR / "feature_names_full.parquet"
                    if p_full.exists():
                        full_names = pd.read_parquet(p_full)["feature"].tolist()
                        names = [str(n) for n, m in zip(full_names, mask) if m]
                        return set(names)
        except Exception:
            pass

    return None

def _categorical_idx_for(feature_names_in):
    """Return the positions of the categorical features in ``feature_names_in``.

    Metadata files are consulted first; when they yield no names, or none of
    them matches the given features, the name-prefix heuristic is used.
    """
    known_cats = _load_categorical_names_from_metadata()
    if known_cats:
        matches = [
            pos for pos, label in enumerate(feature_names_in) if str(label) in known_cats
        ]
        if matches:
            return matches
    # Fallback: infer from the feature names themselves.
    return _infer_categorical_idx_from_names(feature_names_in)

def maybe_smotenc(X, y, feature_names_in, seed=RANDOM_STATE):
    """Oversample ``(X, y)`` with SMOTENC when possible, else return them as-is.

    Best-effort: the function never raises. Every path that skips balancing
    prints a notice, so a run tagged as balanced cannot silently train on the
    original imbalanced data.

    Parameters
    ----------
    X, y : training matrix and labels.
    feature_names_in : names of the columns of ``X``, used to locate the
        categorical columns.
    seed : random state passed to SMOTENC.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``, or the inputs unchanged when
        balancing was skipped or failed.
    """
    if not _HAS_IMBLEARN:
        print("[SMOTENC] Aviso: imbalanced-learn no está disponible. Se usa train original.")
        return X, y
    if X is None or feature_names_in is None or len(feature_names_in) == 0:
        print("[SMOTENC] Aviso: faltan datos o nombres de features. Se usa train original.")
        return X, y
    try:
        cat_idx = _categorical_idx_for(feature_names_in)
        if not cat_idx:
            print("[SMOTENC] Aviso: no se detectaron features categóricas. Se usa train original.")
            return X, y
        sm = SMOTENC(categorical_features=cat_idx, random_state=seed)
        Xb, yb = sm.fit_resample(X, y)
        return Xb, yb
    except Exception as e:
        print("[SMOTENC] Aviso: no se pudo aplicar SMOTENC. Se usa train original.", e)
        return X, y

5 — Hiperparámetros persistentes (seed/best)

In [22]:
# Tags mirror the experiment toggles so each (view, balancing) combination
# gets its own persisted hyperparameter file.
VIEW_TAG = "REDUCED" if USE_REDUCED else "FULL"
BAL_TAG  = "SMOTENC" if USE_BALANCED_TRAIN else "IMB"
BEST_HP_FILE = OUT_PARAMS / f"BEST_RF_{VIEW_TAG}_{BAL_TAG}.json"

def get_rf_defaults(seed=RANDOM_STATE):
    """Return the full parameter dict of the notebook's default random forest.

    The dict comes from ``get_params()`` on a freshly built estimator, so it
    also contains the parameters that are not set explicitly here.
    """
    explicit = dict(
        random_state=seed,
        n_jobs=-1,
        class_weight=None,
        criterion="gini",
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features="sqrt",
        bootstrap=True,
    )
    return RandomForestClassifier(**explicit).get_params()

def load_best_or_default():
    """Return the starting hyperparameters and whether they came from disk.

    When ``BEST_HP_FILE`` exists and parses, its values override the defaults
    and the flag is ``True``. Otherwise (file missing or unreadable) the
    defaults are returned with ``False``.
    """
    if BEST_HP_FILE.exists():
        try:
            stored = json.loads(BEST_HP_FILE.read_text())
            print("[HP] Cargando mejores hiperparámetros previos:", BEST_HP_FILE.name)
            merged = get_rf_defaults()
            merged.update(stored)
        except Exception as e:
            print("[HP] Aviso: no se pudo leer BEST (uso defaults).", e)
        else:
            return merged, True
    print("[HP] Usando hiperparámetros DEFAULT de RF.")
    return get_rf_defaults(), False

# Starting hyperparameters for the baseline model; the flag records whether
# they were read from BEST_HP_FILE (True) or are the built-in defaults (False).
seed_params, loaded_best_flag = load_best_or_default()

[HP] Cargando mejores hiperparámetros previos: BEST_RF_FULL_SMOTENC.json


6 — Entrenamiento BASELINE + umbral (val)

In [23]:
# Rebind to a shallow copy before filling in any missing keys.
seed_params = dict(seed_params)
seed_params.setdefault("random_state", RANDOM_STATE)
seed_params.setdefault("n_jobs", -1)

# By default every feature is used; the *_fit arrays feed the model.
keep_idx_global = None
feature_names_used = feature_names
X_train_fit, X_val_fit, X_test_fit = X_train, X_val, X_test

if USE_REDUCED:
    # The selector is fitted on the training split only and then applied to all splits.
    keep_idx_global, _mi = fit_mi_selector(X_train, y_train, topk=MI_TOPK, seed=RANDOM_STATE)
    X_train_fit = apply_keep_idx(X_train, keep_idx_global)
    X_val_fit   = apply_keep_idx(X_val,   keep_idx_global)
    X_test_fit  = apply_keep_idx(X_test,  keep_idx_global)
    feature_names_used = [feature_names[i] for i in keep_idx_global]

# === Balancing with SMOTENC ===
# Only the training split is resampled; validation and test stay untouched.
X_train_final, y_train_final = X_train_fit, y_train
if USE_BALANCED_TRAIN:
    X_train_final, y_train_final = maybe_smotenc(X_train_fit, y_train, feature_names_used, seed=RANDOM_STATE)

model = RandomForestClassifier(**seed_params)
model.fit(X_train_final, y_train_final)

# The decision threshold is tuned for F1 on the validation split.
proba_val = model.predict_proba(X_val_fit)[:, 1]
thr_val, best_f1_val = find_best_threshold(y_val, proba_val, metric="f1")
print(f"[BASELINE] Mejor umbral (val) por F1: {thr_val:.3f} | F1(val)={best_f1_val:.4f}")

# These validation metrics use the threshold chosen on the same split.
val_metrics = compute_all_metrics(y_val, proba_val, thr_val)
print("[BASELINE] Métricas val:", {k: (round(v,4) if isinstance(v,float) else v) for k,v in val_metrics.items()})

baseline = model
tuned_model = None

[BASELINE] Mejor umbral (val) por F1: 0.538 | F1(val)=0.6357
[BASELINE] Métricas val: {'pr_auc': 0.6768, 'roc_auc': 0.8574, 'precision': 0.6022, 'f1': 0.6357, 'recall': 0.6732, 'bal_acc': 0.7798}


7 — Optimización incremental (Optuna, objetivo = AP/PR-AUC en val)

In [24]:
import optuna
from optuna.samplers import TPESampler

# tuned_model stays None until a tuned forest is fitted; later cells check it.
tuned_model = None
N_TRIALS = 40
STUDY_NAME = f"RF_{VIEW_TAG}_{BAL_TAG}_AP"
# Seeded sampler; the study lives in memory and maximises the objective.
SAMPLER = TPESampler(seed=RANDOM_STATE, multivariate=True, group=False)
study = optuna.create_study(direction="maximize", study_name=STUDY_NAME, sampler=SAMPLER)

# Hyperparameters that belong to the search space (used to filter the warm-start trial).
SEARCH_KEYS = [
    "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf",
    "max_features", "bootstrap", "criterion", "class_weight"
]

def suggest_rf_params(trial):
    """Sample one random-forest configuration from the Optuna search space.

    Returns a dict with the eight searched hyperparameters plus the fixed
    ``random_state`` and ``n_jobs`` entries.
    """
    return {
        "n_estimators": trial.suggest_int("n_estimators", 200, 2000, step=50),
        "max_depth": trial.suggest_categorical("max_depth", [None, 6, 8, 12, 16, 20, 30, 40]),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 30),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", 0.5, 0.7, 0.9]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced", "balanced_subsample"]),
        # Fixed (not searched) settings.
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
    }

# Warm-start with the previous BEST: if a saved configuration exists, queue
# its searched hyperparameters as a trial for the study.
if BEST_HP_FILE.exists():
    try:
        prev = json.loads(BEST_HP_FILE.read_text())
        # Keep only keys that are part of the search space.
        warm = {k: prev[k] for k in SEARCH_KEYS if k in prev}
        if warm:
            print("[OPTUNA] Enqueuing previous BEST as a trial seed.")
            study.enqueue_trial(warm)
    except Exception as e:
        # Best-effort: a bad file only disables the warm start.
        print("[OPTUNA] Aviso: no se pudo usar BEST para warm-start:", e)

def objective(trial):
    """Optuna objective: average precision on the validation split.

    The candidate forest is trained on the same (possibly resampled) training
    data as the baseline; sampled values override the seed hyperparameters.
    """
    sampled = suggest_rf_params(trial)
    candidate = RandomForestClassifier(**{**seed_params, **sampled})

    # Same training scheme as the baseline.
    candidate.fit(X_train_final, y_train_final)
    val_scores = candidate.predict_proba(X_val_fit)[:, 1]
    return average_precision_score(y_val, val_scores)

# Run the search only when DO_TUNE is enabled. When it is skipped,
# tuned_model keeps the None assigned at the top of this cell, which is what
# the CV and evaluation cells check before touching any tuned artefact.
if DO_TUNE:
    print(f"[OPTUNA] Iniciando estudio '{STUDY_NAME}' con {N_TRIALS} pruebas...")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

    best = study.best_trial
    print(f"[OPTUNA] Mejor AP(val): {best.value:.6f}")
    print("[OPTUNA] Params ganadores:", best.params)

    # Persist the winning configuration together with the fixed settings so
    # the next run can load it as seed hyperparameters / warm-start trial.
    best_params = dict(best.params)
    best_params.update({
        "random_state": RANDOM_STATE,
        "n_jobs": -1
    })
    with open(BEST_HP_FILE, "w", encoding="utf-8") as f:
        json.dump(best_params, f, indent=2, ensure_ascii=False)
    print("[OPTUNA] Guardado BEST en:", BEST_HP_FILE.name)

    # Final refit with the winning configuration on the training data.
    tuned_model = RandomForestClassifier(**best_params)
    tuned_model.fit(X_train_final, y_train_final)
    print("[OPTUNA] Reentreno final completado.")
else:
    print("[OPTUNA] DO_TUNE=False: se omite la optimización de hiperparámetros.")

[I 2025-12-12 23:18:20,748] A new study created in memory with name: RF_FULL_SMOTENC_AP


[OPTUNA] Enqueuing previous BEST as a trial seed.
[OPTUNA] Iniciando estudio 'RF_FULL_SMOTENC_AP' con 40 pruebas...


[I 2025-12-12 23:18:25,176] Trial 0 finished with value: 0.6767628484566041 and parameters: {'n_estimators': 1650, 'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 0.5, 'bootstrap': True, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6767628484566041.
[I 2025-12-12 23:18:26,887] Trial 1 finished with value: 0.6580992081835771 and parameters: {'n_estimators': 850, 'max_depth': None, 'min_samples_split': 22, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False, 'criterion': 'gini', 'class_weight': None}. Best is trial 0 with value: 0.6767628484566041.
[I 2025-12-12 23:18:27,994] Trial 2 finished with value: 0.6452514043365771 and parameters: {'n_estimators': 850, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 19, 'max_features': 'sqrt', 'bootstrap': True, 'criterion': 'gini', 'class_weight': None}. Best is trial 0 with value: 0.6767628484566041.
[I 2025-12-12 23:18:30,644] Trial 3 finished wi

[OPTUNA] Mejor AP(val): 0.676763
[OPTUNA] Params ganadores: {'n_estimators': 1650, 'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 0.5, 'bootstrap': True, 'criterion': 'entropy', 'class_weight': 'balanced'}
[OPTUNA] Guardado BEST en: BEST_RF_FULL_SMOTENC.json
[OPTUNA] Reentreno final completado.


8 — Cross-Validation (OOF) para baseline y tuned

In [25]:
def run_oof_cv_rf(model_params, X, y, feature_names_in, k_folds=CV_FOLDS, seed=RANDOM_STATE, exp_suffix="BASELINE"):
    """Run stratified K-fold CV and collect out-of-fold (OOF) predictions.

    Feature selection (when ``USE_REDUCED``) and SMOTENC balancing (when
    ``BALANCE_IN_CV and USE_BALANCED_TRAIN``) are fitted on each fold's
    training part only; the held-out part is never resampled.

    Parameters
    ----------
    model_params : dict of RandomForestClassifier keyword arguments.
    X, y : NumPy arrays with the full training data.
    feature_names_in : names of the columns of ``X``.
    k_folds : number of folds; also used in the output file names.
    seed : random state for the splitter, selector, SMOTENC and model default.
    exp_suffix : label appended to the output file names and log line.

    Returns
    -------
    dict
        OOF PR-AUC, ROC-AUC, the F1-optimal threshold found on the OOF scores,
        and F1 / recall / balanced accuracy at that threshold. Because the
        threshold is chosen on the same OOF scores, those three are optimistic.

    Side effects
    ------------
    Writes ``cv_summary_<tag>.csv`` to ``OUT_RESULTS`` and
    ``oof_<tag>.parquet`` to ``OUT_PREDS``.
    """
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    oof_proba = np.zeros_like(y, dtype=float)
    fold_rows = []

    base = dict(model_params)
    base.setdefault("random_state", seed)
    base.setdefault("n_jobs", -1)
    base.setdefault("n_estimators", 500)

    for f, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr0, X_va0 = X[tr_idx], X[va_idx]
        y_tr0, y_va0 = y[tr_idx], y[va_idx]

        feat_names_fold = list(feature_names_in)
        keep_idx = None
        if USE_REDUCED:
            keep_idx, _ = fit_mi_selector(X_tr0, y_tr0, topk=MI_TOPK, seed=seed)
            X_tr0 = apply_keep_idx(X_tr0, keep_idx)
            X_va0 = apply_keep_idx(X_va0, keep_idx)
            feat_names_fold = [feat_names_fold[i] for i in keep_idx]

        # --- Per-fold balancing with SMOTENC ---
        if BALANCE_IN_CV and USE_BALANCED_TRAIN:
            X_tr, y_tr = maybe_smotenc(X_tr0, y_tr0, feat_names_fold, seed=seed)
        else:
            X_tr, y_tr = X_tr0, y_tr0

        # Sanity checks: column count must still match the feature names.
        assert X_tr.shape[1] == len(feat_names_fold), f"[CV fold={f}] Desalineado tras SMOTENC."
        assert X_va0.shape[1] == len(feat_names_fold), f"[CV fold={f}] Desalineado val tras selección."

        mdl = RandomForestClassifier(**base)
        mdl.fit(X_tr, y_tr)
        proba_va = mdl.predict_proba(X_va0)[:, 1]
        oof_proba[va_idx] = proba_va

        fold_rows.append({
            "fold": f,
            "pr_auc": average_precision_score(y_va0, proba_va),
            "roc_auc": roc_auc_score(y_va0, proba_va)
        })

    oof_pr = average_precision_score(y, oof_proba)
    oof_roc = roc_auc_score(y, oof_proba)
    thr_oof, _ = find_best_threshold(y, oof_proba, metric="f1")
    y_oof_pred = (oof_proba >= thr_oof).astype(int)
    oof_f1  = f1_score(y, y_oof_pred, zero_division=0)
    oof_rec = recall_score(y, y_oof_pred, zero_division=0)
    oof_bal = balanced_accuracy_score(y, y_oof_pred)

    # Tag with the fold count actually used (k_folds), not the global default,
    # so a run with a different k is not mislabelled or overwritten.
    cv_tag = f"{EXP_NAME}_{exp_suffix}_CV{k_folds}"
    cv_csv = OUT_RESULTS / f"cv_summary_{cv_tag}.csv"
    folds_df = pd.DataFrame(fold_rows)
    agg_row = pd.DataFrame([{
        "fold": "OOF", "pr_auc": oof_pr, "roc_auc": oof_roc,
        "thr": thr_oof, "f1": oof_f1, "recall": oof_rec, "bal_acc": oof_bal
    }])
    pd.concat([folds_df, agg_row], ignore_index=True).to_csv(cv_csv, index=False)

    oof_path = OUT_PREDS / f"oof_{cv_tag}.parquet"
    pd.DataFrame({"oof_proba": oof_proba, "y_true": y}).to_parquet(oof_path, index=False)

    print(f"[CV-{exp_suffix}] Guardados: {cv_csv.name} | {oof_path.name}")
    return {"oof_pr_auc": oof_pr, "oof_roc_auc": oof_roc, "thr": thr_oof,
            "oof_f1": oof_f1, "oof_recall": oof_rec, "oof_bal_acc": oof_bal}

# Results stay None when the corresponding CV run is disabled or skipped.
cv_baseline = None
cv_tuned = None

if DO_CV_BASELINE:
    cv_baseline = run_oof_cv_rf(seed_params, X_train_fit, y_train, feature_names_used, exp_suffix="BASELINE")

# The tuned CV needs the hyperparameters produced by the tuning cell.
if DO_CV_TUNED and tuned_model is not None:
    cv_tuned = run_oof_cv_rf(best_params, X_train_fit, y_train, feature_names_used, exp_suffix="TUNED")

[CV-BASELINE] Guardados: cv_summary_RF_FULL_SMOTENC_BASELINE_CV5.csv | oof_RF_FULL_SMOTENC_BASELINE_CV5.parquet
[CV-TUNED] Guardados: cv_summary_RF_FULL_SMOTENC_TUNED_CV5.csv | oof_RF_FULL_SMOTENC_TUNED_CV5.parquet


9 — Evaluación en test + guardados (figuras, importancias, preds, baselines.csv)

In [26]:
def plot_pr_curve(y_true, y_proba, title, out_path):
    """Save a precision-recall step plot to ``out_path``.

    The average precision is appended to ``title``. The figure is closed after
    saving, so nothing is displayed inline.
    """
    precision, recall, _thresholds = precision_recall_curve(y_true, y_proba)
    ap = average_precision_score(y_true, y_proba)

    fig, ax = plt.subplots(figsize=(6,5))
    ax.step(recall, precision, where='post')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'{title} (AP={ap:.4f})')
    ax.grid(True, linestyle='--', alpha=.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def plot_roc_curve(y_true, y_proba, title, out_path):
    """Save a ROC curve (with the chance diagonal) to ``out_path``.

    The AUC is appended to ``title``. The figure is closed after saving, so
    nothing is displayed inline.
    """
    fpr, tpr, _thresholds = roc_curve(y_true, y_proba)
    auc = roc_auc_score(y_true, y_proba)

    fig, ax = plt.subplots(figsize=(6,5))
    ax.plot(fpr, tpr, lw=2)
    ax.plot([0,1],[0,1], 'k--', lw=1)  # chance-level reference line
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'{title} (AUC={auc:.4f})')
    ax.grid(True, linestyle='--', alpha=.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def plot_confusion(y_true, y_pred, title, out_path, normalize=False):
    """Save an annotated confusion-matrix heat map to ``out_path``.

    With ``normalize=True`` each row is normalised over the true labels and
    cells are printed with two decimals; otherwise raw counts are shown.
    Tick labels are fixed to the two classes ``'0'`` and ``'1'``.
    """
    cm = confusion_matrix(y_true, y_pred, normalize='true' if normalize else None)

    fig, ax = plt.subplots(figsize=(5,4))
    im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
    ax.set_title(title)
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)

    ticks = np.arange(2)
    ax.set_xticks(ticks)
    ax.set_xticklabels(['0','1'])
    ax.set_yticks(ticks)
    ax.set_yticklabels(['0','1'])

    # Text switches to white on dark cells to stay readable.
    cutoff = cm.max()/2.
    for (i, j), value in np.ndenumerate(cm):
        label = f'{value:.2f}' if normalize else str(value)
        ax.text(j, i, label, ha='center', va='center',
                color='white' if value > cutoff else 'black')

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def rf_importances(model, feature_names):
    """Build a ``feature`` / ``importance_gain`` table from a fitted model.

    Values come from ``model.feature_importances_``. If that attribute cannot
    be read or converted to floats, every feature gets an importance of zero.
    Feature names are truncated to the number of importance values.
    """
    try:
        scores = np.asarray(model.feature_importances_, dtype=float)
    except Exception:
        scores = np.zeros(len(feature_names), dtype=float)
    names = feature_names[:len(scores)]
    return pd.DataFrame({"feature": names, "importance_gain": scores})

# ——— Evaluación y guardados ———
# --- Evaluation and saved artefacts ---
base = EXP_NAME

# Test predictions of the baseline model. They are computed here, with the
# threshold chosen on validation, so that this cell runs on a fresh kernel
# instead of relying on variables left over from an earlier session.
proba_test = model.predict_proba(X_test_fit)[:, 1]
y_pred_test = (proba_test >= thr_val).astype(int)
test_metrics = compute_all_metrics(y_test, proba_test, thr_val)

# BASELINE: validation predictions
preds_val_base_path = OUT_PREDS / f"preds_val_{base}.parquet"
pd.DataFrame({"proba": proba_val, "y_true": y_val}).to_parquet(
    preds_val_base_path, index=False
)

# Save baseline hyperparameters (seed values and those of the fitted model)
params_seed_path = OUT_PARAMS / f"{base}_BASE_seed_params.json"
with open(params_seed_path, "w", encoding="utf-8") as f:
    json.dump(seed_params, f, indent=2, ensure_ascii=False)

params_fitted_path = OUT_PARAMS / f"{base}_BASE_fitted_params.json"
with open(params_fitted_path, "w", encoding="utf-8") as f:
    json.dump(model.get_params(), f, indent=2, ensure_ascii=False)

# Baseline figures
plot_pr_curve(y_val,  proba_val,  f"{base} — PR (val)",  OUT_FIGS / f"{base}_pr_val.png")
plot_pr_curve(y_test, proba_test, f"{base} — PR (test)", OUT_FIGS / f"{base}_pr_test.png")
plot_roc_curve(y_val,  proba_val,  f"{base} — ROC (val)",  OUT_FIGS / f"{base}_roc_val.png")
plot_roc_curve(y_test, proba_test, f"{base} — ROC (test)", OUT_FIGS / f"{base}_roc_test.png")
plot_confusion(y_test, y_pred_test, f"{base} — Confusion (test @thr={thr_val:.3f})", OUT_FIGS / f"{base}_cm_test.png")

# Baseline feature importances
imp_df = rf_importances(model, feature_names_used).sort_values("importance_gain", ascending=False)
imp_path = OUT_RESULTS / f"{base}_feature_importances.csv"
imp_df.to_csv(imp_path, index=False)

# Baseline test predictions
preds_path = OUT_PREDS / f"preds_test_{base}.parquet"
pd.DataFrame({"proba": proba_test, "y_true": y_test}).to_parquet(preds_path, index=False)

# Append one row to baselines.csv
row_base = {
    "model": base,
    "thr_val": thr_val,
    "val_pr_auc": val_metrics["pr_auc"],
    "val_roc_auc": val_metrics["roc_auc"],
    "val_precision": val_metrics["precision"],
    "val_f1": val_metrics["f1"],
    "val_recall": val_metrics["recall"],
    "val_bal_acc": val_metrics["bal_acc"],
    "test_pr_auc": test_metrics["pr_auc"],
    "test_roc_auc": test_metrics["roc_auc"],
    "test_precision": test_metrics["precision"],
    "test_f1": test_metrics["f1"],
    "test_recall": test_metrics["recall"],
    "test_bal_acc": test_metrics["bal_acc"],
    "best_iteration": np.nan
}
res_csv = OUT_RESULTS / "baselines.csv"
# Write the header only when the file is being created.
pd.DataFrame([row_base]).to_csv(res_csv, mode=("a" if res_csv.exists() else "w"),
                                index=False, header=not res_csv.exists())

print("[OK][BASE] Guardados:",
      "\n  - Seed HPs   :", params_seed_path.name,
      "\n  - Fitted HPs :", params_fitted_path.name,
      "\n  - Importancias:", imp_path.name,
      "\n  - Preds test  :", preds_path.name,
      "\n  - Baselines   :", res_csv.name)

# TUNED: same evaluation and artefacts as the baseline, for the tuned model.
if tuned_model is not None:
    # The threshold is re-tuned for F1 on validation and then applied to test.
    proba_val_tuned = tuned_model.predict_proba(X_val_fit)[:, 1]
    thr_val_tuned, _ = find_best_threshold(y_val, proba_val_tuned, metric="f1")
    val_metrics_tuned = compute_all_metrics(y_val, proba_val_tuned, thr_val_tuned)

    proba_test_tuned = tuned_model.predict_proba(X_test_fit)[:, 1]
    y_pred_test_tuned = (proba_test_tuned >= thr_val_tuned).astype(int)
    test_metrics_tuned = compute_all_metrics(y_test, proba_test_tuned, thr_val_tuned)

    tuned_fitted_path = OUT_PARAMS / f"{base}_TUNED_fitted_params.json"
    with open(tuned_fitted_path, "w", encoding="utf-8") as f:
        json.dump(tuned_model.get_params(), f, indent=2, ensure_ascii=False)

    # All tuned artefacts carry the experiment name plus a _TUNED suffix.
    base_t = base + "_TUNED"
    preds_val_tuned_path = OUT_PREDS / f"preds_val_{base_t}.parquet"
    pd.DataFrame({"proba": proba_val_tuned, "y_true": y_val}).to_parquet(
        preds_val_tuned_path, index=False
    )

    plot_pr_curve(y_val,  proba_val_tuned,  f"{base_t} — PR (val)",  OUT_FIGS / f"{base_t}_pr_val.png")
    plot_pr_curve(y_test, proba_test_tuned, f"{base_t} — PR (test)", OUT_FIGS / f"{base_t}_pr_test.png")
    plot_roc_curve(y_val,  proba_val_tuned,  f"{base_t} — ROC (val)",  OUT_FIGS / f"{base_t}_roc_val.png")
    plot_roc_curve(y_test, proba_test_tuned, f"{base_t} — ROC (test)", OUT_FIGS / f"{base_t}_roc_test.png")
    plot_confusion(y_test, y_pred_test_tuned, f"{base_t} — Confusion (test @thr={thr_val_tuned:.3f})", OUT_FIGS / f"{base_t}_cm_test.png")

    imp_t_path = OUT_RESULTS / f"{base_t}_feature_importances.csv"
    rf_importances(tuned_model, feature_names_used).sort_values("importance_gain", ascending=False).to_csv(imp_t_path, index=False)
    preds_t_path = OUT_PREDS / f"preds_test_{base_t}.parquet"
    pd.DataFrame({"proba": proba_test_tuned, "y_true": y_test}).to_parquet(preds_t_path, index=False)

    row_t = {
        "model": base_t,
        "thr_val": thr_val_tuned,
        "val_pr_auc": val_metrics_tuned["pr_auc"],
        "val_roc_auc": val_metrics_tuned["roc_auc"],
        "val_precision": val_metrics_tuned["precision"],
        "val_f1": val_metrics_tuned["f1"],
        "val_recall": val_metrics_tuned["recall"],
        "val_bal_acc": val_metrics_tuned["bal_acc"],
        "test_pr_auc": test_metrics_tuned["pr_auc"],
        "test_roc_auc": test_metrics_tuned["roc_auc"],
        "test_precision": test_metrics_tuned["precision"],
        "test_f1": test_metrics_tuned["f1"],
        "test_recall": test_metrics_tuned["recall"],
        "test_bal_acc": test_metrics_tuned["bal_acc"],
        "best_iteration": np.nan
    }
    # Always appended without a header: the baseline section writes to res_csv first.
    pd.DataFrame([row_t]).to_csv(res_csv, mode="a", index=False, header=False)

    print("[OK][TUNED] Guardados:",
          "\n  - Fitted HPs :", tuned_fitted_path.name,
          "\n  - Importancias:", imp_t_path.name,
          "\n  - Preds test  :", preds_t_path.name,
          "\n  - Baselines   :", res_csv.name)

[OK][BASE] Guardados: 
  - Seed HPs   : RF_FULL_SMOTENC_BASE_seed_params.json 
  - Fitted HPs : RF_FULL_SMOTENC_BASE_fitted_params.json 
  - Importancias: RF_FULL_SMOTENC_feature_importances.csv 
  - Preds test  : preds_test_RF_FULL_SMOTENC.parquet 
  - Baselines   : baselines.csv
[OK][TUNED] Guardados: 
  - Fitted HPs : RF_FULL_SMOTENC_TUNED_fitted_params.json 
  - Importancias: RF_FULL_SMOTENC_TUNED_feature_importances.csv 
  - Preds test  : preds_test_RF_FULL_SMOTENC_TUNED.parquet 
  - Baselines   : baselines.csv


10 — Mejores resultados + resumen CV (y comparación SOTA RF de la literatura)

In [27]:
# True -> also merge the baselines.csv of the other RF_* experiment folders.
AGGREGATE_ALL_RUNS = False

def safe(v, fmt=".4f"):
    """Format ``v`` as a float using ``fmt``; return ``"NA"`` if that fails."""
    try:
        number = float(v)
        return format(number, fmt)
    except Exception:
        return "NA"

# Load this experiment's results table; it must already exist.
base_csv = OUT_RESULTS / "baselines.csv"
if not base_csv.exists():
    raise FileNotFoundError(f"No existe {base_csv}")

df = pd.read_csv(base_csv)

# Expected schema; missing columns are added as NA so older files still load.
needed = [
    "model","thr_val",
    "val_pr_auc","val_roc_auc","val_precision","val_f1","val_recall","val_bal_acc",
    "test_pr_auc","test_roc_auc","test_precision","test_f1","test_recall","test_bal_acc",
    "best_iteration"
]
for c in needed:
    if c not in df.columns:
        df[c] = pd.NA

df = df[needed].copy()
# Every column except "model" is coerced to numeric (unparseable values become NaN).
num_cols = [c for c in needed if c not in ("model",)]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

if AGGREGATE_ALL_RUNS:
    # Pull in the results of sibling RF_* experiments, normalised the same way.
    root_art = ARTIF_DIR.parent
    for p in (root_art).glob("RF_*/results/baselines.csv"):
        if p == base_csv:
            continue
        try:
            d2 = pd.read_csv(p)
            for c in needed:
                if c not in d2.columns:
                    d2[c] = pd.NA
            d2 = d2[needed]
            for c in num_cols:
                d2[c] = pd.to_numeric(d2[c], errors="coerce")
            df = pd.concat([df, d2], ignore_index=True)
        except Exception:
            # NOTE(review): unreadable sibling files are skipped silently.
            pass

if df.empty:
    raise ValueError("El dataframe de resultados está vacío.")

# Re-running the notebook appends rows; keep only the latest row per model.
df = df.drop_duplicates(subset=["model"], keep="last").copy()

def best_by(metric):
    """Print and return the row of the module-level ``df`` that maximises ``metric``.

    Returns ``None`` (printing nothing) when the column is missing or holds
    no non-null value.
    """
    has_values = metric in df.columns and not df[metric].dropna().empty
    if not has_values:
        return None

    r = df.loc[df[metric].idxmax()]
    best_iter = int(r['best_iteration']) if pd.notna(r['best_iteration']) else 'NA'
    fields = [
        f"- {metric}: {r['model']}",
        f"PR-AUC={safe(r['test_pr_auc'])}",
        f"ROC-AUC={safe(r['test_roc_auc'])}",
        f"F1={safe(r['test_f1'])}",
        f"Recall={safe(r['test_recall'])}",
        f"Precision={safe(r['test_precision'])}",
        f"thr(val)={safe(r['thr_val'], '.3f')}",
        f"best_iter={best_iter}",
    ]
    print(" | ".join(fields))
    return r

# Best model per test metric (each winner row is kept for the comparison below).
print("=== MEJORES EN TEST (por métrica) ===")
winners = {}
for m in ["test_pr_auc","test_roc_auc","test_recall","test_f1","test_precision"]:
    w = best_by(m)
    if w is not None:
        winners[m] = w

# Summarise the OOF row of every CV summary file written for this experiment.
cv_files = list(OUT_RESULTS.glob("cv_summary_*_CV*.csv"))
if cv_files:
    print("=== RESUMEN CV-OOF (por experimento) ===")
    rows = []
    for f in cv_files:
        # Strip the "cv_summary_" prefix and ".csv" suffix to get the run tag.
        tag = re.sub(r"^cv_summary_|\.csv$", "", f.name)
        cv = pd.read_csv(f)
        oof = cv.loc[cv["fold"] == "OOF"]
        if not oof.empty:
            r = oof.iloc[0]
            rows.append({
                "tag": tag,
                "pr_auc": r.get("pr_auc"),
                "roc_auc": r.get("roc_auc"),
                "f1": r.get("f1"),
                "recall": r.get("recall"),
                "bal_acc": r.get("bal_acc"),
                "thr": r.get("thr"),
            })
    if rows:
        print(pd.DataFrame(rows).sort_values(["pr_auc","roc_auc"], ascending=False).to_string(index=False))
else:
    print("(No se hallaron archivos de CV para este experimento)")

# Quick comparison against an RF result reported in the literature
# (Tekouabou et al., 2022): F1≈0.86 with RF+SMOTE on Kaggle Bank Churn.
# NOTE(review): the evaluation protocol behind the reported figure is not
# visible here — confirm it is comparable before reading much into the delta.
SOTA_RF = {
    "F1": 0.86,
    "source": "Tékouabou et al. (2022) — Mathematics: RF + SMOTE en Kaggle Bank Churn"
}

if "test_f1" in winners and winners["test_f1"] is not None:
    bt = winners["test_f1"]
    d_f1 = float(bt["test_f1"]) - SOTA_RF["F1"]
    print("=== COMPARACIÓN SOTA RF vs. MEJOR TEST ===")
    print(f"Paper RF: F1={SOTA_RF['F1']:.4f}")
    print(f"Tu mejor: F1={safe(bt['test_f1'])}")
    print(f"Deltas  : ΔF1={d_f1:+.4f}")
    print(f"Fuente  : {SOTA_RF['source']}")
else:
    print("No se pudo localizar el ganador por F1 para comparar contra SOTA RF.")

# Move the current file to a backup (replacing any earlier backup), then write
# the normalised, de-duplicated table in its place.
# NOTE(review): with AGGREGATE_ALL_RUNS=True, df also holds rows from other
# experiments, which would then be written into this experiment's file.
backup = OUT_RESULTS / "baselines_legacy_backup.csv"
base_csv.replace(backup)
df.to_csv(base_csv, index=False)
print("[OK] Normalizado. Backup:", backup.name)

=== MEJORES EN TEST (por métrica) ===
- test_pr_auc: RF_FULL_SMOTENC_TUNED | PR-AUC=0.6909 | ROC-AUC=0.8579 | F1=0.6150 | Recall=0.6437 | Precision=0.5888 | thr(val)=0.538 | best_iter=NA
- test_roc_auc: RF_FULL_SMOTENC_TUNED | PR-AUC=0.6909 | ROC-AUC=0.8579 | F1=0.6150 | Recall=0.6437 | Precision=0.5888 | thr(val)=0.538 | best_iter=NA
- test_recall: RF_FULL_SMOTENC_TUNED | PR-AUC=0.6909 | ROC-AUC=0.8579 | F1=0.6150 | Recall=0.6437 | Precision=0.5888 | thr(val)=0.538 | best_iter=NA
- test_f1: RF_FULL_SMOTENC_TUNED | PR-AUC=0.6909 | ROC-AUC=0.8579 | F1=0.6150 | Recall=0.6437 | Precision=0.5888 | thr(val)=0.538 | best_iter=NA
- test_precision: RF_FULL_SMOTENC_TUNED | PR-AUC=0.6909 | ROC-AUC=0.8579 | F1=0.6150 | Recall=0.6437 | Precision=0.5888 | thr(val)=0.538 | best_iter=NA
=== RESUMEN CV-OOF (por experimento) ===
                         tag   pr_auc  roc_auc       f1   recall  bal_acc   thr
RF_FULL_SMOTENC_BASELINE_CV5 0.667272 0.846711 0.604311 0.607522  0.75216 0.579
   RF_FULL_SMOTE