1 — Imports, configuración y rutas (DL híbrido sin reducción)

In [8]:
import json, os, warnings, time, re, glob, math, random
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    average_precision_score, precision_recall_curve, roc_auc_score, roc_curve,
    f1_score, recall_score, balanced_accuracy_score, confusion_matrix, precision_score
)
from sklearn.model_selection import StratifiedKFold

# Balanceo
try:
    from imblearn.over_sampling import SMOTENC
    _HAS_IMBLEARN = True
except Exception:
    _HAS_IMBLEARN = False

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# === Toggles de experimento ===
USE_REDUCED = False               
USE_BALANCED_TRAIN = True       
BALANCE_IN_CV = True              
RANDOM_STATE = 42
DO_TUNE = True
DO_CV_BASELINE = True
DO_CV_TUNED = True
CV_FOLDS = 5

# === Nombres y rutas ===
ROOT = Path.cwd().parent
EXP_NAME = f"DL_{'REDUCED' if USE_REDUCED else 'FULL'}_{'SMOTENC' if USE_BALANCED_TRAIN else 'IMB'}"
ARTIF_DIR = ROOT / "artifacts" / EXP_NAME
OUT_RESULTS = ARTIF_DIR / "results"
OUT_FIGS    = ARTIF_DIR / "figs"
OUT_PREDS   = ARTIF_DIR / "preds"
OUT_PARAMS  = ARTIF_DIR / "best_params"
for p in [OUT_RESULTS, OUT_FIGS, OUT_PREDS, OUT_PARAMS]:
    p.mkdir(parents=True, exist_ok=True)

# Dataset preprocesado
DATA_DIR = ROOT / "preproc_datasets" / "full"

print("Exp:", EXP_NAME)
print("DATA_DIR:", DATA_DIR)
print("ARTIF_DIR:", ARTIF_DIR)

# Seeds globales
def set_seeds(seed=RANDOM_STATE):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seeds(RANDOM_STATE)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

Exp: DL_FULL_SMOTENC
DATA_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/preproc_datasets/full
ARTIF_DIR: /Users/luistejada/Downloads/TFE Churn Bancario/artifacts/DL_FULL_SMOTENC
DEVICE: cpu


2 — Carga de artefactos (X, y, features)

In [9]:
def load_xy_full(dir_full: Path):
    X_train = np.load(dir_full / "X_train_full.npy")
    X_val   = np.load(dir_full / "X_val_full.npy")
    X_test  = np.load(dir_full / "X_test_full.npy")

    y_train = pd.read_parquet(dir_full / "y_train.parquet")["Exited"].to_numpy()
    y_val   = pd.read_parquet(dir_full / "y_val.parquet")["Exited"].to_numpy()
    y_test  = pd.read_parquet(dir_full / "y_test.parquet")["Exited"].to_numpy()

    feat = pd.read_parquet(dir_full / "feature_names_full.parquet")["feature"].tolist()
    return X_train, y_train, X_val, y_val, X_test, y_test, feat

X_train, y_train, X_val, y_val, X_test, y_test, feature_names = load_xy_full(DATA_DIR)
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("y train/val/test:", y_train.shape, y_val.shape, y_test.shape)
print("n features:", len(feature_names))

# Tipos consistentes
X_train = X_train.astype(np.float32)
X_val   = X_val.astype(np.float32)
X_test  = X_test.astype(np.float32)
y_train = y_train.astype(np.int64)
y_val   = y_val.astype(np.int64)
y_test  = y_test.astype(np.int64)

PREPROC_META = DATA_DIR / "preprocessor_meta.json"
cat_cols_order = None
if PREPROC_META.exists():
    try:
        with open(PREPROC_META, "r", encoding="utf-8") as f:
            _meta = json.load(f)
        cat_cols_order = _meta.get("cat_cols", None)
    except Exception:
        cat_cols_order = None

# Índices numéricos (prefijo 'num__') y OHE (resto)
num_idx = [i for i, n in enumerate(feature_names) if str(n).startswith("num__")]
ohe_idx = [i for i in range(len(feature_names)) if i not in num_idx]

from collections import defaultdict
base_to_idx = defaultdict(list)
for i in ohe_idx:
    base = str(feature_names[i]).split("_", 1)[0]  # <col>_<cat> -> <col>
    base_to_idx[base].append(i)

# Orden estable
for k in base_to_idx:
    base_to_idx[k] = sorted(base_to_idx[k])

if cat_cols_order:
    ONEHOT_GROUPS = [base_to_idx[c] for c in cat_cols_order if c in base_to_idx]
else:
    ONEHOT_GROUPS = [base_to_idx[k] for k in sorted(base_to_idx.keys(), key=lambda k: min(base_to_idx[k]))]

CAT_IDX = sorted([j for grp in ONEHOT_GROUPS for j in grp])

print(f"[SMOTENC] {len(ONEHOT_GROUPS)} grupos OHE; {len(CAT_IDX)} dims categóricas; {len(num_idx)} numéricas")

Shapes: (6000, 15) (2000, 15) (2000, 15)
y train/val/test: (6000,) (2000,) (2000,)
n features: 15
[SMOTENC] 5 grupos OHE; 10 dims categóricas; 5 numéricas


3 — Métricas, threshold y plots

In [10]:
def pr_auc(y_true, y_proba):
    return float(average_precision_score(y_true, y_proba))

def roc_auc(y_true, y_proba):
    return float(roc_auc_score(y_true, y_proba))

def find_best_threshold(y_true, y_proba, metric="f1"):
    thr_grid = np.linspace(0.0, 1.0, 1001)
    best_thr, best_score = 0.5, -1.0
    for thr in thr_grid:
        y_pred = (y_proba >= thr).astype(int)
        if metric == "f1":
            score = f1_score(y_true, y_pred, zero_division=0)
        elif metric == "recall":
            score = recall_score(y_true, y_pred, zero_division=0)
        else:
            raise ValueError("metric no soportada")
        if score > best_score:
            best_score, best_thr = score, thr
    return float(best_thr), float(best_score)

def compute_all_metrics(y_true, y_proba, thr):
    y_pred = (y_proba >= thr).astype(int)
    return {
        "pr_auc": pr_auc(y_true, y_proba),
        "roc_auc": roc_auc(y_true, y_proba),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "bal_acc": balanced_accuracy_score(y_true, y_pred)
    }

def plot_pr_curve(y_true, y_proba, title, out_path):
    prec, rec, _ = precision_recall_curve(y_true, y_proba)
    ap = average_precision_score(y_true, y_proba)
    plt.figure(figsize=(6,5))
    plt.step(rec, prec, where='post')
    plt.xlabel('Recall'); plt.ylabel('Precision')
    plt.title(f'{title} (AP={ap:.4f})')
    plt.grid(True, linestyle='--', alpha=.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

def plot_roc_curve(y_true, y_proba, title, out_path):
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    auc = roc_auc_score(y_true, y_proba)
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, lw=2)
    plt.plot([0,1],[0,1], 'k--', lw=1)
    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
    plt.title(f'{title} (AUC={auc:.4f})')
    plt.grid(True, linestyle='--', alpha=.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

def plot_confusion(y_true, y_pred, title, out_path, normalize=False):
    norm = 'true' if normalize else None
    cm = confusion_matrix(y_true, y_pred, normalize=norm)
    plt.figure(figsize=(5,4))
    im = plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title(title)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    ticks = np.arange(2)
    plt.xticks(ticks, ['0','1']); plt.yticks(ticks, ['0','1'])
    thresh = cm.max()/2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            txt = f'{cm[i,j]:.2f}' if normalize else str(cm[i,j])
            plt.text(j, i, txt, ha='center', va='center',
                     color='white' if cm[i,j] > thresh else 'black')
    plt.ylabel('True label'); plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(out_path, dpi=150); plt.close()

4 — Helpers: SMOTE, Dataset y utilidades

In [11]:
def _repair_onehot_blocks(X, groups):
    X = X.copy()
    if not groups:
        return X.astype(np.float32)
    rows = np.arange(X.shape[0])
    for grp in groups:
        if len(grp) == 1:
            c = grp[0]
            X[:, c] = (X[:, c] >= 0.5).astype(np.float32)
        else:
            block = X[:, grp]
            winners = np.argmax(block, axis=1)
            X[:, grp] = 0.0
            X[rows, np.array(grp)[winners]] = 1.0
    return X.astype(np.float32)

def maybe_resample(X, y, seed=RANDOM_STATE):

    if not _HAS_IMBLEARN or X.shape[0] != y.shape[0] or len(CAT_IDX) == 0:
        return X, y
    try:
        sm = SMOTENC(categorical_features=CAT_IDX, random_state=seed)
        Xb, yb = sm.fit_resample(X, y)
        Xb = _repair_onehot_blocks(Xb, ONEHOT_GROUPS)
        return Xb.astype(np.float32), yb.astype(np.int64)
    except Exception as e:
        print("[SMOTENC] Aviso: se usará dataset original por error:", e)
        return X, y

class TabDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)
    def __len__(self):
        return self.X.shape[0]
    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

def class_pos_weight(y):
    # pos_weight = N_neg / N_pos
    y = np.asarray(y)
    n_pos = (y == 1).sum()
    n_neg = (y == 0).sum()
    if n_pos == 0:
        return 1.0
    return float(n_neg / max(1, n_pos))

5 — Modelo DL híbrido (Self-Attention + BiLSTM + CNN) y entrenamiento con early stopping

In [12]:
class ScalarFeatureTokenizer(nn.Module):
    """
    Proyecta cada feature escalar a un token de dimensión d_model:
    token_i = x_i * W_i + b_i  (W_i y b_i aprendibles por feature)
    """
    def __init__(self, n_features, d_model):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(n_features, d_model) * 0.02)
        self.bias   = nn.Parameter(torch.zeros(n_features, d_model))
    def forward(self, x):  # x: (B, F)
        return x.unsqueeze(-1) * self.weight + self.bias  # (B, F, d_model)

class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, d_ff=256, dropout=0.1):
        super().__init__()
        self.mha = nn.MultiheadAttention(embed_dim=d_model, num_heads=n_heads, dropout=dropout, batch_first=True)
        self.ln1 = nn.LayerNorm(d_model)
        self.ff  = nn.Sequential(
            nn.Linear(d_model, d_ff), nn.GELU(), nn.Dropout(dropout), nn.Linear(d_ff, d_model)
        )
        self.ln2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):  # (B, F, d)
        attn_out, _ = self.mha(x, x, x, need_weights=False)
        x = self.ln1(x + self.dropout(attn_out))
        ff_out = self.ff(x)
        x = self.ln2(x + self.dropout(ff_out))
        return x

class SELayer1D(nn.Module):
    """Squeeze-and-Excitation sobre canales (dim de canales tras Conv1d)."""
    def __init__(self, channels, reduction=8):
        super().__init__()
        hidden = max(1, channels // reduction)
        self.fc = nn.Sequential(
            nn.Linear(channels, hidden), nn.ReLU(inplace=True), nn.Linear(hidden, channels), nn.Sigmoid()
        )
    def forward(self, x):  
        s = x.mean(dim=-1)  
        w = self.fc(s)      
        return x * w.unsqueeze(-1)

class CCPNetLite(nn.Module):
    """
    Híbrido simple: Tokenizer -> N bloques Transformer -> BiLSTM -> Conv1d + SE -> Pool -> Head
    Pensado para tabular (tratando cada feature como token).
    """
    def __init__(self, n_features, d_model=48, n_heads=4, n_layers=2, lstm_hidden=64,
                 cnn_channels=64, kernel_size=3, dropout=0.2):
        super().__init__()
        self.n_features = n_features
        self.tokenizer = ScalarFeatureTokenizer(n_features, d_model)
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model=d_model, n_heads=n_heads, d_ff=4*d_model, dropout=dropout)
            for _ in range(n_layers)
        ])
        self.bi_lstm = nn.LSTM(input_size=d_model, hidden_size=lstm_hidden, batch_first=True, bidirectional=True)
        conv_in = lstm_hidden * 2
        pad = kernel_size // 2
        self.conv = nn.Conv1d(conv_in, cnn_channels, kernel_size=kernel_size, padding=pad)
        self.se   = SELayer1D(cnn_channels)
        self.head = nn.Sequential(
            nn.Linear(cnn_channels*2, 128), nn.ReLU(inplace=True), nn.Dropout(dropout), nn.Linear(128, 1)
        )
    def forward(self, x): 
        t = self.tokenizer(x)                
        for blk in self.blocks:
            t = blk(t)                        
        lstm_out, _ = self.bi_lstm(t)         
        z = lstm_out.transpose(1, 2)          
        z = self.conv(z)                      
        z = F.gelu(z)
        z = self.se(z)                     
        # Global avg + max pooling
        gap = z.mean(dim=-1)
        gmp, _ = z.max(dim=-1)
        g = torch.cat([gap, gmp], dim=1)     
        logit = self.head(g).squeeze(1)      
        return logit
    
    def feature_importance(self, feature_names):
        with torch.no_grad():
            w = self.tokenizer.weight.detach().cpu().numpy() 
            imp = np.linalg.norm(w, axis=1)
        return pd.DataFrame({"feature": feature_names, "importance_proxy": imp}).sort_values("importance_proxy", ascending=False)

def get_dl_defaults(seed=RANDOM_STATE):
    return {
        "d_model": 48,
        "n_heads": 4,
        "n_layers": 2,
        "lstm_hidden": 64,
        "cnn_channels": 64,
        "kernel_size": 3,
        "dropout": 0.2,
        "lr": 1e-3,
        "weight_decay": 1e-4,
        "batch_size": 256,
        "epochs": 100,
        "patience": 12,
        "random_state": seed
    }

def make_model(n_features, hp):
    mdl = CCPNetLite(
        n_features=n_features,
        d_model=int(hp["d_model"]),
        n_heads=int(hp["n_heads"]),
        n_layers=int(hp["n_layers"]),
        lstm_hidden=int(hp["lstm_hidden"]),
        cnn_channels=int(hp["cnn_channels"]),
        kernel_size=int(hp["kernel_size"]),
        dropout=float(hp["dropout"]) 
    ).to(DEVICE)
    return mdl

def train_one(model, X_tr, y_tr, X_va, y_va, hp, verbose=False):
    set_seeds(RANDOM_STATE)
    bs = int(hp["batch_size"]) if "batch_size" in hp else 256
    epochs = int(hp.get("epochs", 100))
    patience = int(hp.get("patience", 12))
    lr = float(hp.get("lr", 1e-3))
    wd = float(hp.get("weight_decay", 1e-4))

    did_smote = hp.get("_did_smote", False)
    pw = 1.0 if did_smote else class_pos_weight(y_tr)
    pos_w = torch.tensor([pw], dtype=torch.float32, device=DEVICE)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_w)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

    dl_tr = DataLoader(TabDataset(X_tr, y_tr), batch_size=bs, shuffle=True, num_workers=0, pin_memory=False)
    dl_va = DataLoader(TabDataset(X_va, y_va), batch_size=bs, shuffle=False, num_workers=0, pin_memory=False)

    best_ap = -1.0
    best_epoch = -1
    best_state = None
    wait = 0

    for ep in range(1, epochs+1):
        model.train()
        running = 0.0
        for xb, yb in dl_tr:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)
            optimizer.zero_grad(set_to_none=True)
            logit = model(xb)
            loss = criterion(logit, yb)
            loss.backward()
            optimizer.step()
            running += float(loss.item())

        # Validación
        model.eval()
        all_probs = []
        with torch.no_grad():
            for xb, yb in dl_va:
                xb = xb.to(DEVICE)
                logit = model(xb)
                prob = torch.sigmoid(logit).detach().cpu().numpy()
                all_probs.append(prob)
        va_proba = np.concatenate(all_probs, axis=0)
        ap = average_precision_score(y_va, va_proba)

        if verbose and ep % 10 == 0:
            print(f"[EP {ep:03d}] loss={running/len(dl_tr):.4f} | AP(val)={ap:.4f}")

        # Early stopping por AP
        if ap > best_ap + 1e-6:
            best_ap = ap
            best_epoch = ep
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                break

    # Restaurar mejor estado
    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_epoch, best_ap

def predict_proba(model, X, batch_size=512):
    model.eval()
    dl = DataLoader(TabDataset(X, None), batch_size=batch_size, shuffle=False)
    probs = []
    with torch.no_grad():
        for xb in dl:
            xb = xb.to(DEVICE)
            logit = model(xb)
            prob = torch.sigmoid(logit).detach().cpu().numpy()
            probs.append(prob)
    return np.concatenate(probs, axis=0)

6 — Hiperparámetros persistentes (carga/guardado)

In [13]:
VIEW_TAG = "REDUCED" if USE_REDUCED else "FULL"
BAL_TAG  = "SMOTENC" if USE_BALANCED_TRAIN else "IMB"
BEST_HP_FILE = OUT_PARAMS / f"BEST_DL_{VIEW_TAG}_{BAL_TAG}.json"

def load_best_or_default():
    base = get_dl_defaults()
    if BEST_HP_FILE.exists():
        try:
            best = json.loads(BEST_HP_FILE.read_text())
            print("[HP] Cargando mejores hiperparámetros previos:", BEST_HP_FILE.name)
            base.update(best)
            return base, True
        except Exception as e:
            print("[HP] Aviso: no se pudo leer BEST (uso defaults).", e)
    print("[HP] Usando hiperparámetros DEFAULT de DL.")
    return base, False

seed_params, loaded_best_flag = load_best_or_default()

[HP] Cargando mejores hiperparámetros previos: BEST_DL_FULL_SMOTENC.json


7 — Entrenamiento BASELINE + umbral (DL)

In [14]:
set_seeds(RANDOM_STATE)

feature_names_used = feature_names
X_train_fit, X_val_fit, X_test_fit = X_train, X_val, X_test

X_train_final, y_train_final = X_train_fit, y_train
did_smote_flag = False
if USE_BALANCED_TRAIN:
    X_train_final, y_train_final = maybe_resample(X_train_fit, y_train)
    did_smote_flag = True

base_hp = dict(seed_params)
base_hp["_did_smote"] = did_smote_flag
model = make_model(n_features=X_train_final.shape[1], hp=base_hp)
model, best_epoch = train_one(model, X_train_final, y_train_final, X_val_fit, y_val, base_hp, verbose=False)[:2]
print(f"[BASELINE] best_epoch: {best_epoch}")

proba_val = predict_proba(model, X_val_fit)
thr_val, best_f1_val = find_best_threshold(y_val, proba_val, metric="f1")
# Guardar predicciones de validación (baseline)
val_preds_path = OUT_PREDS / f"preds_val_{EXP_NAME}.parquet"
pd.DataFrame({
    "proba": proba_val,
    "y_true": y_val,
    "y_pred": (proba_val >= thr_val).astype(int)
}).to_parquet(val_preds_path, index=False)
print(f"[BASELINE] Mejor umbral (val) por F1: {thr_val:.3f} | F1(val)={best_f1_val:.4f}")

val_metrics = compute_all_metrics(y_val, proba_val, thr_val)
print("[BASELINE] Métricas val:", {k: (round(v,4) if isinstance(v,float) else v) for k,v in val_metrics.items()})

baseline = model
base_best_it = best_epoch
tuned_model = None
best_params = None

[BASELINE] best_epoch: 21
[BASELINE] Mejor umbral (val) por F1: 0.814 | F1(val)=0.6270
[BASELINE] Métricas val: {'pr_auc': 0.6795, 'roc_auc': 0.8473, 'precision': 0.6967, 'f1': 0.627, 'recall': 0.57, 'bal_acc': 0.7533}


8 — Optimización incremental (Optuna) sobre AP(val)

In [15]:
import optuna
from optuna.samplers import TPESampler

N_TRIALS = 40
STUDY_NAME = f"DL_{VIEW_TAG}_{BAL_TAG}_AP"
SAMPLER = TPESampler(seed=RANDOM_STATE, multivariate=True, group=False)
study = optuna.create_study(direction="maximize", study_name=STUDY_NAME, sampler=SAMPLER)

def suggest_heads_for_dim(trial, d_model):

    candidates = [2, 4, 8]
    opts = [h for h in candidates if d_model % h == 0 and h <= d_model]
    if not opts:
        opts = [1]
    return trial.suggest_categorical("n_heads", opts)

SEARCH_KEYS = [
    "d_model","n_heads","n_layers","lstm_hidden","cnn_channels","kernel_size",
    "dropout","lr","weight_decay","batch_size"
]

def suggest_dl_params(trial):
    hp = {}
    d_model = trial.suggest_categorical("d_model", [32, 48, 64])
    hp["d_model"] = d_model
    hp["n_heads"] = suggest_heads_for_dim(trial, d_model)
    hp["n_layers"] = trial.suggest_int("n_layers", 1, 3)
    hp["lstm_hidden"] = trial.suggest_categorical("lstm_hidden", [32, 64, 96, 128])
    hp["cnn_channels"] = trial.suggest_categorical("cnn_channels", [32, 64, 96, 128])
    hp["kernel_size"] = trial.suggest_categorical("kernel_size", [3, 5])
    hp["dropout"] = trial.suggest_float("dropout", 0.0, 0.5)
    hp["lr"] = trial.suggest_float("lr", 3e-4, 3e-3, log=True)
    hp["weight_decay"] = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    hp["batch_size"] = trial.suggest_categorical("batch_size", [128, 256, 512])
    hp["epochs"] = seed_params.get("epochs", 100)
    hp["patience"] = seed_params.get("patience", 12)
    hp["random_state"] = RANDOM_STATE
    hp["_did_smote"] = did_smote_flag
    return hp

def objective(trial):
    hp = suggest_dl_params(trial)
    mdl = make_model(n_features=X_train_final.shape[1], hp=hp)
    mdl, best_ep, best_ap = train_one(mdl, X_train_final, y_train_final, X_val_fit, y_val, hp, verbose=False)
    proba_val_t = predict_proba(mdl, X_val_fit)
    ap = average_precision_score(y_val, proba_val_t)
    trial.set_user_attr("best_epoch", best_ep)
    return ap

# Warm-start con BEST
if BEST_HP_FILE.exists():
    try:
        prev = json.loads(BEST_HP_FILE.read_text())
        warm = {k: prev[k] for k in SEARCH_KEYS if k in prev}
        if warm:
            print("[OPTUNA] Enqueuing previous BEST as a trial seed.")
            study.enqueue_trial(warm)
    except Exception as e:
        print("[OPTUNA] Aviso: no se pudo usar BEST para warm-start:", e)

print(f"[OPTUNA] Iniciando estudio '{STUDY_NAME}' con {N_TRIALS} pruebas...")
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

best = study.best_trial
print(f"[OPTUNA] Mejor AP(val): {best.value:.6f}")
print(f"[OPTUNA] Params ganadores:", best.params)
print(f"[OPTUNA] best_epoch (del trial):", best.user_attrs.get("best_epoch"))

best_params = dict(best.params)
best_params.update({
    "epochs": seed_params.get("epochs", 100),
    "patience": seed_params.get("patience", 12),
    "random_state": RANDOM_STATE,
    "_did_smote": did_smote_flag
})
with open(BEST_HP_FILE, "w", encoding="utf-8") as f:
    json.dump(best_params, f, indent=2, ensure_ascii=False)
print("[OPTUNA] Guardado BEST en:", BEST_HP_FILE.name)

tuned_model = make_model(n_features=X_train_final.shape[1], hp=best_params)
tuned_model, best_ep = train_one(tuned_model, X_train_final, y_train_final, X_val_fit, y_val, best_params, verbose=False)[:2]
print("[OPTUNA] Reentreno final completado. best_epoch =", best_ep)

[I 2025-12-10 22:54:23,769] A new study created in memory with name: DL_FULL_SMOTENC_AP


[OPTUNA] Enqueuing previous BEST as a trial seed.
[OPTUNA] Iniciando estudio 'DL_FULL_SMOTENC_AP' con 40 pruebas...


[I 2025-12-10 22:55:49,687] Trial 0 finished with value: 0.6757454775056062 and parameters: {'d_model': 48, 'n_heads': 2, 'n_layers': 1, 'lstm_hidden': 32, 'cnn_channels': 64, 'kernel_size': 3, 'dropout': 0.19430284891752264, 'lr': 0.000810001430602943, 'weight_decay': 1.0315635803757857e-05, 'batch_size': 128}. Best is trial 0 with value: 0.6757454775056062.
[I 2025-12-10 22:56:53,611] Trial 1 finished with value: 0.6758043511711931 and parameters: {'d_model': 48, 'n_heads': 2, 'n_layers': 1, 'lstm_hidden': 32, 'cnn_channels': 32, 'kernel_size': 5, 'dropout': 0.2623782158161189, 'lr': 0.0008110848199986004, 'weight_decay': 1.461896279370496e-05, 'batch_size': 128}. Best is trial 1 with value: 0.6758043511711931.
[I 2025-12-10 22:58:33,050] Trial 2 finished with value: 0.6622112017235411 and parameters: {'d_model': 64, 'n_heads': 8, 'n_layers': 1, 'lstm_hidden': 128, 'cnn_channels': 32, 'kernel_size': 3, 'dropout': 0.06101911742238941, 'lr': 0.0009382059110341113, 'weight_decay': 1.372

[OPTUNA] Mejor AP(val): 0.685790
[OPTUNA] Params ganadores: {'d_model': 48, 'n_heads': 2, 'n_layers': 3, 'lstm_hidden': 96, 'cnn_channels': 32, 'kernel_size': 3, 'dropout': 0.330222144734749, 'lr': 0.0006202506844188844, 'weight_decay': 0.0007018196903507625, 'batch_size': 128}
[OPTUNA] best_epoch (del trial): 30
[OPTUNA] Guardado BEST en: BEST_DL_FULL_SMOTENC.json
[OPTUNA] Reentreno final completado. best_epoch = 31


9 — Cross-Validation (OOF) para baseline y tuned (DL)

In [17]:
def run_oof_cv_dl(model_hp, X, y, k_folds=CV_FOLDS, seed=RANDOM_STATE, exp_suffix="BASELINE"):
    cv_tag = f"{EXP_NAME}_{exp_suffix}_CV{k_folds}"

    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    oof_proba = np.zeros_like(y, dtype=float)
    fold_rows = []

    for f, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr0, X_va0 = X[tr_idx], X[va_idx]
        y_tr0, y_va0 = y[tr_idx], y[va_idx]

        X_tr, y_tr = X_tr0, y_tr0
        did_smote = False
        if BALANCE_IN_CV and USE_BALANCED_TRAIN:
            X_tr, y_tr = maybe_resample(X_tr0, y_tr0)
            did_smote = True

        hp = dict(model_hp)
        hp["_did_smote"] = did_smote
        mdl = make_model(n_features=X.shape[1], hp=hp)
        adapter, best_ep, _ = train_one(mdl, X_tr, y_tr, X_va0, y_va0, hp, verbose=False)
        proba_va = predict_proba(adapter, X_va0)
        oof_proba[va_idx] = proba_va

        fold_tag = f"preds_val_fold{f}_{cv_tag}.parquet"
        pd.DataFrame({
            "idx": va_idx,
            "proba": proba_va,
            "y_true": y_va0
        }).to_parquet(OUT_PREDS / fold_tag, index=False)

        fold_rows.append({
            "fold": f,
            "pr_auc": average_precision_score(y_va0, proba_va),
            "roc_auc": roc_auc_score(y_va0, proba_va),
            "best_iteration": best_ep if best_ep is not None else np.nan
        })

    oof_pr = average_precision_score(y, oof_proba)
    oof_roc = roc_auc_score(y, oof_proba)
    thr_oof, _ = find_best_threshold(y, oof_proba, metric="f1")
    y_oof_pred = (oof_proba >= thr_oof).astype(int)
    oof_f1  = f1_score(y, y_oof_pred, zero_division=0)
    oof_rec = recall_score(y, y_oof_pred, zero_division=0)
    oof_bal = balanced_accuracy_score(y, y_oof_pred)

    cv_csv = OUT_RESULTS / f"cv_summary_{cv_tag}.csv"
    folds_df = pd.DataFrame(fold_rows)
    agg_row = pd.DataFrame([{
        "fold": "OOF", "pr_auc": oof_pr, "roc_auc": oof_roc,
        "thr": thr_oof, "f1": oof_f1, "recall": oof_rec, "bal_acc": oof_bal
    }])
    pd.concat([folds_df, agg_row], ignore_index=True).to_csv(cv_csv, index=False)

    oof_path = OUT_PREDS / f"oof_{cv_tag}.parquet"
    pd.DataFrame({"oof_proba": oof_proba, "y_true": y}).to_parquet(oof_path, index=False)
    print(f"[CV-{exp_suffix}] Guardados: {cv_csv.name} | {oof_path.name}")

    return {
        "oof_pr_auc": oof_pr,
        "oof_roc_auc": oof_roc,
        "thr": thr_oof,
        "oof_f1": oof_f1,
        "oof_recall": oof_rec,
        "oof_bal_acc": oof_bal
    }

cv_baseline = None
cv_tuned = None

if DO_CV_BASELINE:
    cv_baseline = run_oof_cv_dl(base_hp, X_train_fit, y_train, exp_suffix="BASELINE")

if DO_CV_TUNED and "d_model" in (best_params or {}):
    cv_tuned = run_oof_cv_dl(best_params, X_train_fit, y_train, exp_suffix="TUNED")

[CV-BASELINE] Guardados: cv_summary_DL_FULL_SMOTENC_BASELINE_CV5.csv | oof_DL_FULL_SMOTENC_BASELINE_CV5.parquet
[CV-TUNED] Guardados: cv_summary_DL_FULL_SMOTENC_TUNED_CV5.csv | oof_DL_FULL_SMOTENC_TUNED_CV5.parquet


10 — Evaluación en test + guardados (curvas, importancias proxy, preds, baselines)

In [18]:
base = EXP_NAME

# BASELINE
proba_test = predict_proba(model, X_test_fit)
y_pred_test = (proba_test >= thr_val).astype(int)
test_metrics = compute_all_metrics(y_test, proba_test, thr_val)

# Guardar HP baseline
params_seed_path = OUT_PARAMS / f"{base}_BASE_seed_params.json"
with open(params_seed_path, "w", encoding="utf-8") as f:
    json.dump(base_hp, f, indent=2, ensure_ascii=False)

params_fitted_path = OUT_PARAMS / f"{base}_BASE_fitted_params.json"
with open(params_fitted_path, "w", encoding="utf-8") as f:
    json.dump(base_hp, f, indent=2, ensure_ascii=False)

# Figuras baseline
plot_pr_curve(y_val,  proba_val,  f"{base} — PR (val)",  OUT_FIGS / f"{base}_pr_val.png")
plot_pr_curve(y_test, proba_test, f"{base} — PR (test)", OUT_FIGS / f"{base}_pr_test.png")
plot_roc_curve(y_val,  proba_val,  f"{base} — ROC (val)",  OUT_FIGS / f"{base}_roc_val.png")
plot_roc_curve(y_test, proba_test, f"{base} — ROC (test)", OUT_FIGS / f"{base}_roc_test.png")
plot_confusion(y_test, y_pred_test, f"{base} — Confusion (test @thr={thr_val:.3f})", OUT_FIGS / f"{base}_cm_test.png")

# Importancias proxy
try:
    imp_df = model.feature_importance(feature_names_used)
except Exception:
    imp_df = pd.DataFrame({"feature": feature_names_used, "importance_proxy": np.zeros(len(feature_names_used))})
imp_path = OUT_RESULTS / f"{base}_feature_importances.csv"
imp_df.to_csv(imp_path, index=False)

# Preds test baseline
preds_path = OUT_PREDS / f"preds_test_{base}.parquet"
pd.DataFrame({"proba": proba_test, "y_true": y_test}).to_parquet(preds_path, index=False)

row_base = {
    "model": base,
    "thr_val": thr_val,
    "val_pr_auc": val_metrics["pr_auc"],
    "val_roc_auc": val_metrics["roc_auc"],
    "val_precision": val_metrics["precision"],
    "val_f1": val_metrics["f1"],
    "val_recall": val_metrics["recall"],
    "val_bal_acc": val_metrics["bal_acc"],
    "test_pr_auc": test_metrics["pr_auc"],
    "test_roc_auc": test_metrics["roc_auc"],
    "test_precision": test_metrics["precision"],
    "test_f1": test_metrics["f1"],
    "test_recall": test_metrics["recall"],
    "test_bal_acc": test_metrics["bal_acc"],
    "best_iteration": base_best_it if base_best_it is not None else np.nan
}
res_csv = OUT_RESULTS / "baselines.csv"
pd.DataFrame([row_base]).to_csv(res_csv, mode=("a" if res_csv.exists() else "w"), index=False, header=not res_csv.exists())

print("[OK][BASE] Guardados:\n  - Seed HPs   :", params_seed_path.name,
      "\n  - Fitted HPs :", params_fitted_path.name,
      "\n  - Importancias:", imp_path.name,
      "\n  - Preds test  :", preds_path.name,
      "\n  - Baselines   :", res_csv.name)

# TUNED
if tuned_model is not None and best_params is not None:
    proba_val_tuned = predict_proba(tuned_model, X_val_fit)
    thr_val_tuned, _ = find_best_threshold(y_val, proba_val_tuned, metric="f1")
    # Guardar predicciones de validación (tuned)
    val_tuned_path = OUT_PREDS / f"preds_val_{base}_TUNED.parquet"
    pd.DataFrame({
        "proba": proba_val_tuned,
        "y_true": y_val,
        "y_pred": (proba_val_tuned >= thr_val_tuned).astype(int)
    }).to_parquet(val_tuned_path, index=False)
    val_metrics_tuned = compute_all_metrics(y_val, proba_val_tuned, thr_val_tuned)

    proba_test_tuned = predict_proba(tuned_model, X_test_fit)
    y_pred_test_tuned = (proba_test_tuned >= thr_val_tuned).astype(int)
    test_metrics_tuned = compute_all_metrics(y_test, proba_test_tuned, thr_val_tuned)

    tuned_fitted_path = OUT_PARAMS / f"{base}_TUNED_fitted_params.json"
    with open(tuned_fitted_path, "w", encoding="utf-8") as f:
        json.dump(best_params, f, indent=2, ensure_ascii=False)

    base_t = base + "_TUNED"
    plot_pr_curve(y_val,  proba_val_tuned,  f"{base_t} — PR (val)",  OUT_FIGS / f"{base_t}_pr_val.png")
    plot_pr_curve(y_test, proba_test_tuned, f"{base_t} — PR (test)", OUT_FIGS / f"{base_t}_pr_test.png")
    plot_roc_curve(y_val,  proba_val_tuned,  f"{base_t} — ROC (val)",  OUT_FIGS / f"{base_t}_roc_val.png")
    plot_roc_curve(y_test, proba_test_tuned, f"{base_t} — ROC (test)", OUT_FIGS / f"{base_t}_roc_test.png")
    plot_confusion(y_test, y_pred_test_tuned, f"{base_t} — Confusion (test @thr={thr_val_tuned:.3f})", OUT_FIGS / f"{base_t}_cm_test.png")

    try:
        imp_t_df = tuned_model.feature_importance(feature_names_used)
    except Exception:
        imp_t_df = pd.DataFrame({"feature": feature_names_used, "importance_proxy": np.zeros(len(feature_names_used))})
    imp_t_path = OUT_RESULTS / f"{base_t}_feature_importances.csv"
    imp_t_df.to_csv(imp_t_path, index=False)

    preds_t_path = OUT_PREDS / f"preds_test_{base_t}.parquet"
    pd.DataFrame({"proba": proba_test_tuned, "y_true": y_test}).to_parquet(preds_t_path, index=False)

    row_t = {
        "model": base_t,
        "thr_val": thr_val_tuned,
        "val_pr_auc": val_metrics_tuned["pr_auc"],
        "val_roc_auc": val_metrics_tuned["roc_auc"],
        "val_precision": val_metrics_tuned["precision"],
        "val_f1": val_metrics_tuned["f1"],
        "val_recall": val_metrics_tuned["recall"],
        "val_bal_acc": val_metrics_tuned["bal_acc"],
        "test_pr_auc": test_metrics_tuned["pr_auc"],
        "test_roc_auc": test_metrics_tuned["roc_auc"],
        "test_precision": test_metrics_tuned["precision"],
        "test_f1": test_metrics_tuned["f1"],
        "test_recall": test_metrics_tuned["recall"],
        "test_bal_acc": test_metrics_tuned["bal_acc"],
        "best_iteration": best_ep if best_ep is not None else np.nan
    }
    pd.DataFrame([row_t]).to_csv(res_csv, mode="a", index=False, header=False)

    print("[OK][TUNED] Guardados:\n  - Fitted HPs :", tuned_fitted_path.name,
          "\n  - Importancias:", imp_t_path.name,
          "\n  - Preds test  :", preds_t_path.name,
          "\n  - Baselines   :", res_csv.name)

[OK][BASE] Guardados:
  - Seed HPs   : DL_FULL_SMOTENC_BASE_seed_params.json 
  - Fitted HPs : DL_FULL_SMOTENC_BASE_fitted_params.json 
  - Importancias: DL_FULL_SMOTENC_feature_importances.csv 
  - Preds test  : preds_test_DL_FULL_SMOTENC.parquet 
  - Baselines   : baselines.csv
[OK][TUNED] Guardados:
  - Fitted HPs : DL_FULL_SMOTENC_TUNED_fitted_params.json 
  - Importancias: DL_FULL_SMOTENC_TUNED_feature_importances.csv 
  - Preds test  : preds_test_DL_FULL_SMOTENC_TUNED.parquet 
  - Baselines   : baselines.csv


11 — Mejores resultados + resumen CV (formato similar a tu XGB)

In [19]:
AGGREGATE_ALL_RUNS = False

def safe(v, fmt=".4f"):
    try:
        return f"{float(v):{fmt}}"
    except Exception:
        return "NA"

base_csv = OUT_RESULTS / "baselines.csv"
if not base_csv.exists():
    raise FileNotFoundError(f"No existe {base_csv}")

df = pd.read_csv(base_csv)

needed = [
    "model","thr_val",
    "val_pr_auc","val_roc_auc","val_precision","val_f1","val_recall","val_bal_acc",
    "test_pr_auc","test_roc_auc","test_precision","test_f1","test_recall","test_bal_acc",
    "best_iteration"
]
for c in needed:
    if c not in df.columns:
        df[c] = pd.NA 

df = df[needed].copy()

num_cols = [c for c in needed if c not in ("model",)]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

if AGGREGATE_ALL_RUNS:
    root_art = ARTIF_DIR.parent
    for p in (root_art).glob("DL_*/results/baselines.csv"):
        if p == base_csv:
            continue
        try:
            d2 = pd.read_csv(p)
            for c in needed:
                if c not in d2.columns:
                    d2[c] = pd.NA
            d2 = d2[needed]
            for c in num_cols:
                d2[c] = pd.to_numeric(d2[c], errors="coerce")
            df = pd.concat([df, d2], ignore_index=True)
        except Exception:
            pass

if df.empty:
    raise ValueError("El dataframe de resultados está vacío.")

df = df.drop_duplicates(subset=["model"], keep="last").copy()

def best_by(metric):
    if metric not in df.columns or df[metric].dropna().empty:
        return None
    r = df.loc[df[metric].idxmax()]
    print(
        f"- {metric}: {r['model']} | "
        f"PR-AUC={safe(r['test_pr_auc'])} | "
        f"ROC-AUC={safe(r['test_roc_auc'])} | "
        f"F1={safe(r['test_f1'])} | "
        f"Recall={safe(r['test_recall'])} | "
        f"Precision={safe(r['test_precision'])} | "
        f"thr(val)={safe(r['thr_val'], '.3f')} | "
        f"best_iter={int(r['best_iteration']) if pd.notna(r['best_iteration']) else 'NA'}"
    )
    return r

print("=== MEJORES EN TEST (por métrica) ===")
winners = {}
for m in ["test_pr_auc","test_roc_auc","test_recall","test_f1","test_precision"]:
    w = best_by(m)
    if w is not None:
        winners[m] = w

cv_files = list(OUT_RESULTS.glob("cv_summary_*_CV*.csv"))
if cv_files:
    print("=== RESUMEN CV-OOF (por experimento) ===")
    rows = []
    for f in cv_files:
        tag = re.sub(r"^cv_summary_|\.csv$", "", f.name)
        cv = pd.read_csv(f)
        oof = cv.loc[cv["fold"] == "OOF"]
        if not oof.empty:
            r = oof.iloc[0]
            rows.append({
                "tag": tag,
                "pr_auc": r.get("pr_auc"),
                "roc_auc": r.get("roc_auc"),
                "f1": r.get("f1"),
                "recall": r.get("recall"),
                "bal_acc": r.get("bal_acc"),
                "thr": r.get("thr"),
            })
    if rows:
        print(pd.DataFrame(rows).sort_values(["pr_auc","roc_auc"], ascending=False).to_string(index=False))
else:
    print("(No se hallaron archivos de CV para este experimento)")

# Normalización/backup
backup = OUT_RESULTS / "baselines_legacy_backup.csv"
base_csv.replace(backup)
df.to_csv(base_csv, index=False)
print("[OK] Normalizado. Backup:", backup.name)

=== MEJORES EN TEST (por métrica) ===
- test_pr_auc: DL_FULL_SMOTENC_TUNED | PR-AUC=0.7045 | ROC-AUC=0.8584 | F1=0.6183 | Recall=0.6069 | Precision=0.6301 | thr(val)=0.710 | best_iter=31
- test_roc_auc: DL_FULL_SMOTENC_TUNED | PR-AUC=0.7045 | ROC-AUC=0.8584 | F1=0.6183 | Recall=0.6069 | Precision=0.6301 | thr(val)=0.710 | best_iter=31
- test_recall: DL_FULL_SMOTENC_TUNED | PR-AUC=0.7045 | ROC-AUC=0.8584 | F1=0.6183 | Recall=0.6069 | Precision=0.6301 | thr(val)=0.710 | best_iter=31
- test_f1: DL_FULL_SMOTENC_TUNED | PR-AUC=0.7045 | ROC-AUC=0.8584 | F1=0.6183 | Recall=0.6069 | Precision=0.6301 | thr(val)=0.710 | best_iter=31
- test_precision: DL_FULL_SMOTENC | PR-AUC=0.6995 | ROC-AUC=0.8515 | F1=0.6069 | Recall=0.5405 | Precision=0.6918 | thr(val)=0.814 | best_iter=21
=== RESUMEN CV-OOF (por experimento) ===
                         tag   pr_auc  roc_auc       f1   recall  bal_acc   thr
   DL_FULL_SMOTENC_TUNED_CV5 0.676418 0.845732 0.608551 0.599346 0.752258 0.683
DL_FULL_SMOTENC_BASELI