# Proyecto 2 — Pipeline con **RFE** (ULTRA-RÁPIDO)
Versión optimizada para tiempo: **CV=2**, grids mínimos, y RFE con pocas opciones.
- Mantiene **RFE** en los 4 modelos y **SMOTE dentro de CV** (sin fugas).
- Pensado para terminar en ~10–15 minutos en CPU promedio.

**Entrada:** `creditcard.csv` en la misma carpeta.


## 1) Setup e importaciones

In [11]:
# !pip install -q scikit-learn imbalanced-learn pandas numpy
import os, numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
RANDOM_STATE = 42
CV_FOLDS = 2


## 2) Utilidades

In [13]:
def cargar_datos(path_csv: str = "dataset/creditcard.csv") -> pd.DataFrame:
    df = pd.read_csv(path_csv)
    return df

def construir_preprocesador(feature_names):
    base_cols = ["Time", "Amount"]
    pca_cols = [c for c in feature_names if c.startswith("V")]
    return ColumnTransformer([
    ('all_features', StandardScaler(), base_cols + pca_cols)
])

def metricas_desde_confusion(cm):
    tn, fp, fn, tp = cm.ravel()
    acc = (tp + tn) / (tp + tn + fp + fn + 1e-12)
    sensibilidad = tp / (tp + fn + 1e-12)
    especificidad = tn / (tn + fp + 1e-12)
    precision = tp / (tp + fp + 1e-12)
    f1 = 2 * precision * sensibilidad / (precision + sensibilidad + 1e-12)
    return dict(exactitud=acc, sensibilidad=sensibilidad, especificidad=especificidad,
                precision=precision, f1=f1, tp=int(tp), fp=int(fp), tn=int(tn), fn=int(fn))

def evaluar_modelo(nombre, y_true, y_pred, y_score=None):
    print(f"\n=== {nombre} ===")
    print(classification_report(y_true, y_pred, digits=4))
    cm = confusion_matrix(y_true, y_pred)
    print("Matriz de confusión:\n", cm)
    resumen = metricas_desde_confusion(cm)
    if y_score is not None:
        try:
            auc = roc_auc_score(y_true, y_score)
            resumen["roc_auc"] = auc
            print(f"ROC-AUC: {auc:.4f}")
        except Exception as e:
            print("No fue posible calcular ROC-AUC:", e)
    return resumen


## 3) Pipelines con RFE (grids mínimos, CV=2)

In [14]:
from sklearn.pipeline import Pipeline

def pipeline_knn_rfe(feature_names):
    pre = construir_preprocesador(feature_names)
    rfe_est = LogisticRegression(solver="liblinear", max_iter=200, random_state=RANDOM_STATE)
    rfe = RFE(estimator=rfe_est, n_features_to_select=3, step=5)  # step aumentado a 5 (antes 2) para acelerar RFE
    knn = KNeighborsClassifier()
    pipe = ImbPipeline([("pre", pre), ("smote", SMOTE(random_state=RANDOM_STATE, k_neighbors=5)), ("rfe", rfe), ("clf", knn)])
    param_dist = {"rfe__n_features_to_select": [10, 15], "clf__n_neighbors": [5], "clf__metric": ["euclidean"]}
    # Modificación: usar RandomizedSearchCV (n_iter=2) en lugar de GridSearchCV para acelerar la búsqueda
    # Espacio de búsqueda reducido: métrica solo 'euclidean' (antes también 'manhattan'),
    # n_neighbors fijo en 5, y RFE probando con 10 o 15 características en lugar de 16 o 20.
    cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    return RandomizedSearchCV(pipe, param_dist, scoring="f1", cv=cv, n_iter=2, random_state=RANDOM_STATE, n_jobs=-1, refit=True, verbose=1)

def pipeline_arbol_rfe(feature_names):
    pre = construir_preprocesador(feature_names)
    rfe = RFE(estimator=DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=15), n_features_to_select=3, step=5)  # step aumentado a 5 (antes 2), limitamos profundidad a 15 en RFE
    dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
    pipe = ImbPipeline([("pre", pre), ("smote", SMOTE(random_state=RANDOM_STATE, k_neighbors=5)), ("rfe", rfe), ("clf", dt)])
    param_dist = {"rfe__n_features_to_select": [10, 15], "clf__max_depth": [15], "clf__min_samples_leaf": [1, 2]}
    # Modificación: usar RandomizedSearchCV (n_iter=3) en lugar de GridSearchCV para acelerar la búsqueda
    # Espacio de búsqueda reducido: max_depth fijo en 15 (antes se probaba None y 15),
    # y RFE probando con 10 o 15 características en lugar de 16 o 20 (min_samples_leaf sigue en [1,2]).
    cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    return RandomizedSearchCV(pipe, param_dist, scoring="f1", cv=cv, n_iter=3, random_state=RANDOM_STATE, n_jobs=-1, refit=True, verbose=1)

def pipeline_svm_rfe(feature_names):
    pre = construir_preprocesador(feature_names)
    rfe = RFE(estimator=LinearSVC(random_state=RANDOM_STATE, tol=1e-3, C=1.0), n_features_to_select=3, step=5)  # step aumentado a 5 (antes 2) para acelerar RFE
    svm = LinearSVC(random_state=RANDOM_STATE)
    pipe = ImbPipeline([("pre", pre), ("smote", SMOTE(random_state=RANDOM_STATE, k_neighbors=5)), ("rfe", rfe), ("clf", svm)])
    param_dist = {"rfe__n_features_to_select": [10, 15], "clf__C": [1], "clf__tol": [1e-3]}
    # Modificación: usar RandomizedSearchCV (n_iter=2) en lugar de GridSearchCV para acelerar la búsqueda
    # Espacio de búsqueda reducido: tol fijo en 1e-3 (antes 1e-3 y 1e-4),
    # y RFE probando con 10 o 15 características en lugar de 16 o 20 (C=1 fijo).
    cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    return RandomizedSearchCV(pipe, param_dist, scoring="f1", cv=cv, n_iter=2, random_state=RANDOM_STATE, n_jobs=-1, refit=True, verbose=1)

def pipeline_rf_rfe(feature_names):
    pre = construir_preprocesador(feature_names)
    rfe = RFE(estimator=RandomForestClassifier(n_estimators=100, max_depth=15, random_state=RANDOM_STATE), n_features_to_select=3, step=5)  # step aumentado a 5, n_estimators=100 y max_depth=15 en RFE
    rf = RandomForestClassifier(random_state=RANDOM_STATE)
    pipe = ImbPipeline([("pre", pre), ("smote", SMOTE(random_state=RANDOM_STATE, k_neighbors=5)), ("rfe", rfe), ("clf", rf)])
    param_dist = {"rfe__n_features_to_select": [10, 15], "clf__n_estimators": [100], "clf__max_depth": [15], "clf__min_samples_leaf": [1, 2]}
    # Modificación: usar RandomizedSearchCV (n_iter=3) en lugar de GridSearchCV para acelerar la búsqueda
    # Espacio de búsqueda reducido: n_estimators fijo en 100 (antes 200) y max_depth fijo en 15 (antes None y 15),
    # y RFE probando con 10 o 15 características en lugar de 16 o 20 (min_samples_leaf sigue [1,2]).
    cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    return RandomizedSearchCV(pipe, param_dist, scoring="f1", cv=cv, n_iter=3, random_state=RANDOM_STATE, n_jobs=-1, refit=True, verbose=1)


## 4) Carga y split 70/30

In [15]:
# === Cargar datos y split ===
path_csv = "dataset/creditcard.csv"
if not os.path.exists(path_csv):
    raise FileNotFoundError("No se encontró creditcard.csv (Kaggle: mlg-ulb/creditcardfraud).")
df = cargar_datos(path_csv)
feature_cols = ["Time", "Amount"] + [f"V{i}" for i in range(1, 29)]
X = df[feature_cols].copy()
y = df["Class"].astype(int).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE)
print("Información del dataset:")
print(f"Dimensiones: {df.shape}")
print(f"Distribución de clases: {df['Class'].value_counts()}")
print(f"Porcentaje de fraude: {df['Class'].mean()*100:.4f}%")
print(f"Valores nulos: {df.isnull().sum().sum()}")
print(f"Fraude total: {y.mean()*100:.4f}% | Train: {y_train.mean()*100:.4f}% | Test: {y_test.mean()*100:.4f}%")


Información del dataset:
Dimensiones: (284807, 31)
Distribución de clases: Class
0    284315
1       492
Name: count, dtype: int64
Porcentaje de fraude: 0.1727%
Valores nulos: 0
Fraude total: 0.1727% | Train: 0.1725% | Test: 0.1732%


## 5) Entrenamiento + evaluación + resumen

In [16]:
def extraer_caracteristicas(grid, nombre_modelo):
    best_pipe = grid.best_estimator_
    rfe = best_pipe.named_steps["rfe"]
    seleccionadas = [f for f, s in zip(feature_cols, rfe.support_) if s]
    print(f"\n {nombre_modelo}:")
    print("Características seleccionadas:", seleccionadas)
    return seleccionadas



# *Tuning con muestra + RFE acelerado*
- Usa **una muestra estratificada** del *train* para **tuning** (RFE + hiperparámetros).
- Usa **RandomizedSearchCV** con **grids compactos**.
- **RFE con `step=5`**
- **SMOTE dentro de CV**
- Luego **reentrena el mejor pipeline** con **todo el conjunto de entrenamiento** y evalúa en *test*.

> Si ya tienes definidas variables como `X_train, X_test, y_train, y_test`, y `feature_cols`, esta sección las reutiliza.


In [17]:

# =========================
# Configuración de tuning reducido
# =========================
FRACTION_TUNING = 0.20     # 20% del train para búsquedas
RANDOM_STATE    = 42
CV_FOLDS        = 2        # rápido y razonable

import numpy as np, pandas as pd, os
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Utilidades
def construir_preprocesador(feature_names):
    base_cols = ["Time", "Amount"]
    pca_cols  = [c for c in feature_names if c.startswith("V")]
    return ColumnTransformer([('num', StandardScaler(), base_cols + pca_cols)])

def stratified_sample_for_tuning(X, y, fraction=FRACTION_TUNING, random_state=RANDOM_STATE):
    X_tune, _, y_tune, _ = train_test_split(
        X, y, train_size=fraction, stratify=y, random_state=random_state
    )
    print(f"[TUNING] Usando {len(y_tune)} ejemplos ({fraction*100:.0f}%) para búsqueda")
    return X_tune, y_tune

def evaluar_modelo(nombre, y_true, y_pred, y_score=None):
    print(f"\n=== {nombre} ===")
    print(classification_report(y_true, y_pred, digits=4))
    cm = confusion_matrix(y_true, y_pred)
    print("Matriz de confusión:\n", cm)
    try:
        if y_score is not None:
            auc = roc_auc_score(y_true, y_score)
            print(f"ROC-AUC: {auc:.4f}")
    except Exception:
        pass


In [18]:

# =========================
# Comprobación de prerequisitos
# =========================
# Se espera que ya existan: X_train, X_test, y_train, y_test, feature_cols
# Si no existen, intentamos construirlos desde creditcard.csv en ./dataset
need_data = any(v not in globals() for v in ["X_train","X_test","y_train","y_test","feature_cols"])

if need_data:
    print("[INFO] No se detectaron variables previas: creando split 70/30 desde dataset/creditcard.csv")
    import pandas as pd
    path_csv = "dataset/creditcard.csv"
    if not os.path.exists(path_csv):
        raise FileNotFoundError("No se encontró dataset/creditcard.csv. Descárgalo de Kaggle: mlg-ulb/creditcardfraud")
    df = pd.read_csv(path_csv)
    feature_cols = ["Time", "Amount"] + [f"V{i}" for i in range(1, 29)]
    X = df[feature_cols].copy()
    y = df["Class"].astype(int).values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
    )
    print("Split creado. Fraude en train/test:", y_train.mean(), y_test.mean())

cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
pre = construir_preprocesador(feature_cols)
X_tune, y_tune = stratified_sample_for_tuning(X_train, y_train, FRACTION_TUNING, RANDOM_STATE)


[TUNING] Usando 39872 ejemplos (20%) para búsqueda


In [23]:
# =========================
# BÚSQUEDAS sobre la muestra (RFE + RandomizedSearchCV)
# =========================
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# --- k-NN ---
rfe_knn = RFE(
    estimator=LogisticRegression(solver="liblinear", max_iter=200, random_state=RANDOM_STATE),
    n_features_to_select=12, step=5
)
pipe_knn = ImbPipeline([
    ("pre",  pre),
    ("sm",   SMOTE(random_state=RANDOM_STATE, k_neighbors=5)),
    ("rfe",  rfe_knn),
    ("clf",  KNeighborsClassifier())
])
dist_knn = {
    "rfe__n_features_to_select": [10, 12, 15],
    "clf__n_neighbors": [5, 7],
    "clf__metric": ["euclidean"]
}
search_knn = RandomizedSearchCV(
    pipe_knn, dist_knn, n_iter=4, cv=cv, scoring="f1",
    n_jobs=-1, random_state=RANDOM_STATE, refit=True, verbose=1
)
search_knn.fit(X_tune, y_tune)

# --- Árbol ---
rfe_dt = RFE(
    estimator=DecisionTreeClassifier(max_depth=15, random_state=RANDOM_STATE),
    n_features_to_select=12, step=5
)
pipe_dt = ImbPipeline([
    ("pre", pre),
    ("sm",  SMOTE(random_state=RANDOM_STATE, k_neighbors=5)),
    ("rfe", rfe_dt),
    ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE))
])
dist_dt = {
    "rfe__n_features_to_select": [10, 12, 15],
    "clf__max_depth": [15],
    "clf__min_samples_leaf": [1, 2]
}
search_dt = RandomizedSearchCV(
    pipe_dt, dist_dt, n_iter=4, cv=cv, scoring="f1",
    n_jobs=-1, random_state=RANDOM_STATE, refit=True, verbose=1
)
search_dt.fit(X_tune, y_tune)

# --- SVM lineal ---
rfe_svm = RFE(
    estimator=LinearSVC(random_state=RANDOM_STATE, tol=1e-3, C=1.0),
    n_features_to_select=12, step=5
)
pipe_svm = ImbPipeline([
    ("pre", pre),
    ("sm",  SMOTE(random_state=RANDOM_STATE, k_neighbors=5)),
    ("rfe", rfe_svm),
    ("clf", LinearSVC(random_state=RANDOM_STATE))
])
dist_svm = {
    "rfe__n_features_to_select": [10, 12, 15],
    "clf__C": [0.5, 1.0],
    "clf__tol": [1e-3]
}
search_svm = RandomizedSearchCV(
    pipe_svm, dist_svm, n_iter=3, cv=cv, scoring="f1",
    n_jobs=-1, random_state=RANDOM_STATE, refit=True, verbose=1
)
search_svm.fit(X_tune, y_tune)

# --- Random Forest ---
rfe_rf = RFE(
    estimator=RandomForestClassifier(n_estimators=100, max_depth=15, random_state=RANDOM_STATE),
    n_features_to_select=12, step=5
)
pipe_rf = ImbPipeline([
    ("pre", pre),
    ("sm",  SMOTE(random_state=RANDOM_STATE, k_neighbors=5)),
    ("rfe", rfe_rf),
    ("clf", RandomForestClassifier(random_state=RANDOM_STATE))
])
dist_rf = {
    "rfe__n_features_to_select": [15,18,20],
    "clf__n_estimators": [100],
    "clf__max_depth": [15],
    "clf__min_samples_leaf": [1, 2]
}
search_rf = RandomizedSearchCV(
    pipe_rf, dist_rf, n_iter=4, cv=cv, scoring="f1",
    n_jobs=-1, random_state=RANDOM_STATE, refit=True, verbose=1
)
search_rf.fit(X_tune, y_tune)

print("\nMejores params (tuning reducido):")
print("kNN:", search_knn.best_params_)
print("DT :", search_dt.best_params_)
print("SVM:", search_svm.best_params_)
print("RF :", search_rf.best_params_)


Fitting 2 folds for each of 4 candidates, totalling 8 fits
Fitting 2 folds for each of 4 candidates, totalling 8 fits
Fitting 2 folds for each of 3 candidates, totalling 6 fits
Fitting 2 folds for each of 4 candidates, totalling 8 fits

Mejores params (tuning reducido):
kNN: {'rfe__n_features_to_select': 12, 'clf__n_neighbors': 5, 'clf__metric': 'euclidean'}
DT : {'rfe__n_features_to_select': 12, 'clf__min_samples_leaf': 1, 'clf__max_depth': 15}
SVM: {'rfe__n_features_to_select': 15, 'clf__tol': 0.001, 'clf__C': 1.0}
RF : {'rfe__n_features_to_select': 20, 'clf__n_estimators': 100, 'clf__min_samples_leaf': 2, 'clf__max_depth': 15}


In [25]:
mejores_caracteristicas = {
    "kNN": extraer_caracteristicas(search_knn, "k-NN"),
    "DT":  extraer_caracteristicas(search_dt, "Árbol"),
    "SVM": extraer_caracteristicas(search_svm, "SVM lineal"),
    "RF":  extraer_caracteristicas(search_rf, "Random Forest"),
}
len(mejores_caracteristicas["kNN"]), len(mejores_caracteristicas["DT"]), len(mejores_caracteristicas["SVM"]), len(mejores_caracteristicas["RF"])


 k-NN:
Características seleccionadas: ['V1', 'V3', 'V4', 'V7', 'V8', 'V9', 'V13', 'V14', 'V15', 'V18', 'V19', 'V28']

 Árbol:
Características seleccionadas: ['V1', 'V3', 'V4', 'V7', 'V14', 'V15', 'V16', 'V18', 'V19', 'V23', 'V26', 'V28']

 SVM lineal:
Características seleccionadas: ['V1', 'V2', 'V3', 'V4', 'V7', 'V8', 'V9', 'V13', 'V14', 'V15', 'V16', 'V18', 'V19', 'V20', 'V26']

 Random Forest:
Características seleccionadas: ['V1', 'V2', 'V3', 'V4', 'V5', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V21', 'V26', 'V28']


(12, 12, 15, 20)

In [None]:
# =========================
# Reentrenar cada mejor pipeline con TODO el train
# =========================
best_knn = search_knn.best_estimator_.set_params(**search_knn.best_params_)
best_dt  = search_dt.best_estimator_.set_params(**search_dt.best_params_)
best_svm = search_svm.best_estimator_.set_params(**search_svm.best_params_)
best_rf  = search_rf.best_estimator_.set_params(**search_rf.best_params_)

best_knn.fit(X_train, y_train)
best_dt.fit(X_train, y_train)
best_svm.fit(X_train, y_train)
best_rf.fit(X_train, y_train)

def preds_scores(model, X):
    try:
        s = model.predict_proba(X)[:, 1]
    except Exception:
        s = model.decision_function(X)
    return model.predict(X), s

y_pred_knn, s_knn = preds_scores(best_knn, X_test)
y_pred_dt,  s_dt  = preds_scores(best_dt,  X_test)
y_pred_svm, s_svm = preds_scores(best_svm, X_test)
y_pred_rf,  s_rf  = preds_scores(best_rf,  X_test)

# =========================
# Evaluar en test
# =========================

evaluar_modelo("kNN (final)", y_test, y_pred_knn, s_knn)
evaluar_modelo("Árbol (final)", y_test, y_pred_dt, s_dt)
evaluar_modelo("SVM lineal (final)", y_test, y_pred_svm, s_svm)
evaluar_modelo("Random Forest (final)", y_test, y_pred_rf, s_rf)



=== kNN (final) ===
              precision    recall  f1-score   support

           0     0.9997    0.9977    0.9987     85295
           1     0.3820    0.8311    0.5234       148

    accuracy                         0.9974     85443
   macro avg     0.6908    0.9144    0.7610     85443
weighted avg     0.9986    0.9974    0.9979     85443

Matriz de confusión:
 [[85096   199]
 [   25   123]]
ROC-AUC: 0.9216

=== Árbol (final) ===
              precision    recall  f1-score   support

           0     0.9996    0.9946    0.9971     85295
           1     0.1983    0.7703    0.3154       148

    accuracy                         0.9942     85443
   macro avg     0.5989    0.8824    0.6562     85443
weighted avg     0.9982    0.9942    0.9959     85443

Matriz de confusión:
 [[84834   461]
 [   34   114]]
ROC-AUC: 0.8841

=== SVM lineal (final) ===
              precision    recall  f1-score   support

           0     0.9998    0.9808    0.9902     85295
           1     0.0725    

In [None]:

# ===============================
# Split Train/Test + Evaluación
# ===============================
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import numpy as np

# ---- 1) Chequeos básicos
if 'X' not in globals() or 'y' not in globals():
    raise RuntimeError("No se encuentran las variables 'X' y 'y' en el entorno. Define X (features) e y (etiquetas) antes de ejecutar este bloque.")

# ---- 2) Split estratificado
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("✅ Split listo ->",
      f"X_train: {getattr(X_train, 'shape', None)}, X_test: {getattr(X_test, 'shape', None)}",
      f"y_train: {np.shape(y_train)}, y_test: {np.shape(y_test)}")

# ---- 3) Preparar preprocesamiento (si existe)
preprocesador = None
for name in ['preprocesador_optimo', 'preprocesador', 'preprocess', 'preprocessor']:
    if name in globals():
        preprocesador = globals()[name]
        print(f"🔧 Usando preprocesador: {name}")
        break

def _to_dense(Xm):
    try:
        import scipy.sparse as sp
        return Xm.toarray() if sp.issparse(Xm) else Xm
    except Exception:
        return Xm

if preprocesador is not None:
    X_train_prep = preprocesador.fit_transform(X_train)
    X_test_prep  = preprocesador.transform(X_test)
else:
    # Si no hay preprocesador, se usan los datos tal cual
    X_train_prep, X_test_prep = X_train, X_test

# ---- 4) Detectar pipeline o modelo
modelo = None
pipeline_candidates = ['pipe_optimo', 'pipeline_optimo', 'pipe', 'pipeline', 'modelo_optimo']
for name in pipeline_candidates:
    if name in globals():
        obj = globals()[name]
        if hasattr(obj, 'fit') and hasattr(obj, 'predict'):
            modelo = obj
            print(f"🤖 Usando pipeline/modelo: {name}")
            break

if modelo is None:
    model_candidates = ['k-NN', 'dt', 'rf', 'svm', 'knn', 'clf', 'modelo', 'clasificador']
    for name in model_candidates:
        if name in globals():
            obj = globals()[name]
            if hasattr(obj, 'fit') and hasattr(obj, 'predict'):
                modelo = obj
                print(f"🤖 Usando modelo: {name}")
                break

if modelo is None:
    raise RuntimeError(
        "No se encontró un pipeline/modelo con métodos .fit/.predict. "
        "Crea una variable como 'pipe_optimo' o 'clf' antes de ejecutar este bloque."
    )

# ---- 5) Entrenar y evaluar
_ = modelo.fit(X_train_prep, y_train)
y_pred_tr = modelo.predict(X_train_prep)
y_pred_te = modelo.predict(X_test_prep)

def _maybe_proba(estimator, Xmat):
    # Intenta usar predict_proba o decision_function si existen
    if hasattr(estimator, "predict_proba"):
        try:
            proba = estimator.predict_proba(Xmat)
            if proba is not None and proba.ndim == 2 and proba.shape[1] >= 2:
                return proba[:, 1]
        except Exception:
            pass
    if hasattr(estimator, "decision_function"):
        try:
            scores = estimator.decision_function(Xmat)
            # convertir a [0,1] si es necesario, pero aquí dejamos raw scores para AUC
            return scores
        except Exception:
            pass
    return None

y_score_tr = _maybe_proba(modelo, X_train_prep)
y_score_te = _maybe_proba(modelo, X_test_prep)

def _metrics(y_true, y_pred, y_score=None, title=""):
    acc = accuracy_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred, average="binary" if len(np.unique(y_true))==2 else "macro")
    print(f"\n== {title} ==")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score: {f1:.4f}")
    if y_score is not None and len(np.unique(y_true))==2:
        try:
            auc = roc_auc_score(y_true, y_score)
            print(f"ROC AUC: {auc:.4f}")
        except Exception:
            pass
    print("\nReporte de clasificación:")
    print(classification_report(y_true, y_pred))

_metrics(y_train, y_pred_tr, y_score_tr, title="Métricas en TRAIN")
_metrics(y_test,  y_pred_te, y_score_te, title="Métricas en TEST")

print("\n✨ Listo. Conjunto de test agregado y evaluado.")


✅ Split listo -> X_train: (227845, 30), X_test: (56962, 30) y_train: (227845,), y_test: (56962,)
🔧 Usando preprocesador: preprocesador


RuntimeError: No se encontró un pipeline/modelo con métodos .fit/.predict. Crea una variable como 'pipe_optimo' o 'clf' antes de ejecutar este bloque.