In [1]:
# === Imports y configuración ===
import warnings, math, time
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pathlib import Path
from tempfile import mkdtemp
from joblib import Memory

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, average_precision_score,
    precision_score, recall_score, f1_score, accuracy_score
)

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC, ADASYN
from imblearn.under_sampling import RandomUnderSampler

# Single seed passed to the train/test split, the CV splitter, the samplers and TruncatedSVD below.
RANDOM_STATE = 42
SCORING = "average_precision"   # CV metric (PR-AUC), robust under class imbalance
# joblib cache rooted in a temporary directory; mkdtemp() creates a new one each time this cell runs.
memory = Memory(location=mkdtemp(), verbose=0)
# Also seed NumPy's legacy global RNG.
np.random.seed(RANDOM_STATE)


In [2]:
# === Load the cleaned dataset ===
# Use the path that is known to work on your machine.
# NOTE(review): this is an absolute, user-specific Windows path; the notebook will not
# run elsewhere until it is replaced by a configurable/relative data directory.
data_path = Path(r"C:\Users\migue\Downloads\proyecto_final\proyecto_final\docs\data_limpia.pkl")

assert data_path.exists(), f"No se encuentra el archivo: {data_path}"
# NOTE(review): unpickling can execute arbitrary code; only load files you produced yourself.
df_final = pd.read_pickle(data_path)

# === Map the target to 0/1 when it arrives as strings ===
# Only the labels "healthy" and "diseased" are mapped; presumably any other label would
# become NaN and make the int cast fail — confirm the label set upstream.
if df_final["target"].dtype == "O":
    df_final["target"] = df_final["target"].map({"healthy": 0, "diseased": 1}).astype(int)

# Quick sanity output: dataset shape and class proportions, then a two-row preview.
print("Shape:", df_final.shape)
print("Balance de clases:", df_final["target"].value_counts(normalize=True).round(3).to_dict())
df_final.head(2)


Shape: (100000, 38)
Balance de clases: {0: 0.701, 1: 0.299}


Unnamed: 0,age,bmi,waist_size,blood_pressure,cholesterol,heart_rate,glucose,insulin,sleep_hours,work_hours,...,insurance,family_history,sunlight_exposure,alcohol_consumption,smoking_level,education_level,job_type,caffeine_intake,pet_owner,target
0,56,18.915925,72.16513,118.264254,214.580523,60.749825,103.008176,16.070535,6.475885,7.671313,...,No,No,High,DS,Non-smoker,PhD,Tech,Moderate,Yes,0
1,69,36.716278,85.598889,117.917986,115.794002,66.463696,116.905134,10.131597,8.42841,9.515198,...,No,Yes,High,Regularly,Light,High School,Office,High,No,0


In [3]:
# === Train/test split and column typing ===
y = df_final["target"].copy()
X = df_final.drop(columns=["target"]).copy()

# Categorical-like columns: object / category / bool.
# `isinstance(dtype, pd.CategoricalDtype)` replaces the deprecated
# `pd.api.types.is_categorical_dtype`.
cat_cols = [
    c for c in X.columns
    if (pd.api.types.is_object_dtype(X[c])
        or isinstance(X[c].dtype, pd.CategoricalDtype)
        or pd.api.types.is_bool_dtype(X[c]))
]

# Numeric columns: float/int. pandas reports bool as numeric too, so anything already
# classified as categorical is excluded; otherwise a bool column would be fed to both
# the scaler and the one-hot encoder.
_cat_col_set = set(cat_cols)
numerical_cols = [
    c for c in X.columns
    if pd.api.types.is_numeric_dtype(X[c]) and c not in _cat_col_set
]
assert not (set(numerical_cols) & _cat_col_set), "a column is typed as both numeric and categorical"

print(f"# Numéricas: {len(numerical_cols)} | # Categóricas: {len(cat_cols)}")

# Stratified hold-out: 25% test, class proportions preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)
print(X_train.shape, X_test.shape, y_train.mean().round(3), y_test.mean().round(3))

# Stratified CV splitter (used in ALL the grid searches below).
cv_final = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


# Numéricas: 20 | # Categóricas: 17
(75000, 37) (25000, 37) 0.299 0.299


In [4]:
# === Utils: predict_proba, report, selección de umbral OOF (sin leakage) ===
from sklearn.model_selection import cross_val_predict

def predict_proba_safe(model, X):
    """Return one positive-class score per row of ``X``.

    ``model.predict_proba`` is preferred (its column 1 is returned). When the
    model lacks it, ``model.decision_function`` is used instead and its output
    is min-max rescaled to [0, 1] using the scores of the batch being passed
    in, so those values are rankings rather than calibrated probabilities.

    Raises:
        AttributeError: if the model exposes neither method.
    """
    if not hasattr(model, "predict_proba"):
        if not hasattr(model, "decision_function"):
            raise AttributeError("El modelo no expone predict_proba ni decision_function.")
        # Imported lazily: only needed on the decision_function fallback path.
        from sklearn.preprocessing import MinMaxScaler
        raw_scores = model.decision_function(X)
        as_column = raw_scores.reshape(-1, 1)
        rescaled = MinMaxScaler().fit_transform(as_column)
        return rescaled.ravel()

    class_proba = model.predict_proba(X)
    return class_proba[:, 1]

def report_results(model, X, y, title="Resultados", threshold=0.5):
    """Print a classification summary for ``model`` evaluated on ``(X, y)``.

    Scores come from ``predict_proba_safe``; rows whose score is >= ``threshold``
    are labelled 1. The classification report, the confusion matrix and the
    threshold-free ROC-AUC / PR-AUC are printed. Nothing is returned.
    The printed headings always say "(test)", whatever data is passed in.
    """
    scores = predict_proba_safe(model, X)
    predicted = (scores >= threshold).astype(int)

    print(f"\n=== {title} ===\n")
    print("Reporte de clasificación (test):")
    print(classification_report(y, predicted, digits=4))
    print("Matriz de confusión (test):")
    print(confusion_matrix(y, predicted))

    # ROC-AUC is best-effort: any failure is reported as NaN instead of aborting the report.
    try:
        roc_auc = roc_auc_score(y, scores)
    except Exception:
        roc_auc = np.nan
    pr_auc = average_precision_score(y, scores)
    print(f"ROC-AUC: {roc_auc:.4f} | PR-AUC: {pr_auc:.4f}")

def find_best_threshold_from_proba(y_true, p, metric="f1",
                                   t_grid=None, min_precision=None):
    """Pick the decision threshold that maximises F1 on held-out scores.

    Args:
        y_true: true binary labels aligned with ``p``.
        p: positive-class scores (array-like); rows with ``p >= t`` are labelled 1.
        metric: objective to maximise; only ``"f1"`` is implemented.
        t_grid: candidate thresholds. Defaults to 91 evenly spaced values in
            [0.05, 0.95], which keeps the extremes 0 and 1 out of the search.
        min_precision: if given, thresholds whose precision is below this value
            are skipped. Labelling every row positive yields a precision equal
            to the positive rate of ``y_true``, so a value at or below that rate
            cannot rule out the "everything is positive" threshold.

    Returns:
        ``(best_t, best_value)``. When no threshold qualifies (empty grid or all
        candidates rejected by ``min_precision``) the result is ``(None, -inf)``.

    Raises:
        ValueError: if ``metric`` is not ``"f1"``. This is checked up front so an
            unsupported metric is reported even when no candidate reaches scoring.
    """
    if metric != "f1":
        raise ValueError("Solo 'f1' implementado.")

    # Accept lists / Series as well as arrays for the element-wise comparison below.
    p = np.asarray(p)
    if t_grid is None:
        t_grid = np.linspace(0.05, 0.95, 91)  # avoids 0 and 1

    best_t, best_v = None, -np.inf
    for t in t_grid:
        y_hat = (p >= t).astype(int)
        if (min_precision is not None
                and precision_score(y_true, y_hat, zero_division=0) < min_precision):
            continue
        # zero_division=0 yields the same 0.0 as the default, without the warning.
        v = f1_score(y_true, y_hat, zero_division=0)
        # Strict ">" keeps the lowest threshold among ties.
        if v > best_v:
            best_v, best_t = v, t
    return best_t, best_v

def fit_with_val_threshold(grid, X_train, y_train, metric="f1",
                           min_precision=0.20):
    """Fit a grid search, then choose a threshold from out-of-fold scores.

    Steps:
      1. Fit ``grid`` on the training data (only this step is timed).
      2. Run ``cross_val_predict`` on ``grid.best_estimator_`` with ``cv=grid.cv``
         to obtain out-of-fold positive-class probabilities.
      3. Select t* with ``find_best_threshold_from_proba`` on those scores,
         subject to ``min_precision``; the test set is never touched.

    Returns:
        ``(best_estimator, t_star, fit_seconds, best_params, best_cv_score)``.
        ``t_star`` is ``None`` when no threshold satisfies ``min_precision``.
    """
    started = time.time()
    grid.fit(X_train, y_train)
    elapsed = time.time() - started

    winner = grid.best_estimator_
    oof_proba = cross_val_predict(
        winner, X_train, y_train,
        cv=grid.cv, method="predict_proba", n_jobs=-1
    )
    positive_oof = oof_proba[:, 1]

    # The achieved F1 is not part of the return value, only the threshold is.
    chosen_t, _ = find_best_threshold_from_proba(
        y_true=y_train, p=positive_oof, metric=metric, min_precision=min_precision
    )
    return winner, chosen_t, elapsed, grid.best_params_, grid.best_score_


In [10]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

def fit_with_val_threshold_fast(grid, X_train, y_train,
                                metric="f1", min_precision=0.20,
                                val_size=0.2, random_state=42):
    """Fit a grid search, then choose a threshold on an internal hold-out.

    Steps:
      1. Fit ``grid`` on ``(X_train, y_train)`` using its own CV and scoring
         (only this step is timed).
      2. Split the training data into sub-train / validation (stratified,
         ``val_size`` fraction), fit a clone of the best estimator on the
         sub-train part and score the validation part.
      3. Select t* on the validation scores with ``find_best_threshold_from_proba``
         so that ``metric`` and ``min_precision`` are honoured exactly as in
         ``fit_with_val_threshold``; the test set is never touched.

    The estimator returned is ``grid.best_estimator_``. That attribute only
    exists when the search refits, in which case GridSearchCV has already fitted
    it on the whole training set, so no additional fit is performed here.

    Args:
        grid: an unfitted search object exposing ``fit``, ``best_estimator_``,
            ``best_params_`` and ``best_score_``.
        X_train, y_train: training data.
        metric: threshold objective; only ``"f1"`` is implemented.
        min_precision: minimum precision a candidate threshold must reach
            (``None`` disables the constraint).
        val_size: fraction of the training data held out for threshold selection.
        random_state: seed for the internal split.

    Returns:
        ``(best_estimator, t_star, fit_seconds, best_params, best_cv_score)``.
        ``t_star`` is ``None`` when no threshold satisfies ``min_precision``.

    Raises:
        ValueError: propagated from ``find_best_threshold_from_proba`` for an
            unsupported ``metric``.
    """
    t0 = time.time()
    grid.fit(X_train, y_train)
    dur = time.time() - t0

    best_est = grid.best_estimator_
    best_params = grid.best_params_
    best_cv = grid.best_score_

    # Internal hold-out used only for threshold selection (cheaper than full OOF).
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train, test_size=val_size, stratify=y_train, random_state=random_state
    )

    # Fit a fresh copy on the sub-train part so validation rows stay unseen.
    est = clone(best_est)
    est.fit(X_tr, y_tr)

    # Same default threshold grid and tie-breaking as the OOF helper.
    p_val = predict_proba_safe(est, X_val)
    best_t, _ = find_best_threshold_from_proba(
        y_true=y_val, p=p_val, metric=metric, min_precision=min_precision
    )

    return best_est, best_t, dur, best_params, best_cv


In [5]:
# === Sparse preprocessing + SVD component caps derived from X_train ONLY ===
# The encoder's default output is already sparse, so the `sparse=` keyword (renamed to
# `sparse_output` and later removed in scikit-learn) is simply not passed.
ohe_sparse = OneHotEncoder(handle_unknown="ignore", min_frequency=0.01)
# with_mean=False: scale without centering, which keeps sparse input sparse.
scaler_num_sparse_ok = StandardScaler(with_mean=False)

pre_sparse = ColumnTransformer(
    transformers=[
        ("num", scaler_num_sparse_ok, numerical_cols),
        ("cat", ohe_sparse,            cat_cols)
    ],
    remainder="drop",
    sparse_threshold=1.0
)

# Count the one-hot output columns using the training split only.
# get_feature_names_out() reflects the real encoded width, including the grouping of
# infrequent categories requested by min_frequency; summing len(categories_) would
# count every raw category and could overstate the width used for the SVD caps.
ohe_probe = OneHotEncoder(handle_unknown="ignore", min_frequency=0.01)
if len(cat_cols) > 0:
    ohe_probe.fit(X_train[cat_cols])
    n_cat_feats = int(len(ohe_probe.get_feature_names_out()))
else:
    n_cat_feats = 0

n_num_feats = len(numerical_cols)
n_feat = n_num_feats + n_cat_feats
# Keep only caps strictly below the feature count; otherwise fall back to one safe value.
svd_caps = [c for c in [30, 45, 60] if c < n_feat] or [max(2, min(30, n_feat-1))]

print("n_feat estimado:", n_feat, "| svd_caps:", svd_caps)


n_feat estimado: 77 | svd_caps: [30, 45, 60]


In [15]:
# === FIX: función global para densificar (evita PicklingError con joblib) ===
from scipy import sparse

def to_dense(X):
    """Return ``X`` as a dense array if it is a SciPy sparse matrix, else unchanged.

    Kept as a named top-level function (not a lambda) so it can be referenced
    from a pipeline step.
    """
    if sparse.issparse(X):
        return X.toarray()
    return X


In [17]:
# === Constructor de pipelines KNN (SVD + BallTree) por variante — FIX ADASYN ===
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTENC, ADASYN
from scipy import sparse

# Define 'to_dense' at module level (picklable) if it does not exist yet.
# The bare name lookup raises NameError only when no earlier cell defined it;
# in that case the same sparse-to-dense helper is created here.
try:
    to_dense
except NameError:
    def to_dense(X):
        return X.toarray() if sparse.issparse(X) else X

# Positional indices of the categorical columns in the raw X (for SMOTENC)
cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]

def make_knn_pipeline(balance: str):
    """Build the KNN pipeline for one class-balancing variant.

    Args:
        balance: one of ``"BASE"``, ``"RUS"``, ``"SMOTENC"``, ``"ADASYN"``.

    Returns:
        ``(pipeline, param_grid)`` ready to be handed to ``GridSearchCV``.

    Raises:
        ValueError: for any other ``balance`` value.

    Relies on the notebook globals ``pre_sparse``, ``svd_caps``, ``RANDOM_STATE``,
    ``memory``, ``cat_idx`` and ``to_dense``.
    """
    def _svd_knn_tail():
        # Every variant ends with the same dimensionality reduction + classifier.
        return [
            ("svd", TruncatedSVD(random_state=RANDOM_STATE)),
            ("knn", KNeighborsClassifier(algorithm="ball_tree", metric="minkowski", p=2)),
        ]

    if balance == "BASE":
        # No resampling: preprocessing only.
        head = [("pre", pre_sparse)]
    elif balance == "RUS":
        # Random under-sampling applied to the preprocessed features.
        head = [
            ("pre", pre_sparse),
            ("rus", RandomUnderSampler(random_state=RANDOM_STATE, sampling_strategy=0.8)),
        ]
    elif balance == "SMOTENC":
        # SMOTENC runs on the raw columns (before encoding), using the categorical
        # column positions. Faster configuration: parallel and fewer neighbours.
        oversampler = SMOTENC(
            categorical_features=cat_idx,
            random_state=RANDOM_STATE,
            n_jobs=-1,       # parallel
            k_neighbors=3    # speeds up the synthesis
        )
        head = [("smotenc", oversampler), ("pre", pre_sparse)]
    elif balance == "ADASYN":
        # ADASYN does not accept sparse input -> densify after preprocessing, before ADASYN.
        densify = FunctionTransformer(
            to_dense, accept_sparse=True, check_inverse=False, validate=False
        )
        head = [
            ("pre", pre_sparse),
            ("to_dense", densify),
            ("adasyn", ADASYN(random_state=RANDOM_STATE)),
        ]
    else:
        raise ValueError("balance debe ser 'BASE', 'RUS', 'SMOTENC' o 'ADASYN'.")

    steps = head + _svd_knn_tail()

    # Step caching is disabled ONLY for the ADASYN variant (avoids pickling/cache problems).
    cache = memory if balance != "ADASYN" else None
    pipe = ImbPipeline(steps=steps, memory=cache)

    # Default grid (callers may pass a more compact one for SMOTENC/ADASYN to save time).
    param_grid = {
        "svd__n_components": svd_caps,   # e.g. [30, 45, 60] filtered by n_feat
        "knn__n_neighbors": [9, 15],
        "knn__leaf_size":  [30]
    }
    return pipe, param_grid




In [7]:
# === KNN-Base (no balancing) + SVD + BallTree — CV=5, t* from OOF scores (no leakage) ===
# Build the unbalanced pipeline together with its default parameter grid.
pipe_base, param_grid_base = make_knn_pipeline("BASE")
grid_base = GridSearchCV(
    estimator=pipe_base,
    param_grid=param_grid_base,
    scoring=SCORING,
    cv=cv_final,
    n_jobs=-1,
    refit=True,
    verbose=2,
    pre_dispatch="n_jobs"
)

# Fit the search and pick t* on out-of-fold training scores; the test set is not used here.
# NOTE(review): min_precision=0.20 is below the ~0.30 positive rate printed when the data
# was loaded, so it cannot exclude the "predict everything positive" threshold.
best_base, t_star_base, dur_base, bestp_base, cv_ap_base = fit_with_val_threshold(
    grid_base, X_train, y_train, metric="f1", min_precision=0.20
)

print("\n=== KNN-Base (sin balanceo) + SVD + BallTree (CV=5) ===")
print("Best params:", bestp_base)
print(f"{SCORING} CV:", round(cv_ap_base, 4))
print(f"Duración (s): {dur_base:.1f}")

# Test-set report at the default 0.5 threshold, then at t* when one was found.
report_results(best_base, X_test, y_test,
               title="KNN-Base — test (thr=0.5)", threshold=0.5)
if t_star_base is not None:
    report_results(best_base, X_test, y_test,
                   title="KNN-Base — test (umbral óptimo validación)",
                   threshold=t_star_base)


Fitting 5 folds for each of 6 candidates, totalling 30 fits

=== KNN-Base (sin balanceo) + SVD + BallTree (CV=5) ===
Best params: {'knn__leaf_size': 30, 'knn__n_neighbors': 9, 'svd__n_components': 30}
average_precision CV: 0.2989
Duración (s): 987.0

=== KNN-Base — test (thr=0.5) ===

Reporte de clasificación (test):
              precision    recall  f1-score   support

           0     0.7000    0.8991    0.7871     17524
           1     0.2904    0.0968    0.1453      7476

    accuracy                         0.6592     25000
   macro avg     0.4952    0.4979    0.4662     25000
weighted avg     0.5775    0.6592    0.5952     25000

Matriz de confusión (test):
[[15755  1769]
 [ 6752   724]]
ROC-AUC: 0.5025 | PR-AUC: 0.2988

=== KNN-Base — test (umbral óptimo validación) ===

Reporte de clasificación (test):
              precision    recall  f1-score   support

           0     0.7219    0.0407    0.0771     17524
           1     0.2999    0.9632    0.4574      7476

    accuracy

In [8]:
# === KNN + RUS + SVD + BallTree — FINAL (CV=5, no leakage) ===
# Build the random-under-sampling pipeline together with its default parameter grid.
pipe_rus, param_grid_rus = make_knn_pipeline("RUS")
grid_rus = GridSearchCV(
    estimator=pipe_rus,
    param_grid=param_grid_rus,
    scoring=SCORING,
    cv=cv_final,
    n_jobs=-1,
    refit=True,
    verbose=2,
    pre_dispatch="n_jobs"
)

# Fit the search and pick t* on out-of-fold training scores; the test set is not used here.
# NOTE(review): min_precision=0.20 is below the ~0.30 positive rate printed when the data
# was loaded, so it cannot exclude the "predict everything positive" threshold.
best_rus, t_star_rus, dur_rus, bestp_rus, cv_ap_rus = fit_with_val_threshold(
    grid_rus, X_train, y_train, metric="f1", min_precision=0.20
)

print("\n=== KNN + RUS + SVD + BallTree (FINAL, CV=5) ===")
print("Best params:", bestp_rus)
print(f"{SCORING} CV:", round(cv_ap_rus, 4))
print(f"Duración (s): {dur_rus:.1f}")

# Test-set report at the default 0.5 threshold, then at t* when one was found.
report_results(best_rus, X_test, y_test,
               title="KNN + RUS + SVD + BallTree — test (thr=0.5)", threshold=0.5)
if t_star_rus is not None:
    report_results(best_rus, X_test, y_test,
                   title="KNN + RUS + SVD + BallTree — test (umbral óptimo validación)",
                   threshold=t_star_rus)


Fitting 5 folds for each of 6 candidates, totalling 30 fits

=== KNN + RUS + SVD + BallTree (FINAL, CV=5) ===
Best params: {'knn__leaf_size': 30, 'knn__n_neighbors': 15, 'svd__n_components': 30}
average_precision CV: 0.2998
Duración (s): 598.6

=== KNN + RUS + SVD + BallTree — test (thr=0.5) ===

Reporte de clasificación (test):
              precision    recall  f1-score   support

           0     0.6976    0.6661    0.6815     17524
           1     0.2923    0.3233    0.3070      7476

    accuracy                         0.5636     25000
   macro avg     0.4950    0.4947    0.4943     25000
weighted avg     0.5764    0.5636    0.5695     25000

Matriz de confusión (test):
[[11673  5851]
 [ 5059  2417]]
ROC-AUC: 0.4923 | PR-AUC: 0.2950

=== KNN + RUS + SVD + BallTree — test (umbral óptimo validación) ===

Reporte de clasificación (test):
              precision    recall  f1-score   support

           0     0.7500    0.0002    0.0003     17524
           1     0.2990    0.9999    

In [12]:
# === KNN + SMOTENC + SVD + BallTree — CV=5, t* from an internal hold-out (no leakage) ===
# Only the pipeline is taken from the constructor; its default grid is discarded.
pipe_smotenc, _ = make_knn_pipeline("SMOTENC")

# Compact grid (fewer combinations; CV=5 and PR-AUC are kept)
param_grid_smotenc = {
    "svd__n_components": [30],   # for two points use: [30,45]
    "knn__n_neighbors":  [15],
    "knn__leaf_size":    [40]    # a slightly larger leaf is meant to speed up queries
}

grid_smotenc = GridSearchCV(
    estimator=pipe_smotenc,
    param_grid=param_grid_smotenc,
    scoring=SCORING,   # "average_precision"
    cv=cv_final,       # CV=5 required
    n_jobs=-1,
    refit=True,
    verbose=2,
    pre_dispatch="2*n_jobs"   # chosen by the author as better scheduling than "n_jobs"
)

# Use the fast helper (internal hold-out) instead of the heavier OOF procedure.
# NOTE(review): min_precision=0.20 is below the ~0.30 positive rate printed when the data
# was loaded, so it cannot exclude the "predict everything positive" threshold.
best_smotenc, t_star_smotenc, dur_smotenc, bestp_smotenc, cv_ap_smotenc = fit_with_val_threshold_fast(
    grid_smotenc, X_train, y_train, metric="f1", min_precision=0.20, val_size=0.2, random_state=RANDOM_STATE
)

print("\n=== KNN + SMOTENC + SVD + BallTree (CV=5) ===")
print("Best params:", bestp_smotenc)
print(f"{SCORING} CV:", round(cv_ap_smotenc, 4))
print(f"Duración (s): {dur_smotenc:.1f}")

# Test-set report at the default 0.5 threshold, then at t* when one was found.
report_results(best_smotenc, X_test, y_test,
               title="KNN + SMOTENC + SVD + BallTree — test (thr=0.5)", threshold=0.5)
if t_star_smotenc is not None:
    report_results(best_smotenc, X_test, y_test,
                   title="KNN + SMOTENC + SVD + BallTree — test (umbral óptimo validación)",
                   threshold=t_star_smotenc)



Fitting 5 folds for each of 1 candidates, totalling 5 fits

=== KNN + SMOTENC + SVD + BallTree (CV=5) ===
Best params: {'knn__leaf_size': 40, 'knn__n_neighbors': 15, 'svd__n_components': 30}
average_precision CV: 0.2978
Duración (s): 500.9

=== KNN + SMOTENC + SVD + BallTree — test (thr=0.5) ===

Reporte de clasificación (test):
              precision    recall  f1-score   support

           0     0.6984    0.2598    0.3787     17524
           1     0.2981    0.7370    0.4245      7476

    accuracy                         0.4025     25000
   macro avg     0.4983    0.4984    0.4016     25000
weighted avg     0.5787    0.4025    0.3924     25000

Matriz de confusión (test):
[[ 4553 12971]
 [ 1966  5510]]
ROC-AUC: 0.4990 | PR-AUC: 0.2995

=== KNN + SMOTENC + SVD + BallTree — test (umbral óptimo validación) ===

Reporte de clasificación (test):
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000     17524
           1     0.2990    1.0000

In [18]:
# === KNN + ADASYN + SVD + BallTree — CV=5, t* from an internal hold-out (fast, no leakage) ===
# Prerequisites already defined earlier in the notebook:
# - make_knn_pipeline(balance) with the ADASYN fix (global to_dense and memory=None in that branch)
# - cv_final, SCORING, RANDOM_STATE
# - utils: predict_proba_safe, fit_with_val_threshold_fast
# - X_train, y_train, X_test, y_test

# 1) ADASYN pipeline from the constructor (its default grid is discarded)
pipe_adasyn, _ = make_knn_pipeline("ADASYN")

# 2) Compact grid (keeps CV=5 + PR-AUC while cutting the run time)
param_grid_adasyn = {
    "svd__n_components": [30],   # for two points add 45: [30, 45]
    "knn__n_neighbors":  [15],   # 9 is faster if needed
    "knn__leaf_size":    [40]    # larger leaves are meant to give faster queries
}

grid_adasyn = GridSearchCV(
    estimator=pipe_adasyn,
    param_grid=param_grid_adasyn,
    scoring=SCORING,   # "average_precision"
    cv=cv_final,       # 5 stratified folds
    n_jobs=-1,
    refit=True,
    verbose=2,
    pre_dispatch="2*n_jobs"
)

# 3) Fit and choose t* on an internal hold-out (validation inside train; test is not used).
#    min_precision is raised here to avoid the collapse to "everything is 1".
# NOTE(review): in the recorded run no t* report was printed for this variant, which
# suggests no threshold reached precision 0.35 and t_star_adasyn came back as None.
best_adasyn, t_star_adasyn, dur_adasyn, bestp_adasyn, cv_ap_adasyn = fit_with_val_threshold_fast(
    grid_adasyn, X_train, y_train,
    metric="f1", min_precision=0.35,   # raise to 0.40 if it still collapses to "everything is 1"
    val_size=0.20, random_state=RANDOM_STATE
)

print("\n=== KNN + ADASYN + SVD + BallTree (CV=5) ===")
print("Best params:", bestp_adasyn)
print(f"{SCORING} CV:", round(cv_ap_adasyn, 4))
print(f"Duración (s): {dur_adasyn:.1f}")

# 4) Score the test set only ONCE (avoids paying the KNN prediction cost twice);
#    the reports below reuse these scores instead of calling report_results.
p_test_adasyn = predict_proba_safe(best_adasyn, X_test)

# --- Report @0.5
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

yhat_05 = (p_test_adasyn >= 0.5).astype(int)
print("\n=== KNN + ADASYN + SVD + BallTree — test (thr=0.5) ===\n")
print("Reporte de clasificación (test):")
print(classification_report(y_test, yhat_05, digits=4))
print("Matriz de confusión (test):")
print(confusion_matrix(y_test, yhat_05))
# ROC-AUC is best-effort: any failure is reported as NaN instead of aborting the cell.
try:
    auc = roc_auc_score(y_test, p_test_adasyn)
except Exception:
    auc = float("nan")
ap = average_precision_score(y_test, p_test_adasyn)
print(f"ROC-AUC: {auc:.4f} | PR-AUC: {ap:.4f}")

# --- Report @t* (only when a threshold was found)
if t_star_adasyn is not None:
    yhat_t = (p_test_adasyn >= t_star_adasyn).astype(int)
    print("\n=== KNN + ADASYN + SVD + BallTree — test (umbral óptimo validación) ===\n")
    print("Reporte de clasificación (test):")
    print(classification_report(y_test, yhat_t, digits=4))
    print("Matriz de confusión (test):")
    print(confusion_matrix(y_test, yhat_t))
    # AUC values are threshold-independent, so the ones computed above are reprinted.
    print(f"ROC-AUC: {auc:.4f} | PR-AUC: {ap:.4f}")




Fitting 5 folds for each of 1 candidates, totalling 5 fits

=== KNN + ADASYN + SVD + BallTree (CV=5) ===
Best params: {'knn__leaf_size': 40, 'knn__n_neighbors': 15, 'svd__n_components': 30}
average_precision CV: 0.3001
Duración (s): 208.3

=== KNN + ADASYN + SVD + BallTree — test (thr=0.5) ===

Reporte de clasificación (test):
              precision    recall  f1-score   support

           0     0.6992    0.1743    0.2790     17524
           1     0.2987    0.8242    0.4385      7476

    accuracy                         0.3686     25000
   macro avg     0.4989    0.4993    0.3587     25000
weighted avg     0.5794    0.3686    0.3267     25000

Matriz de confusión (test):
[[ 3054 14470]
 [ 1314  6162]]
ROC-AUC: 0.4976 | PR-AUC: 0.2970


In [20]:
# === COMPARATIVA CONSOLIDADA KNN (reutiliza probas si existen) ===
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)

def _get_p_test(model, X_test, p_cached_name=None):
    """Return test-set scores, reusing a previously computed global when available.

    If ``p_cached_name`` is a non-empty name that exists in this module's
    globals, that object is returned as-is; otherwise the scores are computed
    with ``predict_proba_safe(model, X_test)``.
    """
    namespace = globals()
    if not p_cached_name or p_cached_name not in namespace:
        return predict_proba_safe(model, X_test)
    return namespace[p_cached_name]

def add_variant(rows, name, p_test, thr_opt, cv_ap, t_fit_s):
    """Append the comparison rows for one model variant to ``rows`` (in place).

    A row at the tuned threshold ``thr_opt`` is added first (skipped when
    ``thr_opt`` is ``None``), followed by a row at the fixed 0.5 threshold.
    All metrics are computed against the notebook-level ``y_test``.

    Args:
        rows: list that receives the row dictionaries.
        name: display name of the variant.
        p_test: positive-class scores on the test set.
        thr_opt: tuned threshold, or ``None``.
        cv_ap: cross-validated score reported for the variant.
        t_fit_s: fit duration in seconds.
    """
    # Threshold-free metrics are shared by both rows.
    roc_auc = roc_auc_score(y_test, p_test)
    pr_auc = average_precision_score(y_test, p_test)

    def _row(model_label, cutoff, cutoff_shown):
        predicted = (p_test >= cutoff).astype(int)
        return {
            "Modelo": model_label,
            "CV_AP": round(cv_ap, 4),
            "Test_PR-AUC": round(pr_auc, 4),
            "Test_ROC-AUC": round(roc_auc, 4),
            "Accuracy": round(accuracy_score(y_test, predicted), 4),
            "Precision": round(precision_score(y_test, predicted, zero_division=0), 4),
            "Recall": round(recall_score(y_test, predicted), 4),
            "F1": round(f1_score(y_test, predicted), 4),
            "Umbral": cutoff_shown,
            "t_fit_s": round(t_fit_s, 1)
        }

    # @t* (only when a tuned threshold exists)
    if thr_opt is not None:
        tuned = float(thr_opt)
        rows.append(_row(f"{name} (thr*F1)", tuned, round(tuned, 4)))

    # @0.5
    rows.append(_row(f"{name} (thr=0.5)", 0.5, 0.5))

rows = []
# Reuse cached p_test_* globals when they exist; otherwise compute the scores on the fly.
# NOTE(review): no visible cell defines p_test_smotenc, so that lookup presumably falls
# back to computing the scores — confirm.
p_base    = _get_p_test(best_base,    X_test)
p_rus     = _get_p_test(best_rus,     X_test)
p_smotenc = _get_p_test(best_smotenc, X_test, p_cached_name="p_test_smotenc")
p_adasyn  = _get_p_test(best_adasyn,  X_test, p_cached_name="p_test_adasyn")

# One call per variant; each appends a 0.5-threshold row, plus a t* row when t* is not None.
add_variant(rows, "KNN-Base+SVD",    p_base,    t_star_base,    cv_ap_base,    dur_base)
add_variant(rows, "KNN-RUS+SVD",     p_rus,     t_star_rus,     cv_ap_rus,     dur_rus)
add_variant(rows, "KNN-SMOTENC+SVD", p_smotenc, t_star_smotenc, cv_ap_smotenc, dur_smotenc)
add_variant(rows, "KNN-ADASYN+SVD",  p_adasyn,  t_star_adasyn,  cv_ap_adasyn,  dur_adasyn)

# Rank by test PR-AUC, breaking ties by F1 (both descending).
df_knn_compare = (
    pd.DataFrame(rows)
      .sort_values(["Test_PR-AUC", "F1"], ascending=False)
      .reset_index(drop=True)
)

# Test-set class counts, repeated on every row for reference.
df_knn_compare["Support_1"] = int((y_test == 1).sum())
df_knn_compare["Support_0"] = int((y_test == 0).sum())

# Show the table
df_knn_compare

# (Optional) Export for the report:
# df_knn_compare.to_csv("knn_comparativa.csv", index=False)


Unnamed: 0,Modelo,CV_AP,Test_PR-AUC,Test_ROC-AUC,Accuracy,Precision,Recall,F1,Umbral,t_fit_s,Support_1,Support_0
0,KNN-SMOTENC+SVD (thr*F1),0.2978,0.2995,0.499,0.299,0.299,1.0,0.4604,0.05,500.9,7476,17524
1,KNN-SMOTENC+SVD (thr=0.5),0.2978,0.2995,0.499,0.4025,0.2981,0.737,0.4245,0.5,500.9,7476,17524
2,KNN-Base+SVD (thr*F1),0.2989,0.2988,0.5025,0.3166,0.2999,0.9632,0.4574,0.05,987.0,7476,17524
3,KNN-Base+SVD (thr=0.5),0.2989,0.2988,0.5025,0.6592,0.2904,0.0968,0.1453,0.5,987.0,7476,17524
4,KNN-ADASYN+SVD (thr=0.5),0.3001,0.297,0.4976,0.3686,0.2987,0.8242,0.4385,0.5,208.3,7476,17524
5,KNN-RUS+SVD (thr*F1),0.2998,0.295,0.4923,0.2991,0.299,0.9999,0.4604,0.05,598.6,7476,17524
6,KNN-RUS+SVD (thr=0.5),0.2998,0.295,0.4923,0.5636,0.2923,0.3233,0.307,0.5,598.6,7476,17524
