
# Exoplanet ML — FAST (carga de modelos pre-entrenados + evaluación/umbral)

**Fecha:** 2025-10-04 20:10:34

Versión rápida para **evitar reentrenar**. Permite cargar tus `.pkl` ya entrenados y continuar con:
- **Evaluación OOF** (si tenés `X, y`; ojo: `cross_val_predict` vuelve a ajustar una copia de cada modelo en cada fold),
- **Curva PR** + **selección de umbral** (para bajar falsos positivos),
- **Calibración** de probabilidades y export de artefactos.

> Si preferís reentrenar, podés saltarte la celda de carga de pre-entrenados y usar los modelos definidos.


In [None]:

# ==== Setup & reproducibilidad ====
import os, warnings, json, joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import (roc_auc_score, average_precision_score, precision_recall_curve,
                             confusion_matrix, precision_score, recall_score, f1_score, roc_curve)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

# Artifact locations: everything this notebook persists lives under models/.
MODELS_DIR = Path("models")
REGISTRY_PATH = MODELS_DIR.joinpath("registry.json")
MODELS_DIR.mkdir(exist_ok=True)

# Seed NumPy's global RNG and silence every warning for the rest of the session.
np.random.seed(42)
warnings.filterwarnings("ignore")


## Carga de datos

In [None]:

# ==== Data loading ====
# OPTION A (CSV): uncomment and fill in
# DATA_PATH = "data/dataset.csv"
# TARGET_COL = "label"
# df = pd.read_csv(DATA_PATH)
# y = df[TARGET_COL].astype(int).values
# feature_cols = [c for c in df.columns if c != TARGET_COL]
# X = df[feature_cols].values

# OPTION B (X, y already built in memory): nothing to do in this cell.

# OPTION C (column names read from features.json), used only when the file exists:
feature_cols = None
features_file = Path("/mnt/data/features.json")
if features_file.exists():
    try:
        with features_file.open("r") as fh:
            feature_cols = json.load(fh)
            print("features.json detectado con", len(feature_cols), "features.")
    except Exception as err:
        # Best effort: a broken features.json must not stop the notebook.
        print("No se pudo leer features.json:", err)

# Sanity check: X and y must already be bound in the session.
try:
    X
    y
    x_shape = np.shape(X)
    n_positive = int(np.sum(y))
    print("X,y ya existen. Shape:", x_shape, "y positivos:", n_positive)
    if feature_cols is not None and isinstance(X, np.ndarray):
        # Only report a match; a mismatch is left silent.
        if X.shape[1] == len(feature_cols):
            print("Las columnas de features.json coinciden con X.shape[1].")
except NameError:
    print(">> Definí X,y o cargalos desde CSV. Ver instrucciones arriba.")


## Protocolo de evaluación y utilidades

In [None]:

# Shared CV protocol: 5 stratified, shuffled folds with a fixed seed. The same
# splitter object is reused by the stacking ensemble, the OOF evaluation and the
# final calibration further down.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def pr_metrics_from_probs(y_true, y_prob, threshold=0.5):
    """Summarise ranking and thresholded metrics for binary scores.

    Args:
        y_true: Array-like of ground-truth labels; the confusion-matrix counts
            assume the classes are encoded as 0 and 1.
        y_prob: Array-like of positive-class scores, aligned with ``y_true``.
            Plain lists are accepted; they are converted to NumPy arrays.
        threshold: Scores ``>= threshold`` are predicted as class 1.

    Returns:
        dict with the threshold-free scores ``"PR_AUC"`` and ``"ROC_AUC"``, the
        thresholded ``"precision@thr"``, ``"recall@thr"`` and ``"F1@thr"``, the
        integer counts ``"TP"``, ``"FP"``, ``"FN"``, ``"TN"`` and the
        ``"threshold"`` that was applied (as a float).

    Note:
        NOTE(review): ``roc_auc_score`` is undefined when ``y_true`` holds a
        single class; how that is reported depends on the installed
        scikit-learn version - confirm before relying on it.
    """
    # Coerce once so list inputs support the element-wise comparison below.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)

    ap = average_precision_score(y_true, y_prob)
    roc = roc_auc_score(y_true, y_prob)

    y_hat = (y_prob >= threshold).astype(int)
    prec = precision_score(y_true, y_hat, zero_division=0)
    rec = recall_score(y_true, y_hat, zero_division=0)
    f1 = f1_score(y_true, y_hat, zero_division=0)

    # labels=[0, 1] pins the 2x2 layout, so the four-way unpack never depends
    # on which classes happen to occur in y_true / y_hat.
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()

    return {"PR_AUC": ap, "ROC_AUC": roc, "precision@thr": prec, "recall@thr": rec, "F1@thr": f1,
            "TP": int(tp), "FP": int(fp), "FN": int(fn), "TN": int(tn), "threshold": float(threshold)}

def pick_threshold_for_min_precision(y_true, y_prob, min_precision=0.90):
    """Return the lowest score threshold whose precision reaches ``min_precision``.

    Args:
        y_true: Array-like of binary ground-truth labels.
        y_prob: Array-like of positive-class scores, aligned with ``y_true``.
        min_precision: Minimum precision the operating point must achieve.

    Returns:
        float: The smallest threshold on the precision-recall curve whose
        precision is at least ``min_precision``, or ``0.5`` when no threshold
        reaches that precision.
    """
    prec, _rec, thr = precision_recall_curve(y_true, y_prob)
    # precision_recall_curve returns one more precision value than thresholds:
    # the final (precision=1, recall=0) point has no threshold. Drop it so the
    # boolean mask has the same length as `thr`; otherwise indexing raises.
    mask = prec[:-1] >= min_precision
    if not mask.any():
        return 0.5
    return float(thr[mask].min())

def plot_pr_curves(models_probs, y_true):
    """Draw one precision-recall curve per model on a single figure.

    Args:
        models_probs: Mapping from a display name to that model's
            positive-class scores.
        y_true: Ground-truth labels shared by every entry of ``models_probs``.

    Each legend entry carries the model's average precision. The figure is
    shown immediately; nothing is returned.
    """
    plt.figure(figsize=(6, 5))
    for model_label, scores in models_probs.items():
        precision, recall, _thresholds = precision_recall_curve(y_true, scores)
        avg_precision = average_precision_score(y_true, scores)
        plt.plot(recall, precision, label=f"{model_label} (AP={avg_precision:.3f})")

    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Curva Precisión–Recall (OOF)")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


## Definición de modelos (opcional si cargás pre-entrenados)

In [None]:

# Base learners: three 600-estimator ensembles sharing the same seed.
rf = RandomForestClassifier(
    n_estimators=600,
    criterion="entropy",
    class_weight=None,
    n_jobs=-1,
    random_state=42,
)
et = ExtraTreesClassifier(
    n_estimators=600,
    criterion="entropy",
    n_jobs=-1,
    random_state=42,
)
ada = AdaBoostClassifier(
    n_estimators=600,
    learning_rate=0.1,
    random_state=42,
)

# Stacked ensemble: a logistic-regression meta-learner over the three base
# learners, trained with the notebook-wide `cv` splitter and without passing
# the raw features through.
meta = LogisticRegression(max_iter=2000)
stack = StackingClassifier(
    estimators=[("rf", rf), ("et", et), ("ada", ada)],
    final_estimator=meta,
    cv=cv,
    passthrough=False,
    n_jobs=-1,
)

# Display name -> estimator, in the order they are evaluated later.
models = {
    "RandomForest": rf,
    "ExtraTrees": et,
    "AdaBoost": ada,
    "Stacking": stack,
}
print("Modelos definidos (para reentrenar si querés):", list(models))


## (Rápido) Cargar modelos pre-entrenados desde `.pkl`

In [None]:

# Looks in /mnt/data and in models/ for several filename variants, including
# the uploaded ones.
import joblib
from pathlib import Path

# Display name -> candidate files, tried in order; the first one that loads wins.
CANDIDATE_PATHS = {
    "RandomForest": [
        Path("/mnt/data/random_forest.pkl"),
        Path("/mnt/data/Random Forest_model.pkl"),
        Path("models/random_forest.pkl"),
    ],
    "ExtraTrees": [
        Path("/mnt/data/extra_trees.pkl"),
        Path("/mnt/data/Extra Trees_model.pkl"),
        Path("models/extra_trees.pkl"),
    ],
    "AdaBoost": [
        Path("/mnt/data/adaboost.pkl"),
        Path("/mnt/data/AdaBoost_model.pkl"),
        Path("models/adaboost.pkl"),
    ],
    "Stacking": [
        Path("/mnt/data/stacking.pkl"),
        Path("models/stacking.pkl"),
    ],
}

# NOTE(review): joblib.load unpickles arbitrary objects; only point these
# paths at files you trust.
loaded = {}
for model_name, candidates in CANDIDATE_PATHS.items():
    for candidate in candidates:
        if not candidate.exists():
            continue
        try:
            loaded[model_name] = joblib.load(candidate)
            print(f"✅ Cargado modelo preentrenado: {model_name} desde {candidate}")
        except Exception as exc:
            # A file that fails to load is reported and the next candidate is tried.
            print(f"⚠️ No se pudo cargar {model_name} desde {candidate}: {exc}")
        else:
            break
    else:
        # Reached only when no candidate was loaded successfully.
        print(f"⚠️ No se encontró archivo válido para {model_name} (se usará el modelo definido si es necesario).")

if not loaded:
    print("\nNo se encontraron modelos preentrenados, se usarán los modelos definidos (y deberás entrenarlos).")
else:
    # Replaces the whole `models` dict: only the loaded entries remain.
    models = loaded
    print("\nUsando modelos preentrenados para la evaluación.")


## Evaluación OOF (mismo CV)

In [None]:

# Guard: this cell needs the dataset bound as X and y.
try:
    X; y
except NameError:
    raise RuntimeError("Definí X,y antes de correr esta celda. Ver 'Carga de datos'.")

# NOTE(review): cross_val_predict presumably fits a fresh clone of each pipeline
# on every training fold, so weights loaded from .pkl files would not be reused
# here - confirm against the scikit-learn documentation.
oof_probs = {}   # model name -> out-of-fold positive-class probabilities
scores_tbl = []  # one metrics row per model

for model_name, estimator in models.items():
    pipe = Pipeline([("clf", estimator)])  # add a scaler here if needed
    fold_proba = cross_val_predict(pipe, X, y, cv=cv, method="predict_proba")
    probs = fold_proba[:, 1]
    oof_probs[model_name] = probs

    metrics = pr_metrics_from_probs(y, probs, threshold=0.5)
    row = {"model": model_name}
    row.update(metrics)
    scores_tbl.append(row)

    summary = " | ".join([
        f"{model_name}  -> PR-AUC={metrics['PR_AUC']:.4f}",
        f"ROC-AUC={metrics['ROC_AUC']:.4f}",
        f"Prec@0.5={metrics['precision@thr']:.3f}",
        f"Recall@0.5={metrics['recall@thr']:.3f}",
    ])
    print(summary)

# Rank the models by PR-AUC, best first, with a clean 0..n-1 index.
scores_df = pd.DataFrame(scores_tbl)
scores_df = scores_df.sort_values("PR_AUC", ascending=False)
scores_df = scores_df.reset_index(drop=True)
scores_df


### Curvas PR (comparación OOF)

In [None]:

# Overlay the precision-recall curve of every evaluated model, using the
# out-of-fold probabilities collected in oof_probs.
plot_pr_curves(oof_probs, y_true=y)


## Selección de modelo + umbral

In [None]:

# Precision floor that the chosen operating point has to satisfy.
MIN_PRECISION = 0.90

# scores_df is ordered by PR_AUC (descending) with a reset index, so label 0
# is the top-ranked model.
best_name = scores_df.at[0, "model"]
print("Mejor por PR-AUC (OOF):", best_name)

best_probs = oof_probs[best_name]
thr_best = pick_threshold_for_min_precision(y, best_probs, min_precision=MIN_PRECISION)
print(f"Umbral elegido para {best_name} con precisión ≥ {MIN_PRECISION:.2f}: {thr_best:.3f}")

# Metrics of the selected model at the selected threshold, shown as a one-row table.
op_metrics = pr_metrics_from_probs(y, best_probs, threshold=thr_best)
pd.DataFrame([op_metrics])


## Calibración y entrenamiento final (full data)

In [None]:

from datetime import datetime

# Wrap the selected estimator in a pipeline (add a scaler if your features
# need one), then calibrate it with isotonic regression on the full data,
# using the shared `cv` splitter.
base = models[best_name]
pipe = Pipeline(steps=[("clf", base)])

calibrated = CalibratedClassifierCV(pipe, method="isotonic", cv=cv)
calibrated.fit(X, y)

final_model_path = Path("models").joinpath(f"{best_name.lower()}_calibrated.pkl")
joblib.dump(calibrated, final_model_path)
print("✅ Modelo calibrado guardado en:", final_model_path)

# Column names come from features.json when available; otherwise fall back to
# positional indices, or to None when X exposes no shape.
if feature_cols is not None:
    exported_cols = feature_cols
elif hasattr(X, 'shape'):
    exported_cols = list(range(X.shape[1]))
else:
    exported_cols = None

# NOTE(review): thr_best was selected on the uncalibrated OOF probabilities,
# while the model saved above emits isotonic-calibrated probabilities. The
# exported threshold may therefore not match the intended precision target -
# consider re-selecting it on calibrated out-of-fold probabilities.
export = {
    "model_name": best_name,
    "threshold": float(thr_best),
    "feature_cols": exported_cols,
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "min_precision_target": float(MIN_PRECISION),
}
config_file = Path("models").joinpath("inference_config.json")
with open(config_file, "w") as f:
    f.write(json.dumps(export, indent=2))
print("✅ inference_config.json guardado.")


## Registry de experimentos

In [None]:

from datetime import datetime

# One registry entry per run: CV protocol, per-model OOF scores, the selected
# model with its operating point, and where the artifacts were written.
registry_entry = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    # Mirrors the StratifiedKFold settings used for `cv`; keep both in sync.
    "cv": {"n_splits": 5, "shuffle": True, "random_state": 42},
    "scores": scores_df.to_dict(orient="records"),
    "selected_model": best_name,
    "operating_point": {"threshold": float(thr_best), **pr_metrics_from_probs(y, oof_probs[best_name], threshold=thr_best)},
    "artifacts": {"model_path": f"models/{best_name.lower()}_calibrated.pkl", "inference_config": "models/inference_config.json"}
}

REGISTRY_PATH = Path("models") / "registry.json"

# Load the previous history, if any. A non-list payload is wrapped so the
# registry is always a list of entries.
reg = []
if REGISTRY_PATH.exists():
    try:
        previous = json.loads(REGISTRY_PATH.read_text())
    except (OSError, ValueError) as exc:
        # An unreadable registry used to be dropped silently and then
        # overwritten. Report it and keep a copy before starting a fresh
        # history, so earlier runs are not lost without notice.
        print(f"⚠️ No se pudo leer {REGISTRY_PATH}: {exc}")
        backup_path = REGISTRY_PATH.with_name(REGISTRY_PATH.name + ".corrupt")
        try:
            REGISTRY_PATH.replace(backup_path)
            print(f"⚠️ Registry anterior respaldado en {backup_path}")
        except OSError as backup_exc:
            # Still best effort: the run is recorded even if the backup fails.
            print(f"⚠️ No se pudo respaldar el registry anterior: {backup_exc}")
    else:
        reg = previous if isinstance(previous, list) else [previous]

reg.append(registry_entry)
REGISTRY_PATH.write_text(json.dumps(reg, indent=2))
print("✅ Registry actualizado en", REGISTRY_PATH)


## Inferencia con umbral

In [None]:

def load_model_and_config(model_path=None, config_path=None):
    """Load the calibrated model together with its inference configuration.

    Args:
        model_path: Path of the joblib-serialised model. When omitted, it is
            derived from the ``"model_name"`` stored in the configuration, so
            the model always matches the threshold saved next to it and the
            function also works in a fresh session. If the configuration has
            no such entry, the global ``best_name`` is used instead.
        config_path: Path of the JSON configuration. Defaults to
            ``models/inference_config.json``.

    Returns:
        tuple: ``(model, cfg)`` - the deserialised model and the parsed
        configuration.
    """
    config_path = config_path or (Path("models") / "inference_config.json")
    with open(config_path, "r") as f:
        cfg = json.load(f)

    if not model_path:
        # Prefer the name recorded with the threshold over session state.
        stored_name = cfg.get("model_name") if isinstance(cfg, dict) else None
        model_name = stored_name or best_name
        model_path = Path("models") / f"{model_name.lower()}_calibrated.pkl"

    # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
    model = joblib.load(model_path)
    return model, cfg

def predict_with_threshold(X_new, model, threshold):
    """Score ``X_new`` and binarise the positive-class probability.

    Args:
        X_new: Samples handed to ``model.predict_proba``.
        model: Object exposing ``predict_proba``; column 1 of its output is
            taken as the positive-class probability.
        threshold: Probabilities ``>= threshold`` are labelled 1, the rest 0.

    Returns:
        tuple: ``(labels, probabilities)`` - the integer predictions and the
        positive-class probabilities they were derived from.
    """
    positive_scores = model.predict_proba(X_new)[:, 1]
    labels = (positive_scores >= threshold).astype(int)
    return labels, positive_scores

# Ejemplo (descomentar cuando tengas X_new):
# model, cfg = load_model_and_config()
# y_pred, y_prob = predict_with_threshold(X_new, model, cfg["threshold"])
# print("Preds:", y_pred[:10], "Probs:", y_prob[:10])


## Housekeeping (opcional)

In [None]:

# Make sure the artifacts folder exists, then list the pickled models in it.
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)

existing = [*MODELS_DIR.glob("*.pkl")]
print("Archivos actuales en models/:")
for pkl_file in existing:
    print("-", pkl_file.name)

print("\nSugerencia: conservar sólo los modelos calibrados y registrar métricas en registry.json")
