# Entrenamiento y Exportación de Modelo usando XGBoost
Este notebook entrena un clasificador sobre KOI (cumulative), exporta artefactos ( scaler + label encoder + stats) y provee una prueba con TCEs.

In [1]:
# Limpio si instalaste TF antes (opcional)
!pip -q uninstall -y tensorflow tensorflow-cpu || true

# Instalar lo necesario
!pip -q install pandas numpy scikit-learn xgboost requests joblib

[0m

In [2]:
# Imports
import io, json, os, requests
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import joblib


In [3]:
# @title Descargar KOI cumulative (API Exoplanet Archive)
BASE = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI"

cols = [
 "kepoi_name","koi_disposition","koi_pdisposition","koi_score",
 "koi_period","koi_duration","koi_depth","koi_prad","koi_srad",
 "koi_teq","koi_steff","koi_slogg","koi_smet","koi_kepmag",
 "koi_model_snr","koi_num_transits"
]

where = ("koi_disposition like 'CONFIRMED' or "
         "koi_disposition like 'CANDIDATE' or "
         "koi_disposition like 'FALSE POSITIVE'")

params = {
    "table": "cumulative",
    "select": ",".join(cols),
    "where": where,
    "format": "csv"
}

r = requests.get(BASE, params=params, timeout=60)
r.raise_for_status()
df = pd.read_csv(io.StringIO(r.text))
print(df.shape, df.koi_disposition.value_counts())
df.to_csv("exo_full.csv", index=False)

(9564, 16) koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2746
CANDIDATE         1979
Name: count, dtype: int64


In [4]:
# @title Split train/test y preprocesamiento
df_tr, df_te = train_test_split(df, test_size=0.2, random_state=42, stratify=df["koi_disposition"])
df_tr = df_tr.dropna()
df_te = df_te.dropna()

feature_columns = ['koi_period', 'koi_duration', 'koi_depth', 'koi_prad',
                   'koi_srad', 'koi_teq', 'koi_steff', 'koi_slogg',
                   'koi_smet', 'koi_kepmag', 'koi_model_snr', 'koi_num_transits']

X_tr = df_tr[feature_columns].values
X_te = df_te[feature_columns].values

label_encoder = LabelEncoder()
y_tr = label_encoder.fit_transform(df_tr['koi_disposition'])
y_te = label_encoder.transform(df_te['koi_disposition'])

scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr)
X_te = scaler.transform(X_te)

print("Clases:", label_encoder.classes_)
print("X_tr:", X_tr.shape, "y_tr:", y_tr.shape)

Clases: ['CANDIDATE' 'CONFIRMED' 'FALSE POSITIVE']
X_tr: (6389, 12) y_tr: (6389,)


In [5]:
# @title Instalamos XGBoost
!pip -q install xgboost

In [6]:
# @title Imports para XGBoost
import numpy as np, pandas as pd, joblib, json
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [7]:
# @title Entrenamos con XGBoost (multiclase)
# Requiere: X_tr, y_tr, X_te, y_te, label_encoder ya definidos

xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    eval_metric='mlogloss'
)
xgb.fit(X_tr, y_tr)
y_pred = xgb.predict(X_te)
print(classification_report(y_te, y_pred, target_names=label_encoder.classes_))

                precision    recall  f1-score   support

     CANDIDATE       0.60      0.60      0.60       267
     CONFIRMED       0.84      0.85      0.84       547
FALSE POSITIVE       0.89      0.88      0.89       785

      accuracy                           0.82      1599
     macro avg       0.78      0.78      0.78      1599
  weighted avg       0.82      0.82      0.82      1599



In [8]:
# @title Exportamos artefactos XGBoost
# Guarda: xgb_model.pkl, scaler.pkl, label_encoder.pkl, feature_stats.json
import joblib, json

joblib.dump(xgb, "xgb_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

feature_stats = {
    "feature_columns": feature_columns,
    "medians": df_tr[feature_columns].median(numeric_only=True).to_dict()
}
with open("feature_stats.json","w") as f:
    json.dump(feature_stats, f, indent=2)

print("✅ Artefactos exportados: xgb_model.pkl, scaler.pkl, label_encoder.pkl, feature_stats.json")

✅ Artefactos exportados: xgb_model.pkl, scaler.pkl, label_encoder.pkl, feature_stats.json


In [9]:
# @title (Prueba) Inferencia con XGBoost usando artefactos exportados
# Clasifica 2 TCEs de Kepler con tu modelo XGB
import io, requests

def predict_with_artifacts(df_cases: pd.DataFrame):
    xgb = joblib.load("xgb_model.pkl")
    sc = joblib.load("scaler.pkl")
    le = joblib.load("label_encoder.pkl")
    stats = json.load(open("feature_stats.json"))
    feature_columns = stats["feature_columns"]
    medians = stats["medians"]
    for c in feature_columns:
        if c not in df_cases.columns:
            df_cases[c] = np.nan
    df_cases = df_cases[feature_columns].copy()
    for c in feature_columns:
        if df_cases[c].isna().any():
            df_cases[c] = df_cases[c].fillna(medians.get(c, 0.0))
    X = sc.transform(df_cases.values)
    probs = xgb.predict_proba(X)
    preds = le.inverse_transform(np.argmax(probs, axis=1))
    return preds, probs, list(le.classes_)

# Descargar 2 TCEs con mayor SNR
TAP_URL = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync"
query = """
SELECT TOP 2
  kepid, tce_plnt_num, tce_period, tce_duration, tce_depth, tce_model_snr
FROM q1_q17_dr25_tce
WHERE tce_period > 0 AND tce_duration > 0 AND tce_depth > 0
ORDER BY tce_model_snr DESC
"""
r = requests.get(TAP_URL, params={"query": query, "format": "csv"}, timeout=90)
r.raise_for_status()
tce = pd.read_csv(io.StringIO(r.text))
tce.columns = [c.strip().lower() for c in tce.columns]

# Map mínimo TCE -> features KOI
feature_columns = json.load(open("feature_stats.json"))["feature_columns"]
cases = pd.DataFrame(index=tce.index, columns=feature_columns, dtype="float64")
mapping = {"koi_period":"tce_period","koi_duration":"tce_duration","koi_depth":"tce_depth","koi_model_snr":"tce_model_snr"}
for feat, src in mapping.items():
    if src in tce.columns:
        cases[feat] = pd.to_numeric(tce[src], errors="coerce")

preds, probs, classes = predict_with_artifacts(cases)
print("Predicciones:", preds)
print("Clases:", classes)

Predicciones: ['CONFIRMED' 'FALSE POSITIVE']
Clases: ['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']
