In [None]:
# 03_modeling.ipynb

# ---------------------------------------------
# 🤖 Modelado Predictivo - Open Banking Challenge
# Autor: Pablo Flores
# ---------------------------------------------

In [None]:
# 🔧 1. Librerías necesarias
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [None]:
# 📁 2. Load the final dataset
# The path is relative to the directory the kernel was started from.
dataset_path = "data/processed/final_dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
# 🔍 3. Separate features and target
# `has_insurance` is the label; every remaining column is used as a feature.
y = df["has_insurance"]
X = df.drop(columns="has_insurance")

In [None]:
# 🔀 4. Train/test split
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# 📈 5. Train the candidate models
# Every estimator is fitted on the training split and evaluated on the test
# split; the fitted estimator and its ROC AUC are stored in `results`.
# NOTE(review): `use_label_encoder` appears to be deprecated/ignored by recent
# XGBoost releases — confirm against the version pinned for this project.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42,
    ),
}

results = {}

for name in models:
    model = models[name]
    print(f"\n🔹 Entrenando: {name}")
    model.fit(X_train, y_train)

    # Hard predictions feed the classification report; the second
    # predict_proba column is the score used for the ROC curve and the AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print(classification_report(y_test, y_pred))
    auc = roc_auc_score(y_test, y_proba)
    print(f"AUC-ROC: {auc:.4f}")

    # Title the axes that the ROC display just drew on.
    roc_display = RocCurveDisplay.from_predictions(y_test, y_proba)
    roc_display.ax_.set_title(f"Curva ROC - {name}")
    plt.show()

    results[name] = dict(model=model, auc=auc)


In [None]:
# 🌟 6. Feature importance (Random Forest)
# Plot the 15 features with the highest importance reported by the fitted forest.
rf_model = results["Random Forest"]["model"]
importances = rf_model.feature_importances_
feature_names = X.columns
feat_imp = (
    pd.Series(importances, index=feature_names)
    .sort_values(ascending=False)
    .head(15)
)

importance_ax = sns.barplot(x=feat_imp.values, y=feat_imp.index)
importance_ax.set_title("🔍 Importancia de features - Random Forest")
importance_ax.set_xlabel("Importancia")
importance_ax.set_ylabel("Feature")
plt.tight_layout()
plt.show()

In [None]:

# 💾 7. Save the best model
# The winner is the entry in `results` with the highest stored "auc" value.
best_model_name = max(results, key=lambda m: results[m]["auc"])
best_model = results[best_model_name]["model"]

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise the run fails here after all the training.
model_path = Path("outputs/models") / f"{best_model_name.replace(' ', '_').lower()}_model.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_path)
print(f"✅ Modelo guardado: {best_model_name}")
