# 🤖 NeoScore - Credit Scoring Models

**Autor**: Luca Camus  
**Fecha**: Enero 2026  
**Objetivo**: Entrenar modelos de ML para predecir riesgo crediticio

**Modelos a implementar**:
1. Logistic Regression (baseline interpretable)
2. Random Forest (ensemble robusto)
3. XGBoost (estado del arte)

## 1. Configuración

In [None]:
# Instalar dependencias
!pip install google-cloud-bigquery pandas matplotlib seaborn scikit-learn xgboost imbalanced-learn --quiet

In [None]:
# Imports
from google.colab import auth
auth.authenticate_user()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery

# Scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, roc_curve, confusion_matrix, 
    classification_report, precision_recall_curve,
    f1_score, accuracy_score
)

# XGBoost
from xgboost import XGBClassifier

# Imbalanced-learn (si hay desbalanceo)
from imblearn.over_sampling import SMOTE

# Notebook-wide defaults: reproducible NumPy randomness and a common plot look.
np.random.seed(42)  # seeds NumPy's global random state

# Apply the style sheet first, then override the default figure size.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('‚úÖ Configuraci√≥n completa')

## 2. Cargar Datos

In [None]:
# BigQuery client bound to the project used by this notebook.
PROJECT_ID = 'scoring-bancario'
client = bigquery.Client(project=PROJECT_ID)

# Select every column of the customer feature table.
# NOTE(review): the project id is repeated inside the table path below;
# keep both in sync if PROJECT_ID ever changes.
query = """
SELECT *
FROM `scoring-bancario.analisis_bancario.customer_features`
"""

# Run the query and materialise the full result as a pandas DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()
print(f'üìä Dataset cargado: {df.shape[0]:,} clientes x {df.shape[1]} features')

## 3. Preparación de Features (¡Evitando Leakage!)

⚠️ **IMPORTANTE**: NO usar `preliminary_credit_score` como feature

In [None]:
# Columns allowed as model inputs, grouped by theme. The notebook author
# marks these as raw data without leakage; `preliminary_credit_score` is
# deliberately absent from the list.
_FEATURE_GROUPS = {
    'demographics': ['age'],
    'balance': [
        'avg_balance',
        'last_balance',
        'min_balance',
        'max_balance',
    ],
    'spend': [
        'total_spend',
        'avg_spend',
        'max_spend',
        'min_spend',
        'std_spend',
    ],
    'activity': [
        'total_transactions',
        'days_active',
        'unique_transaction_days',
        'transaction_frequency',
    ],
    'ratios': [
        'spend_to_balance_ratio',
        'spend_volatility',
        'avg_daily_transactions',
        'avg_daily_spend',
    ],
}

# Flatten the groups in declaration order into the list used downstream.
FEATURES = [name for group in _FEATURE_GROUPS.values() for name in group]

# Name of the label column to predict.
TARGET = 'high_risk_flag'

print(f'üìä Features a usar: {len(FEATURES)}')
print(f'üéØ Variable objetivo: {TARGET}')

In [None]:
# Partition the requested features by whether the loaded DataFrame has them.
loaded_columns = set(df.columns)
available_features = []
missing_features = []
for feature_name in FEATURES:
    if feature_name in loaded_columns:
        available_features.append(feature_name)
    else:
        missing_features.append(feature_name)

print(f'‚úÖ Features disponibles: {len(available_features)}')
if len(missing_features) > 0:
    print(f'‚ö†Ô∏è Features no encontradas: {missing_features}')

# From here on, only the columns that actually exist are used.
# NOTE(review): TARGET is not checked here; a missing label column would
# only surface when the target is selected from df.
FEATURES = available_features

In [None]:
# Feature matrix and label vector, copied so later edits do not touch df.
X = df.loc[:, FEATURES].copy()
y = df.loc[:, TARGET].copy()

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

# Class balance of the label, expressed as percentages.
print(f'\nüìä Distribuci√≥n del target:')
target_share_pct = y.value_counts(normalize=True).round(4) * 100
print(target_share_pct)

In [None]:
# Report which feature columns contain nulls before any imputation.
null_counts = X.isnull().sum()
print('\nüìä Nulos por columna antes de imputar:')
print(null_counts.loc[null_counts > 0])

# Fill each column's nulls with that column's median.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split, so test rows influence the imputed values. Consider
# computing them on the training split only.
column_medians = X.median()
X = X.fillna(column_medians)

remaining_nulls = X.isnull().sum().sum()
print('\n‚úÖ Nulos despu√©s de imputar:', remaining_nulls)

## 4. División Train/Test

In [None]:
# 80/20 split, stratified on y so both parts keep the class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f'üìä Train: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.0f}%)')
print(f'üìä Test:  {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.0f}%)')

# Show the class balance (in percent) of each split.
for split_name, split_labels in (('Train', y_train), ('Test', y_test)):
    print(f'\nüìä Distribuci√≥n en {split_name}:')
    print(split_labels.value_counts(normalize=True).round(4) * 100)

In [None]:
# Standardise the features for Logistic Regression. The scaler is fitted
# on the training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('‚úÖ Features escaladas')

## 5. Modelo 1: Logistic Regression (Baseline)

In [None]:
# Baseline: Logistic Regression on the scaled features.
# class_weight='balanced' reweights the classes to handle imbalance.
lr_model = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
).fit(X_train_scaled, y_train)

# Hard labels and positive-class probabilities on the held-out split.
y_pred_lr = lr_model.predict(X_test_scaled)
y_prob_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Discrimination metrics; Gini is derived from AUC as 2*AUC - 1.
lr_auc = roc_auc_score(y_test, y_prob_lr)
lr_gini = 2 * lr_auc - 1

lr_report = classification_report(
    y_test, y_pred_lr, target_names=["Low Risk", "High Risk"]
)

print('=' * 50)
print('üìä LOGISTIC REGRESSION - Resultados')
print('=' * 50)
print(f'ROC-AUC: {lr_auc:.4f}')
print(f'Gini:    {lr_gini:.4f}')
print(f'\n{lr_report}')

In [None]:
# Pair each feature with its fitted coefficient, then order the rows by
# absolute coefficient size (largest first) for interpretation.
coef_df = pd.DataFrame({'Feature': FEATURES, 'Coeficiente': lr_model.coef_[0]})
coef_df = coef_df.sort_values(by='Coeficiente', key=abs, ascending=False)

print('üìä Features m√°s importantes (Logistic Regression):')
top_coefficients = coef_df.head(10)
print(top_coefficients.to_string(index=False))

## 6. Modelo 2: Random Forest

In [None]:
# Random Forest on the unscaled features.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Hard labels and positive-class probabilities on the held-out split.
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Discrimination metrics; Gini is derived from AUC as 2*AUC - 1.
rf_auc = roc_auc_score(y_test, y_prob_rf)
rf_gini = 2 * rf_auc - 1

rf_report = classification_report(
    y_test, y_pred_rf, target_names=["Low Risk", "High Risk"]
)

print('=' * 50)
print('üìä RANDOM FOREST - Resultados')
print('=' * 50)
print(f'ROC-AUC: {rf_auc:.4f}')
print(f'Gini:    {rf_gini:.4f}')
print(f'\n{rf_report}')

In [None]:
# Random Forest feature importances, most important first.
importance_df = pd.DataFrame(
    {'Feature': FEATURES, 'Importance': rf_model.feature_importances_}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart; rows are reversed so the top feature is drawn last
# and therefore appears at the top of the axis.
bottom_up = importance_df.iloc[::-1]
plt.figure(figsize=(10, 8))
plt.barh(bottom_up['Feature'], bottom_up['Importance'], color='steelblue')
plt.xlabel('Importancia')
plt.title('Feature Importance - Random Forest', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print('\nüìä Top 10 Features (Random Forest):')
top_importances = importance_df.head(10)
print(top_importances.to_string(index=False))

## 7. Modelo 3: XGBoost

In [None]:
# Imbalance weight for XGBoost: number of negative (0) training labels
# divided by the number of positive (1) training labels.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f'Scale pos weight: {scale_pos_weight:.2f}')

# Train XGBoost on the unscaled features.
# `use_label_encoder` is deliberately not passed: it is deprecated in
# current XGBoost releases and only triggers an "unused parameter" warning.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='auc',
)

xgb_model.fit(X_train, y_train)

# Hard labels and positive-class probabilities on the held-out split.
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Discrimination metrics; Gini is derived from AUC as 2*AUC - 1.
xgb_auc = roc_auc_score(y_test, y_prob_xgb)
xgb_gini = 2 * xgb_auc - 1

print('=' * 50)
print('üìä XGBOOST - Resultados')
print('=' * 50)
print(f'ROC-AUC: {xgb_auc:.4f}')
print(f'Gini:    {xgb_gini:.4f}')
print(f'\n{classification_report(y_test, y_pred_xgb, target_names=["Low Risk", "High Risk"])}')

## 8. Comparación de Modelos

In [None]:
# One row per model with its AUC and Gini, best AUC first.
results = pd.DataFrame({
    'Modelo': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'ROC-AUC': [lr_auc, rf_auc, xgb_auc],
    'Gini': [lr_gini, rf_gini, xgb_gini],
})
results = results.sort_values(by='ROC-AUC', ascending=False)

comparison_rule = '=' * 50
print(comparison_rule)
print('üìä COMPARACI√ìN DE MODELOS')
print(comparison_rule)
print(results.to_string(index=False))

In [None]:
# Overlay the ROC curve of each model on a single axis.
fig, ax = plt.subplots(figsize=(10, 8))

# (legend name, positive-class scores, precomputed AUC), in plotting order.
roc_inputs = [
    ('Logistic Regression', y_prob_lr, lr_auc),
    ('Random Forest', y_prob_rf, rf_auc),
    ('XGBoost', y_prob_xgb, xgb_auc),
]
for model_name, model_scores, model_auc in roc_inputs:
    fpr, tpr, _ = roc_curve(y_test, model_scores)
    ax.plot(fpr, tpr, label=f'{model_name} (AUC={model_auc:.4f})', linewidth=2)

# Chance-level reference diagonal.
ax.plot([0, 1], [0, 1], 'k--', label='Random (AUC=0.5)')

ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('Curvas ROC - Comparaci√≥n de Modelos', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Cálculo de KS Statistic

In [None]:
def calculate_ks(y_true, y_prob):
    """Return the KS statistic of a binary classifier.

    The statistic is the largest gap between the true-positive rate and
    the false-positive rate across the thresholds produced by
    ``roc_curve``.

    Args:
        y_true: True binary labels, passed unchanged to ``roc_curve``.
        y_prob: Scores for the positive class, passed unchanged to
            ``roc_curve``.

    Returns:
        The maximum of ``tpr - fpr``. If ``roc_curve`` yields NaN rates,
        the result is NaN.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    # Use the array's own max(): it is vectorised and propagates NaN,
    # whereas the builtin max() loops element by element and returns an
    # order-dependent value when NaN is present.
    return (tpr - fpr).max()

# KS statistic of each model on the held-out split.
ks_lr, ks_rf, ks_xgb = (
    calculate_ks(y_test, model_scores)
    for model_scores in (y_prob_lr, y_prob_rf, y_prob_xgb)
)

ks_rule = '=' * 50
print(ks_rule)
print('üìä KS STATISTIC')
print(ks_rule)
print(f'Logistic Regression: KS = {ks_lr:.4f}')
print(f'Random Forest:       KS = {ks_rf:.4f}')
print(f'XGBoost:             KS = {ks_xgb:.4f}')
print('\nInterpretaci√≥n: KS > 0.40 es excelente, > 0.30 es bueno')

## 10. Tabla Final de Resultados

In [None]:
# Final table: one row per model with AUC, Gini and KS.
final_results = pd.DataFrame({
    'Modelo': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'ROC-AUC': [lr_auc, rf_auc, xgb_auc],
    'Gini': [lr_gini, rf_gini, xgb_gini],
    'KS': [ks_lr, ks_rf, ks_xgb]
})

# Rank by AUC (1 = best) on the unrounded values, so rounding for display
# cannot create artificial ties. method='min' gives genuinely tied models
# the same integer rank; the default 'average' would produce fractional
# ranks such as 1.5 that the int cast silently truncates.
final_results['Ranking'] = (
    final_results['ROC-AUC'].rank(method='min', ascending=False).astype(int)
)

# Round only for presentation; the stable sort keeps the declaration order
# among tied models.
final_results = final_results.round(4).sort_values('Ranking', kind='stable')

print('=' * 60)
print('üìä RESULTADOS FINALES - NeoScore Credit Scoring')
print('=' * 60)
print(final_results.to_string(index=False))
print('=' * 60)

# Best model = first row after sorting by rank.
best_model = final_results.iloc[0]['Modelo']
best_auc = final_results.iloc[0]['ROC-AUC']
print(f'\nüèÜ MEJOR MODELO: {best_model} (AUC={best_auc:.4f})')

In [None]:
# Visualizaci√≥n final
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

models = ['Logistic\nRegression', 'Random\nForest', 'XGBoost']
colors = ['#3498db', '#2ecc71', '#e74c3c']

# ROC-AUC
axes[0].bar(models, [lr_auc, rf_auc, xgb_auc], color=colors)
axes[0].set_title('ROC-AUC', fontsize=14, fontweight='bold')
axes[0].set_ylim(0, 1)
axes[0].axhline(0.5, color='gray', linestyle='--', alpha=0.5)
for i, v in enumerate([lr_auc, rf_auc, xgb_auc]):
    axes[0].text(i, v + 0.02, f'{v:.4f}', ha='center', fontweight='bold')

# Gini
axes[1].bar(models, [lr_gini, rf_gini, xgb_gini], color=colors)
axes[1].set_title('Gini', fontsize=14, fontweight='bold')
axes[1].set_ylim(0, 1)
for i, v in enumerate([lr_gini, rf_gini, xgb_gini]):
    axes[1].text(i, v + 0.02, f'{v:.4f}', ha='center', fontweight='bold')

# KS
axes[2].bar(models, [ks_lr, ks_rf, ks_xgb], color=colors)
axes[2].set_title('KS Statistic', fontsize=14, fontweight='bold')
axes[2].set_ylim(0, 1)
axes[2].axhline(0.40, color='green', linestyle='--', alpha=0.5, label='Excelente (0.40)')
axes[2].axhline(0.30, color='orange', linestyle='--', alpha=0.5, label='Bueno (0.30)')
for i, v in enumerate([ks_lr, ks_rf, ks_xgb]):
    axes[2].text(i, v + 0.02, f'{v:.4f}', ha='center', fontweight='bold')
axes[2].legend(loc='upper right')

plt.tight_layout()
plt.show()

## 11. Conclusiones

In [None]:
# Closing summary: metrics of the top-ranked model, the leading features
# and static interpretation / next-step notes.
conclusions_rule = '=' * 60
print(conclusions_rule)
print('üìä CONCLUSIONES - NeoScore Credit Scoring')
print(conclusions_rule)

# First row of final_results is the best-ranked model.
winner_row = final_results.iloc[0]
print(f'''
1. MEJOR MODELO: {best_model}
   - ROC-AUC: {winner_row["ROC-AUC"]:.4f}
   - Gini: {winner_row["Gini"]:.4f}
   - KS: {winner_row["KS"]:.4f}

2. FEATURES M√ÅS IMPORTANTES:
''')
# NOTE(review): importance_df holds the Random Forest importances, so this
# list is the same whichever model was ranked best above.
print(importance_df.head(5).to_string(index=False))

print('''
3. INTERPRETACI√ìN:
   - AUC > 0.70: Modelo aceptable para producci√≥n
   - Gini > 0.40: Buena capacidad discriminativa
   - KS > 0.30: Buena separaci√≥n entre clases

4. PR√ìXIMOS PASOS:
   - Optimizar hiperpar√°metros del mejor modelo
   - Validar con cross-validation m√°s rigurosa
   - Crear pipeline de producci√≥n
   - Documentar modelo (Model Card)
''')

print(conclusions_rule)