## Import the libraries

In [11]:
# Standard library
from pathlib import Path

# Data manipulation
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Evaluation
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    roc_curve
)

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score


#### Load the saved data

In [6]:
# Load the saved modeling bundle from disk; it is indexed like a dict below.
data = joblib.load("data/processed/modeling_data.joblib")

# Unpack the stored entries into the names used by the rest of the notebook.
X_train, y_train = data["X_train"], data["y_train"]
X_test, y_test = data["X_test"], data["y_test"]
feature_names = data["feature_names"]

### General Grid Search Config

In [9]:
# Settings shared by the hyperparameter searches in this notebook:
#   cv      -> value passed as each search's `cv` argument (5 folds)
#   scoring -> metric name used to rank candidates
#   n_jobs  -> parallelism for the searches that use it (-1 = all cores in scikit-learn)
cv, scoring, n_jobs = 5, 'roc_auc', -1

## Logistic Regression

In [10]:
# Search space for logistic regression: 6 * 2 * 2 * 2 * 1 = 48 combinations.
lr_params = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    class_weight=[None, 'balanced'],
    max_iter=[1000],
)

In [12]:
# Exhaustive cross-validated search over every combination in `lr_params`.
lr_grid = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=lr_params,
    cv=cv,
    scoring=scoring,
    n_jobs=n_jobs,
    verbose=1,
)

In [13]:
# Run the logistic-regression search, then report the winning configuration.
lr_grid.fit(X_train, y_train)
print(
    f"Best params: {lr_grid.best_params_}",
    f"Best ROC-AUC: {lr_grid.best_score_:.4f}\n",
    sep="\n",
)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

## Random Forest

In [None]:
# Search space for the random forest: 3 * 4 * 3 * 3 * 1 * 2 = 216 combinations.
rf_params = dict(
    n_estimators=[100, 200, 300],      # number of trees
    max_depth=[10, 20, 30, None],      # maximum tree depth
    min_samples_split=[2, 5, 10],      # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4],        # minimum samples in a leaf
    max_features=['sqrt'],
    class_weight=[None, 'balanced'],
)

In [None]:
# Random-forest search. Parallelism is given to the forest itself (n_jobs=-1),
# so the search evaluates candidates one at a time (n_jobs=1).
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=rf_params,
    cv=cv,
    scoring=scoring,
    n_jobs=1,
    verbose=1,
)

In [None]:
# Run the random-forest search, then report the winning configuration.
rf_grid.fit(X_train, y_train)
print(
    f"Best params: {rf_grid.best_params_}",
    f"Best ROC-AUC: {rf_grid.best_score_:.4f}\n",
    sep="\n",
)

## XGBoost

In [None]:
# Search space for XGBoost: 3 * 3 * 4 * 3 * 2 * 2 * 3 * 2 = 2,592 combinations.
xgb_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7, 9],
    min_child_weight=[1, 3, 5],        # similar to min_samples_leaf
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],       # fraction of features
    gamma=[0, 0.1, 0.2],               # regularization
    # class imbalance: either no reweighting (1) or the ratio of
    # label-0 rows to label-1 rows in the training target
    scale_pos_weight=[1, (y_train == 0).sum() / (y_train == 1).sum()],
)

In [None]:
# Exhaustive cross-validated search over `xgb_params`.
# The deprecated `use_label_encoder` flag is intentionally not passed: current
# XGBoost releases ignore it and emit a warning on every fit, which would be
# repeated for each candidate/fold. The target is compared against 0 and 1 when
# building `scale_pos_weight`, so no label encoding is expected to be needed.
xgb_grid = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid=xgb_params,
    scoring=scoring,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)

In [None]:
# Run the XGBoost search, then report the winning configuration.
xgb_grid.fit(X_train, y_train)
print(
    f"Best params: {xgb_grid.best_params_}",
    f"Best ROC-AUC: {xgb_grid.best_score_:.4f}\n",
    sep="\n",
)

## Gradient Boosting

In [None]:
# Search space for gradient boosting: 3 * 4 * 3 * 3 * 3 * 2 = 648 combinations.
gb_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.2],  # learning rate
    max_depth=[3, 5, 7],                   # boosting uses smaller trees
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=[0.8, 1.0],                  # fraction of samples per tree
)


In [None]:
# Exhaustive cross-validated search over every combination in `gb_params`.
gb_grid = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gb_params,
    cv=cv,
    scoring=scoring,
    n_jobs=n_jobs,
    verbose=1,
)

In [None]:
# Run the gradient-boosting search, then report the winning configuration.
gb_grid.fit(X_train, y_train)
print(
    f"Best params: {gb_grid.best_params_}",
    f"Best ROC-AUC: {gb_grid.best_score_:.4f}\n",
    sep="\n",
)

## LightGBM

In [None]:
# Search space for LightGBM:
# 3 * 3 * 4 * 3 * 3 * 2 * 2 * 3 * 3 * 2 = 23,328 combinations.
lgbm_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7, -1],           # -1 = no limit
    num_leaves=[31, 50, 100],          # LightGBM grows trees leaf-wise
    min_child_samples=[20, 30, 50],    # similar to min_samples_leaf
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_alpha=[0, 0.1, 0.5],           # L1 regularization
    reg_lambda=[0, 0.1, 0.5],          # L2 regularization
    class_weight=[None, 'balanced'],
)

In [None]:
# LightGBM search.
# The full grid in `lgbm_params` has 23,328 combinations, i.e. 116,640 fits with
# 5-fold CV, which is not practical to run exhaustively. Sample a fixed budget of
# candidates from the same value lists instead (lists are sampled uniformly).
# The object keeps the `lgbm_grid` name and exposes the same fitted attributes
# (`best_params_`, `best_score_`, `best_estimator_`) used later in the notebook.
lgbm_grid = RandomizedSearchCV(
    estimator=LGBMClassifier(random_state=42, verbose=-1),
    param_distributions=lgbm_params,
    n_iter=100,        # number of sampled candidates; raise for a wider search
    scoring=scoring,
    cv=cv,
    n_jobs=n_jobs,
    random_state=42,   # makes the sampled candidates reproducible
    verbose=1,
)


In [None]:
# Run the LightGBM search, then report the winning configuration.
lgbm_grid.fit(X_train, y_train)
print(
    f"Best params: {lgbm_grid.best_params_}",
    f"Best ROC-AUC: {lgbm_grid.best_score_:.4f}\n",
    sep="\n",
)

## Model Comparison

In [None]:
# Map a display name to each fitted search object so they can be compared.
models = {
    'Logistic Regression': lr_grid,
    'Random Forest': rf_grid,
    'Gradient Boosting': gb_grid,
    'XGBoost': xgb_grid,
    'LightGBM': lgbm_grid
}

# One row per model: best mean cross-validated score and the parameters behind it.
results = pd.DataFrame({
    'Model': list(models),
    'Best CV ROC-AUC': [search.best_score_ for search in models.values()],
    'Best Params': [search.best_params_ for search in models.values()]
})

# Highest cross-validated ROC-AUC first.
results = results.sort_values('Best CV ROC-AUC', ascending=False)
print("\n" + "="*80)
print("RANKING of MODELOS (by ROC-AUC in Cross-Validation)")
print("="*80)
print(results.to_string(index=False))

# Save the best model: the top row after sorting, refit by its search object.
best_model_name = results.iloc[0]['Model']
best_model = models[best_model_name].best_estimator_

# Create the output directory first; joblib.dump does not create missing
# parent folders and would otherwise fail after all searches have finished.
best_model_path = Path('models/best_model.pkl')
best_model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, best_model_path)
print(f"\n Best model {best_model_name} is saved")