## Import the libraries

In [3]:
# Data manipulation
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Evaluation
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    fbeta_score,
    make_scorer
)

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score


#### Load the saved data

In [36]:
data = joblib.load("data/processed/modeling_data.joblib")

X_train = data["X_train"]
y_train = data["y_train"]
X_test = data["X_test"]
y_test = data["y_test"]
feature_names = data["feature_names"]

### General Grid Search Config

In [5]:
f2_scorer = make_scorer(fbeta_score, beta=2.0)

In [6]:
cv = 5
scoring = f2_scorer
n_jobs = -1
n_iter = 15

## Logistic Regression

In [7]:
lr_params ={
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['liblinear', 'saga'],
    'class_weight': ['balanced'],
    'max_iter': [1000],
    'l1_ratio': [0.5]
}

In [10]:
lr_grid = RandomizedSearchCV(
    LogisticRegression(random_state=42),
    param_distributions=lr_params,
    scoring=scoring,
    n_iter=30, # space of parameters
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)

In [11]:
lr_grid.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


25 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jose_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jose_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\jose_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1218, in fit
    solver = _check_solver(se

0,1,2
,estimator,LogisticRegre...ndom_state=42)
,param_distributions,"{'C': [0.001, 0.01, ...], 'class_weight': ['balanced'], 'l1_ratio': [0.5], 'max_iter': [1000], ...}"
,n_iter,30
,scoring,"make_scorer(f...ct', beta=2.0)"
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [12]:
print(f"Best params: {lr_grid.best_params_}")
print(f"Best F2 score (Beta=2): {lr_grid.best_score_:.4f}\n")

Best params: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 1000, 'l1_ratio': 0.5, 'class_weight': 'balanced', 'C': 10}
Best F2 score (Beta=2): 0.4736



## Random Forest

In [24]:
rf_params = {
    'n_estimators': [100],
    'max_depth': [5, 10],
    'min_samples_split': [5],
    'min_samples_leaf': [4],
    'max_features': ['sqrt']
}

In [25]:
rf_grid = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=rf_params,
    scoring=scoring,
    n_iter=n_iter,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)

In [26]:
rf_grid.fit(X_train, y_train)



Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10}
Best ROC-AUC: 0.0000



In [42]:
print(f"Best params: {rf_grid.best_params_}")
print(f"Best F2 score (Beta 2): {rf_grid.best_score_:.4f}\n")

Best params: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10}
Best F2 score (Beta 2): 0.0000



## XGBoost

In [27]:
ratio = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Class Ratio: {ratio:.4f}")

xgb_params = {
    'n_estimators': [100],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.5],
    'scale_pos_weight': [ratio, ratio*2, ratio*3]
}

Class Ratio: 6.7734


In [28]:
xgb_grid = RandomizedSearchCV(
    XGBClassifier(random_state=42, eval_metric='logloss'),
    param_distributions=xgb_params,
    scoring=scoring,
    n_iter=n_iter,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)

In [29]:
xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


2 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jose_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jose_\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 751, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\jose_\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\sklearn.py", line 1787, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^

0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_distributions,"{'colsample_bytree': [0.8, 1.0], 'gamma': [0, 0.1, ...], 'learning_rate': [0.01, 0.05, ...], 'max_depth': [3, 5, ...], ...}"
,n_iter,15
,scoring,"make_scorer(f...ct', beta=2.0)"
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False


In [30]:
print(f"Best params: {xgb_grid.best_params_}")
print(f"Best F2 score (Beta=2): {xgb_grid.best_score_:.4f}\n")

Best params: {'subsample': 0.8, 'scale_pos_weight': np.float64(13.546739677040845), 'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 1.0}
Best F2 score (Beta=2): 0.4985



## Gradient Boosting

In [39]:
gb_params = {
    'n_estimators': [100],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [5],
    'min_samples_split': [5],
    'min_samples_leaf': [2],
    'subsample': [0.8, 1.0]
}


In [40]:
gb_grid = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_distributions=gb_params,
    scoring=scoring,
    n_iter=n_iter,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)

In [41]:
gb_grid.fit(X_train, y_train)
print(f"Best params: {gb_grid.best_params_}")
print(f"Best F2 score (Beta=2): {gb_grid.best_score_:.4f}\n")



Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best params: {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 5, 'learning_rate': 0.2}
Best F2 score (Beta=2): 0.0180



## LightGBM

In [31]:
lgbm_params = {
    'n_estimators': [100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7],
    'num_leaves': [30, 50, 70],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 0.1, 0.5, 1.0],
    'class_weight': ['balanced']
}

In [32]:
lgbm_grid = RandomizedSearchCV(
    LGBMClassifier(random_state=42, verbose=-1),
    param_distributions=lgbm_params,
    scoring=scoring,
    n_iter=n_iter,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)


In [33]:
lgbm_grid.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


0,1,2
,estimator,"LGBMClassifie...2, verbose=-1)"
,param_distributions,"{'class_weight': ['balanced'], 'colsample_bytree': [0.8, 1.0], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5, ...], ...}"
,n_iter,15
,scoring,"make_scorer(f...ct', beta=2.0)"
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,7
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [34]:
print(f"Best params: {lgbm_grid.best_params_}")
print(f"Best F2 score (Beta=2): {lgbm_grid.best_score_:.4f}\n")

Best params: {'subsample': 1.0, 'reg_lambda': 0.5, 'reg_alpha': 0.5, 'num_leaves': 50, 'n_estimators': 100, 'min_child_samples': 20, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'class_weight': 'balanced'}
Best F2 score (Beta=2): 0.4999



## Model Comparison

In [43]:
models = {
    'Logistic Regression': lr_grid,
    'Random Forest': rf_grid,
    'XGBoost': xgb_grid,
    'Gradient Boosting': gb_grid,
    'LightGBM': lgbm_grid
}

metric_results = []

for name, grid in models.items():
    clf = grid.best_estimator_
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2)
    roc = roc_auc_score(y_test, y_proba)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    metric_results.append({
        "Model": name,
        "F2 (Beta=2)": f2,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "ROC_AUC": roc,
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "TP": tp
    })

metrics_df = pd.DataFrame(metric_results).sort_values("ROC_AUC", ascending=False)
print(metrics_df.to_string(index=False))



              Model  F2 (Beta=2)  Accuracy  Precision   Recall       F1  ROC_AUC     TN     FP    FN    TP
           LightGBM     0.499198  0.656897   0.229461 0.706959 0.346467 0.744190 255889 138086 17045 41121
            XGBoost     0.499059  0.446002   0.177658 0.911151 0.297340 0.743335 148658 245317  5168 52998
  Gradient Boosting     0.018313  0.871609   0.535915 0.014751 0.028711 0.741331 393232    743 57308   858
      Random Forest     0.000021  0.871357   1.000000 0.000017 0.000034 0.715172 393975      0 58165     1
Logistic Regression     0.472486  0.593268   0.199566 0.717945 0.312318 0.707672 226481 167494 16406 41760


In [44]:
from sklearn.metrics import average_precision_score

In [45]:
for name, grid in models.items():
    clf = grid.best_estimator_
    y_proba = clf.predict_proba(X_test)[:, 1]
    pr_auc = average_precision_score(y_test, y_proba)
    print(f"{name} PR-AUC: {pr_auc:.4f}")

Logistic Regression PR-AUC: 0.2586
Random Forest PR-AUC: 0.2728
XGBoost PR-AUC: 0.2920
Gradient Boosting PR-AUC: 0.2915




LightGBM PR-AUC: 0.2920


In [46]:
# Ejemplo: FN cuesta 10x más que FP
cost_fn = 10
cost_fp = 1

for name in metrics_df['Model']:
    row = metrics_df[metrics_df['Model'] == name].iloc[0]
    total_cost = row['FN'] * cost_fn + row['FP'] * cost_fp
    print(f"{name}: Costo total = {total_cost:,.0f}")


LightGBM: Costo total = 308,536
XGBoost: Costo total = 296,997
Gradient Boosting: Costo total = 573,823
Random Forest: Costo total = 581,650
Logistic Regression: Costo total = 331,554


In [None]:
# Save the best model
best_model_name = metrics_df.iloc[0]['Model']
best_model = models[best_model_name].best_estimator_

joblib.dump(best_model, 'models/best_model.pkl')
print(f"\n Best model {best_model_name} is saved")