## Import the libraries

In [39]:
# Data manipulation
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Evaluation
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    fbeta_score,
    make_scorer
)

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score


#### Load the saved data

In [40]:
data = joblib.load("data/processed/modeling_data.joblib")

X_train = data["X_train"]
y_train = data["y_train"]
X_test = data["X_test"]
y_test = data["y_test"]
feature_names = data["feature_names"]

### General Grid Search Config

In [41]:
f2_scorer = make_scorer(fbeta_score, beta=2.0)

In [42]:
cv = 5
scoring = f2_scorer
n_jobs = -1
n_iter = 15

## Logistic Regression

In [44]:
lr_params ={
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['liblinear', 'saga'],
    'class_weight': ['balanced'],
    'max_iter': [1000],
    'l1_ratio': [0.5]
}

In [45]:
lr_grid = RandomizedSearchCV(
    LogisticRegression(random_state=42),
    param_distributions=lr_params,
    scoring=scoring,
    n_iter=48,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)

In [46]:
lr_grid.fit(X_train, y_train)



Fitting 5 folds for each of 30 candidates, totalling 150 fits


25 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jose_\OneDrive\Escritorio\Proyecto Credit Risk\credit-risk-modeling\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jose_\OneDrive\Escritorio\Proyecto Credit Risk\credit-risk-modeling\.venv\Lib\site-packages\sklearn\base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\jose_\OneDrive\Escritorio\Proyecto Credit Risk\credit-risk-modeling\.venv\Lib\site-packages

Best params: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 1000, 'l1_ratio': 0.5, 'class_weight': 'balanced', 'C': 10}
Best ROC-AUC: 0.4736



In [51]:
print(f"Best params: {lr_grid.best_params_}")
print(f"Best F2 score (Beta=2): {lr_grid.best_score_:.4f}\n")

Best params: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 1000, 'l1_ratio': 0.5, 'class_weight': 'balanced', 'C': 10}
Best F2 score (Beta=2): 0.4736



## Random Forest

In [7]:
rf_params = {
    'n_estimators': [100],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}

In [8]:
rf_grid = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=rf_params,
    scoring=scoring,
    n_iter=n_iter,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)

In [9]:
rf_grid.fit(X_train, y_train)
print(f"Best params: {rf_grid.best_params_}")
print(f"Best ROC-AUC: {rf_grid.best_score_:.4f}\n")

Fitting 3 folds for each of 15 candidates, totalling 45 fits


KeyboardInterrupt: 

## XGBoost

In [47]:
ratio = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Class Ratio: {ratio:.4f}")

xgb_params = {
    'n_estimators': [100],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.5],
    'scale_pos_weight': [ratio, ratio*2, ratio*3]
}

Class Ratio: 6.7734


In [48]:
xgb_grid = RandomizedSearchCV(
    XGBClassifier(random_state=42, eval_metric='logloss'),
    param_distributions=xgb_params,
    scoring=scoring,
    n_iter=n_iter,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)

In [49]:
xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best params: {'subsample': 0.8, 'scale_pos_weight': np.float64(6.773369838520423), 'n_estimators': 100, 'min_child_weight': 10, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best ROC-AUC: 0.4978



In [50]:
print(f"Best params: {xgb_grid.best_params_}")
print(f"Best F2 score (Beta=2): {xgb_grid.best_score_:.4f}\n")

Best params: {'subsample': 0.8, 'scale_pos_weight': np.float64(6.773369838520423), 'n_estimators': 100, 'min_child_weight': 10, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best F2 score (Beta=2): 0.4978



## Gradient Boosting

In [13]:
gb_params = {
    'n_estimators': [100],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0]
}


In [14]:
gb_grid = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_distributions=gb_params,
    scoring=scoring,
    n_iter=n_iter,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)

In [15]:
gb_grid.fit(X_train, y_train)
print(f"Best params: {gb_grid.best_params_}")
print(f"Best F2 score (Beta=2): {gb_grid.best_score_:.4f}\n")

Fitting 3 folds for each of 15 candidates, totalling 45 fits


KeyboardInterrupt: 

## LightGBM

In [52]:
lgbm_params = {
    'n_estimators': [100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7],
    'num_leaves': [30, 50, 70],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 0.1, 0.5, 1.0],
    'class_weight': ['balanced']
}

In [53]:
lgbm_grid = RandomizedSearchCV(
    LGBMClassifier(random_state=42, verbose=-1),
    param_distributions=lgbm_params,
    scoring=scoring,
    n_iter=n_iter,
    cv=cv,
    n_jobs=n_jobs,
    verbose=1
)


In [54]:
lgbm_grid.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


0,1,2
,"estimator  estimator: estimator object An object of that type is instantiated for each grid point. This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed.","LGBMClassifie...2, verbose=-1)"
,"param_distributions  param_distributions: dict or list of dicts Dictionary with parameters names (`str`) as keys and distributions or lists of parameters to try. Distributions must provide a ``rvs`` method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. If a list of dicts is given, first a dict is sampled uniformly, and then a parameter is sampled using that dict as above.","{'class_weight': ['balanced'], 'colsample_bytree': [0.8, 1.0], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5, ...], ...}"
,"n_iter  n_iter: int, default=10 Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.",15
,"scoring  scoring: str, callable, list, tuple or dict, default=None Strategy to evaluate the performance of the cross-validated model on the test set. If `scoring` represents a single score, one can use: - a single string (see :ref:`scoring_string_names`); - a callable (see :ref:`scoring_callable`) that returns a single value; - `None`, the `estimator`'s  :ref:`default evaluation criterion ` is used. If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric  names and the values are the metric scores; - a dictionary with metric names as keys and callables as values. See :ref:`multimetric_grid_search` for an example. If None, the estimator's score method is used.","make_scorer(f...ct', beta=2.0)"
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. .. versionchanged:: v0.20  `n_jobs` default changed from 1 to None",-1
,"refit  refit: bool, str, or callable, default=True Refit an estimator using the best found parameters on the whole dataset. For multiple metric evaluation, this needs to be a `str` denoting the scorer that would be used to find the best parameters for refitting the estimator at the end. Where there are considerations other than maximum score in choosing a best estimator, ``refit`` can be set to a function which returns the selected ``best_index_`` given the ``cv_results_``. In that case, the ``best_estimator_`` and ``best_params_`` will be set according to the returned ``best_index_`` while the ``best_score_`` attribute will not be available. The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this ``RandomizedSearchCV`` instance. Also for multiple metric evaluation, the attributes ``best_index_``, ``best_score_`` and ``best_params_`` will only be available if ``refit`` is set and all of them will be determined w.r.t this specific scorer. See ``scoring`` parameter to know more about multiple metric evaluation. See :ref:`this example ` for an example of how to use ``refit=callable`` to balance model complexity and cross-validated score. .. versionchanged:: 0.20  Support for callable added.",True
,"cv  cv: int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. These splitters are instantiated with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",5
,"verbose  verbose: int Controls the verbosity: the higher, the more messages. - >1 : the computation time for each fold and parameter candidate is  displayed; - >2 : the score is also displayed; - >3 : the fold and candidate parameter indexes are also displayed  together with the starting time of the computation.",1
,"pre_dispatch  pre_dispatch: int, or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use  this for lightweight and fast-running jobs, to avoid delays due to on-demand  spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A str, giving an expression as a function of n_jobs, as in '2*n_jobs'",'2*n_jobs'
,"random_state  random_state: int, RandomState instance or None, default=None Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",

0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,7
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [55]:
print(f"Best params: {lgbm_grid.best_params_}")
print(f"Best F2 score (Beta=2): {lgbm_grid.best_score_:.4f}\n")

Best params: {'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0, 'num_leaves': 50, 'n_estimators': 100, 'min_child_samples': 50, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'class_weight': 'balanced'}
Best F2 score (Beta=2): 0.4995



## Model Comparison

In [58]:
models = {
    'Logistic Regression': lr_grid,
    'XGBoost': xgb_grid,
    'LightGBM': lgbm_grid
}

metric_results = []

for name, grid in models.items():
    clf = grid.best_estimator_
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2)
    roc = roc_auc_score(y_test, y_proba)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    metric_results.append({
        "Model": name,
        "F2 (Beta=2)": f2,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "ROC_AUC": roc,
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "TP": tp
    })

metrics_df = pd.DataFrame(metric_results).sort_values("ROC_AUC", ascending=False)
print(metrics_df.to_string(index=False))





              Model  F2 (Beta=2)  Accuracy  Precision   Recall       F1  ROC_AUC     TN     FP    FN    TP
           LightGBM     0.499646  0.658600   0.230300 0.706100 0.347319 0.744085 256709 137266 17095 41071
            XGBoost     0.497893  0.655470   0.228475 0.706031 0.345231 0.743355 255298 138677 17099 41067
Logistic Regression     0.472486  0.593268   0.199566 0.717945 0.312318 0.707672 226481 167494 16406 41760


In [31]:
from sklearn.metrics import average_precision_score

In [59]:
for name, grid in models.items():
    clf = grid.best_estimator_
    y_proba = clf.predict_proba(X_test)[:, 1]
    pr_auc = average_precision_score(y_test, y_proba)
    print(f"{name} PR-AUC: {pr_auc:.4f}")

Logistic Regression PR-AUC: 0.2586
XGBoost PR-AUC: 0.2919




LightGBM PR-AUC: 0.2924


In [60]:
# Ejemplo: FN cuesta 10x más que FP
cost_fn = 10
cost_fp = 1

for name in metrics_df['Model']:
    row = metrics_df[metrics_df['Model'] == name].iloc[0]
    total_cost = row['FN'] * cost_fn + row['FP'] * cost_fp
    print(f"{name}: Costo total = {total_cost:,.0f}")


LightGBM: Costo total = 308,216
XGBoost: Costo total = 309,667
Logistic Regression: Costo total = 331,554


In [None]:
# Save the best model
best_model_name = metrics_df.iloc[0]['Model']
best_model = models[best_model_name].best_estimator_

joblib.dump(best_model, 'models/best_model.pkl')
print(f"\n Best model {best_model_name} is saved")