In [1]:
import pickle
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
import numpy as np

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

##### Leer los datos de entrenamiento limpios previamente guardados

In [2]:
# Leer los datos de entrenamiento desde el archivo pickle
with open("files/datos_entrenamiento.pkl", "rb") as archivo:
    datos_entrenamiento = pickle.load(archivo)

X = datos_entrenamiento['X']
y = datos_entrenamiento['y']

In [3]:
# Dividir el dataset en entrenamiento y prueba (20% prueba, 80% entrenamiento)
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2,random_state = 42)

##### Buscar los mejores Hiperparámetros para un modelo XGB

In [4]:
# Distribución de los hiperparámetros a ajustar para XGBoost
param_dist = {
        'n_estimators' : [200,250,300,350,400], # no of trres
        'learning_rate' : [0.01],
        'min_child_weight': [4],
        'gamma': [0.5,0.7,1.0,1.3],
        'colsample_bytree': [0.5],
        'subsample' : [0.3],
        'max_depth': [10],
        'learning_rate' : [0.01], 
        'reg_lambda' : [0.6],
        'objective' :['binary:logistic'],
        'eval_metric':['auc']
        }

#folds = 5
#param_comb = 500

# Modelo base XGBoost     
xgb_model = xgb.XGBClassifier(n_jobs = -1)

# Validación cruzada
cv = 3

random_search_xgb = RandomizedSearchCV(
        estimator=xgb_model, 
        param_distributions=param_dist, 
        n_iter=500, 
        scoring='roc_auc', 
        refit = 'roc_auc',
        n_jobs=-1, 
        cv=cv, 
        verbose=3, 
        random_state=42)

random_search_xgb.fit(X_train, y_train)



Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [5]:
# Mostrar los mejores hiperparámetros encontrados para XGBoost
print(random_search_xgb.best_params_)

{'subsample': 0.3, 'reg_lambda': 0.6, 'objective': 'binary:logistic', 'n_estimators': 400, 'min_child_weight': 4, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1.3, 'eval_metric': 'auc', 'colsample_bytree': 0.5}


In [6]:
# Mostrar la mejor puntuación obtenida con esos hiperparámetros para XGBoost
print(random_search_xgb.best_score_)

0.7227176389662323


In [8]:
# Mostrar el mejor estimador
print(random_search_xgb.best_estimator_)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              feature_weights=None, gamma=1.3, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.01, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
              max_leaves=None, min_child_weight=4, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=400,
              n_jobs=-1, num_parallel_tree=None, ...)


##### Buscar los mejores Hiperparámetros para un modelo LGBM

In [9]:
# Distribución de los hiperparámetros a ajustar para LGBMClassifier
param_dist = {
            'n_estimators' : np.arange(20,100,5),
            'feature_fraction' : np.arange(0.1,1,0.1),
            'bagging_fraction' : np.arange(0.5,0.99,0.05),
            'num_leaves' : np.arange(30,100,10),
            'learning_rate' : np.arange(0.01,0.5,0.05),
            'max_depth' : np.arange(10,100,5),
            'min_child_samples' : np.arange(2,7,1),
            'reg_alpha' : np.arange(0,1,0.2),
            'reg_lambda' : np.arange(0,1,0.2),
            'colsample_bytree' : np.arange(0,1,0.2)
}

# Modelo base LGBM
lgbm = LGBMClassifier(random_state=42, n_jobs=1)

# Validación cruzada
cv = 3 

# RandomizedSearchCV
random_search_lgbm = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search_lgbm.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 356743, number of negative: 356975
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.109187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2006
[LightGBM] [Info] Number of data points in the train set: 713718, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499837 -> initscore=-0.000650
[LightGBM] [Info] Start training from score -0.000650


In [10]:
# Mostrar los mejores hiperparámetros encontrados para LGBMClassifier
print(random_search_lgbm.best_params_)

{'reg_lambda': 0.6000000000000001, 'reg_alpha': 0.8, 'num_leaves': 60, 'n_estimators': 90, 'min_child_samples': 2, 'max_depth': 70, 'learning_rate': 0.21000000000000002, 'feature_fraction': 0.5, 'colsample_bytree': 0.6000000000000001, 'bagging_fraction': 0.6500000000000001}


In [11]:
# Mostrar la mejor puntuación obtenida con esos hiperparámetros para LGBMClassifier
print(random_search_lgbm.best_score_)

0.7270139486757293


In [12]:
# Mostrar el mejor estimador para LGBMClassifier
print(random_search_lgbm.best_estimator_)

LGBMClassifier(bagging_fraction=0.6500000000000001,
               colsample_bytree=0.6000000000000001, feature_fraction=0.5,
               learning_rate=0.21000000000000002, max_depth=70,
               min_child_samples=2, n_estimators=90, n_jobs=1, num_leaves=60,
               random_state=42, reg_alpha=0.8, reg_lambda=0.6000000000000001)
