In [1]:
# Dataset
from sklearn import datasets

# Data processing
import pandas as pd
import numpy as np

# Standardize the data
from sklearn.preprocessing import StandardScaler

# Model and performace evaluation
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support as score

# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval

%reload_ext watermark
%watermark -iv -v

Python implementation: CPython
Python version       : 3.12.2
IPython version      : 8.21.0

numpy   : 1.26.4
pandas  : 2.2.0
sklearn : 1.4.1.post1
hyperopt: 0.2.7



In [2]:
# Dicionário dos Meses.

meses = {1:  'Jan',
         2:  'Fev',
         3:  'Mar',
         4:  'Abr',
         5:  'Mai',
         6:  'Jun',
         7:  'Jul',
         8:  'Ago',
         9:  'Set',
         10: 'Out',
         11: 'Nov',
         12: 'Dez'}

In [10]:
# Load the breast cancer dataset
df = pd.read_feather('./Dados/credit_scoring.ftr')

# Tratando os Dados

df.drop(columns='index', inplace=True)
df.set_index(keys='data_ref', inplace=True)
df.fillna({'tempo_emprego': -1}, inplace=True)
df.index = df.index.month.map(meses) + '_' + df.index.year.astype(str)

df.replace({'tipo_renda': {'Bolsista': 'Servidor_Bolsista', 'Servidor público': 'Servidor_Bolsista'}}, inplace=True)
df.replace({'estado_civil': {'Solteiro': 'Solteiro_Separado_Viúvo', 'Separado': 'Solteiro_Separado_Viúvo', 'Viúvo': 'Solteiro_Separado_Viúvo', 'Casado': 'Casado/União', 'União': 'Casado/União'}}, inplace=True)
df.replace({'educacao': {'Fundamental':'Fundamental_Médio', 'Médio': 'Fundamental_Médio', 'Pós graduação': 'Superior_Pós', 'Superior completo': 'Superior_Pós', 'Superior incompleto': 'Superior_Pós'}}, inplace=True)
df.replace({'qt_pessoas_residencia': {7: 6}}, inplace=True)

# Check the data information
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, Jan_2015 to Mar_2016
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   sexo                   750000 non-null  object 
 1   posse_de_veiculo       750000 non-null  object 
 2   posse_de_imovel        750000 non-null  object 
 3   qtd_filhos             750000 non-null  int64  
 4   tipo_renda             750000 non-null  object 
 5   educacao               750000 non-null  object 
 6   estado_civil           750000 non-null  object 
 7   tipo_residencia        750000 non-null  object 
 8   idade                  750000 non-null  int64  
 9   tempo_emprego          750000 non-null  float64
 10  qt_pessoas_residencia  750000 non-null  float64
 11  renda                  750000 non-null  float64
 12  mau                    750000 non-null  bool   
dtypes: bool(1), float64(3), int64(2), object(7)
memory usage: 75.1+ MB


In [7]:
# Check the target value distribution
df['mau'].value_counts(normalize=True)

mau
False    0.921809
True     0.078191
Name: proportion, dtype: float64

In [17]:
df = pd.get_dummies(df, dtype=int)

In [18]:
#Definindo o DataFrame de Treino e de Test(Validação).

df_train = df.loc[~df.index.isin(['Jan_2016', 'Fev_2016', 'Mar_2016'])]
df_test = df.loc[df.index.isin(['Jan_2016', 'Fev_2016', 'Mar_2016'])]

X_train = df_train[['renda', 'tempo_emprego', 'idade', 'posse_de_imovel_S']]
y_train = df_train['mau']

X_test = df_test[['renda', 'tempo_emprego', 'idade', 'posse_de_imovel_S']]
y_test = df_test['mau']

In [19]:
# Check the number of records in training and testing dataset
print(f'The training dataset has {len(X_train)} records.')
print(f'The training dataset has {len(X_test)} records.')

The training dataset has 600000 records.
The training dataset has 150000 records.


In [20]:
# Initiate scaler
sc = StandardScaler()

# StandardScaler the training dataset
X_train_transformed = pd.DataFrame(sc.fit_transform(X_train), index=X_train.index, columns=X_train.columns)

# StandardScaler the testing dataset
X_test_transformed = pd.DataFrame(sc.fit_transform(X_test), index=X_test.index, columns=X_test.columns)

# Summary statistics after standardization
X_train_transformed.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
renda,600000.0,8.952838e-17,1.000001,-0.309701,-0.2581,-0.198394,-0.064021,46.427848
tempo_emprego,600000.0,-1.389348e-16,1.000001,-1.05127,-0.736229,-0.229629,0.403423,5.291518
idade,600000.0,4.015751e-17,1.000001,-1.940986,-0.872481,-0.071103,0.819318,2.154948
posse_de_imovel_S,600000.0,7.896498000000001e-17,1.000001,-1.430566,-1.430566,0.699024,0.699024,0.699024


In [21]:
# Summary statistics before standardization
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
renda,600000.0,27222.844732,87377.875997,161.86,4670.6,9887.58,21628.87,4083986.0
tempo_emprego,600000.0,6.277236,6.922332,-1.0,1.180822,4.687671,9.069863,42.90685
idade,600000.0,43.79853,11.230659,22.0,34.0,43.0,53.0,68.0
posse_de_imovel_S,600000.0,0.671757,0.469574,0.0,0.0,1.0,1.0,1.0


In [22]:
# Initiate XGBoost Classifier
xgboost = XGBClassifier()

# Print default setting
xgboost.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [23]:
# Train the model
xgboost = XGBClassifier(seed=0).fit(X_train_transformed, y_train)

# Make prediction
xgboost_predict = xgboost.predict(X_test_transformed)

# Get predicted probability
xgboost_predict_prob = xgboost.predict_proba(X_test_transformed)[:,1]

In [24]:
# Get performance metrics
precision, recall, fscore, support = score(y_test, xgboost_predict)

# Print result
print(f'The recall value for the baseline xgboost model is {recall[1]:.2%}')

The recall value for the baseline xgboost model is 7.66%


In [25]:
# Define the search space
param_grid = {
    # Percentage of columns to be randomly samples for each tree.
    "colsample_bytree": [ 0.3, 0.5, 0.8 ],
    # reg_alpha provides 11 regularization to the weight, higher values result in more conservative models
    "reg_alpha": [ 0, 0.5, 1, 5 ],
    # reg_lambda provides 12 regularization to the weight, higher values result in more conservative models
    "reg_lambda": [ 0, 0.5, 1, 5]
}

# Set up score
scoring = ['recall']

# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [63]:
# Define grid search
grid_search = GridSearchCV(estimator=xgboost,
                           param_grid=param_grid,
                           scoring=scoring,
                           refit='recall',
                           n_jobs=-1,
                           cv=kfold,
                           verbose=0)

# Fit grid search
grid_result = grid_search.fit(X_train_transformed, y_train)

# Print grid search summary
grid_result

In [27]:
# Print the best score and the corresponding hyperparameters
print(f'The best score is {grid_result.best_score_:.4f}')
print(f'The best score standard deviation is {grid_result.cv_results_['std_test_recall'][grid_result.best_index_]:.4f}')
print(f'The best hyperparameters are {grid_result.best_params_}')

The best score is 0.0081
The best score standard deviation is 0.0010
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 0, 'reg_lambda': 1}


In [28]:
# Make prediction using the best model
grid_predict = grid_search.predict(X_test_transformed)

# Get predicted probabilities
grid_predict_prob = grid_search.predict_proba(X_test_transformed)[:,1]

# Get performance metrics
precision, recall, fscore, support = score(y_test, grid_predict)

# Print result
print(f'The recall value for the xgboost grid search is {recall[1]:.2%}')

The recall value for the xgboost grid search is 7.14%


In [62]:
# Define the search space
param_grid = {
    # Learning rate shrinks the weights to make the boosting process more conservative
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1],
    # Maximum depth of the tree, increasing it increases the model complexity.
    "max_depth": range(3, 31, 3),
    # Gamma specifies the minumum loss reduction required to make a split.
    "gamma": [i/10.0 for i in range(0,5)],
    # Percentage of columns to be randomly samples for each tree.
    "colsample_bytree": [i/10.0 for i in range(3,10)],
    # reg_alpha provides 11 regularization to the weight, higher values result in more conservative models
    "reg_alpha": [ 1e-5, 1e-2, 0.1, 1, 10, 100 ],
    # reg_lambda provides 12 regularization to the weight, higher values result in more conservative models
    "reg_lambda": [ 1e-5, 1e-2, 0.1, 1, 10, 100 ]}

# Set up score
scoring = ['recall']

# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [59]:
# Define random search
random_search = RandomizedSearchCV(estimator=xgboost,
                                   param_distributions=param_grid,
                                   n_iter=48,
                                   scoring=scoring,
                                   refit='recall',
                                   n_jobs=-1,
                                   cv=kfold,
                                   verbose=0)

# Fit grid search
random_result = random_search.fit(X_train_transformed, y_train)

# Print grid search summary
random_result

In [60]:
# Print the best score and the corresponding hyperparameters
print(f'The best score is {random_result.best_score_:.4%}')
print(f'The best score standard deviation is {random_result.cv_results_['std_test_recall'][random_result.best_index_]:.2f}')
print(f'The best hyperparameters are {random_result.best_params_}')

The best score is 2.8955%
The best score standard deviation is 0.00
The best hyperparameters are {'reg_lambda': 0.1, 'reg_alpha': 1e-05, 'max_depth': 30, 'learning_rate': 1, 'gamma': 0.4, 'colsample_bytree': 0.7}


In [61]:
# Make prediction using the best model
random_predict = random_search.predict(X_test_transformed)

# Get predicted probabilities
random_predict_prob = random_search.predict_proba(X_test_transformed)[:,1]

# Get performance metrics
precision, recall, fscore, support = score(y_test, random_predict)

# Print result
print(f'The recall value for the xgboost grid search is {recall[1]:.2%}')

The recall value for the xgboost grid search is 10.25%
