# Finetunning do XGBoost

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import scipy.stats as stats

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


In [3]:
training_dataset = "data/raw/loan_data_large.csv"
test_dataset = "data/raw/loan_data_test.csv"

In [5]:
df = pd.read_csv(training_dataset)

In [21]:
pd.options.display.max_columns = None
df.loc[df['RiskScore'] == 82.0]

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,MonthlyDebtPayments,CreditCardUtilizationRate,NumberOfOpenCreditLines,NumberOfCreditInquiries,DebtToIncomeRatio,BankruptcyHistory,LoanPurpose,PreviousLoanDefaults,PaymentHistory,LengthOfCreditHistory,SavingsAccountBalance,CheckingAccountBalance,TotalAssets,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
21234,2076-02-20 00:00:00,52,59482,498,Self-Employed,Bachelor,27,55615,60,Single,0,Mortgage,426,0.518088,0,1,0.540737,1,Home,1,24,5,827,9340,75638,73726,4956.833333,0.904017,5,1912,0.311615,0.285198,1749.099325,0.438808,0,82.0
58743,2178-11-01 00:00:00,29,19310,571,Employed,High School,5,24426,60,Married,3,Rent,420,0.382693,3,1,0.6374,1,Home,1,29,1,2999,461,12115,24546,1609.166667,0.747331,6,4658,0.243926,0.220153,674.832146,0.680372,0,82.0


In [102]:
df.shape

(100000, 36)

In [103]:
def clean_dataset(dataset, features_to_remove):
    dataset = dataset.drop(columns=features_to_remove)
    dataset = dataset.dropna()
    dataset['EducationLevel'] = dataset['EducationLevel'].astype('category').cat.codes
    dataset['EmploymentStatus'] = dataset['EmploymentStatus'].astype('category').cat.codes
    dataset['HomeOwnershipStatus'] = dataset['HomeOwnershipStatus'].astype('category').cat.codes
    return dataset

In [104]:
columns_to_remove = ['ApplicationDate', 'LoanPurpose', 'MaritalStatus', 'LoanApproved', 'CreditCardUtilizationRate']
df = clean_dataset(df, columns_to_remove)


19

In [106]:
df['JobTenure'].unique()

array([ 7,  8,  6,  4,  5,  1,  3,  9,  2, 10, 12,  0, 11, 13, 15, 17, 14,
       16, 22])

In [52]:
X = df.drop(columns=['RiskScore'], axis=1)
y = df['RiskScore']

In [53]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Otimizando o modelo

In [54]:
xgboost_model = XGBRegressor()

In [55]:
# número de folds
k = 5
mape_scores = []
kf = KFold(n_splits=k, shuffle=True, random_state=42)

for train_index, val_index in kf.split(X):
  x_train, x_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  xgboost_model.fit(x_train, y_train)
  y_pred = xgboost_model.predict(x_val)

  mape = mean_absolute_percentage_error(y_val, y_pred)
  mape_scores.append(mape)

In [56]:
mape_scores

[0.03410368425942694,
 0.034466199832982235,
 0.03499306452660847,
 0.03436880971112577,
 0.03448691258773315]

Os valores estão muito próximos para cada fold, o que é um bom sinal. Vamos calcular a média:

In [57]:
mape_mean = np.mean(mape_scores)
print(f'MAPE médio: {mape_mean}')

MAPE médio: 0.03448373418357532


### Separando 10% para o teste final

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [59]:
xgboost_model = XGBRegressor()

In [60]:
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
mape_scores = []

for train_index, val_index in kf.split(X_train, y_train):
  x_train, x_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  xgboost_model.fit(x_train, y_train)
  y_pred = xgboost_model.predict(x_val)

  mape = mean_absolute_percentage_error(y_val, y_pred)
  mape_scores.append(mape)

In [61]:
print(mape_scores)
mape_mean = np.mean(mape_scores)
print(f'MAPE médio: {mape_mean}')

[0.03476204375292032, 0.03458391978204043, 0.03484683194015244, 0.03468081104802604, 0.034442971482559315]
MAPE médio: 0.03466331560113971


In [62]:
y_pred_2 = xgboost_model.predict(X_test)

In [63]:
mape_test = mean_absolute_percentage_error(y_test, y_pred_2)
mape_test

0.02905169010368337

In [64]:
def evaluate_real_data(model):
  df_test = pd.read_csv(test_dataset)
  df_test = clean_dataset(df_test, columns_to_remove)
  X = df_test.drop(columns=['RiskScore'])
  y = df_test['RiskScore']

  X = scaler.transform(X)

  _, X_test, _, y_test = train_test_split(X, y, test_size=0.9, random_state=42)
  y_pred = xgboost_model.predict(X_test)

  mse = mean_squared_error(y_test, y_pred, squared=True)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(y_test, y_pred)
  mape = mean_absolute_percentage_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}\nR2 Score: {r2}')

In [65]:
evaluate_real_data(xgboost_model)

MSE: 5.525745966072004
RMSE: 2.3506905296257106
MAE: 1.6182307867262096
MAPE: 0.03398557930384665
R2 Score: 0.9070434276594164




# Ajuste de hiperparâmetros

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## GridSearch

In [67]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1]
}

In [68]:
# Create the XGBoost model object
xgboost = XGBRegressor()

In [69]:
# Create the GridSearchCV object
grid_search = GridSearchCV(xgboost, param_grid)

In [70]:
grid_search.fit(X_train, y_train)

In [71]:
df_results = pd.DataFrame(grid_search.cv_results_)
df_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.208002,0.024082,0.003227,0.000597,0.1,3,0.5,"{'learning_rate': 0.1, 'max_depth': 3, 'subsam...",0.881482,0.877443,0.876749,0.881352,0.878235,0.879052,0.001988,8
1,0.213776,0.030264,0.003766,0.000425,0.1,3,0.7,"{'learning_rate': 0.1, 'max_depth': 3, 'subsam...",0.881657,0.878325,0.874499,0.881489,0.879541,0.879102,0.002616,7
2,0.2165,0.024947,0.003819,0.001137,0.1,3,1.0,"{'learning_rate': 0.1, 'max_depth': 3, 'subsam...",0.880983,0.876708,0.875156,0.879506,0.877938,0.878058,0.002045,9
3,0.293513,0.021668,0.00682,0.005159,0.1,5,0.5,"{'learning_rate': 0.1, 'max_depth': 5, 'subsam...",0.903015,0.899024,0.898123,0.902564,0.901384,0.900822,0.001933,4
4,0.270904,0.018367,0.004094,0.00048,0.1,5,0.7,"{'learning_rate': 0.1, 'max_depth': 5, 'subsam...",0.901853,0.897485,0.897456,0.900949,0.900232,0.899595,0.001809,5
5,0.261319,0.010522,0.004318,0.000378,0.1,5,1.0,"{'learning_rate': 0.1, 'max_depth': 5, 'subsam...",0.901712,0.897555,0.896932,0.901433,0.900012,0.899529,0.001963,6
6,0.413592,0.007335,0.005992,0.000222,0.1,7,0.5,"{'learning_rate': 0.1, 'max_depth': 7, 'subsam...",0.909621,0.906684,0.906478,0.909767,0.90898,0.908306,0.001434,2
7,0.431431,0.009416,0.005652,0.000224,0.1,7,0.7,"{'learning_rate': 0.1, 'max_depth': 7, 'subsam...",0.909689,0.907052,0.907438,0.909823,0.909361,0.908673,0.001182,1
8,0.442722,0.038303,0.006357,0.000692,0.1,7,1.0,"{'learning_rate': 0.1, 'max_depth': 7, 'subsam...",0.908267,0.906597,0.905988,0.909741,0.9082,0.907758,0.001332,3
9,0.193832,0.016738,0.003226,0.000746,0.01,3,0.5,"{'learning_rate': 0.01, 'max_depth': 3, 'subsa...",0.470845,0.470607,0.463832,0.469873,0.463457,0.467723,0.003348,16


In [72]:
df_results.query('rank_test_score == 1')['params']

7    {'learning_rate': 0.1, 'max_depth': 7, 'subsam...
Name: params, dtype: object

In [73]:
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.7}

## RandomSearch

In [74]:
param_dist = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.01, 0.1),
    'subsample': stats.uniform(0.5, 0.5),
    'n_estimators':stats.randint(50, 200)
}

In [75]:
random_search = RandomizedSearchCV(xgboost, param_dist, random_state=42)

In [76]:
random_search.fit(X_train, y_train)

In [77]:
# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

Best set of hyperparameters:  {'learning_rate': 0.06986584841970366, 'max_depth': 9, 'n_estimators': 171, 'subsample': 0.5779972601681014}
Best score:  0.9132259452581447


## Validação cruzada com 10 folds

In [78]:
random_search = RandomizedSearchCV(xgboost, param_dist, cv=10, random_state=42)
random_search.fit(X_train, y_train)

print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

Best set of hyperparameters:  {'learning_rate': 0.06986584841970366, 'max_depth': 9, 'n_estimators': 171, 'subsample': 0.5779972601681014}
Best score:  0.9142250870765996
