# Finetunning do XGBoost

In [197]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


In [157]:
training_dataset = "data/raw/loan_data_large.csv"
test_dataset = "data/raw/loan_data_test.csv"

In [158]:
df = pd.read_csv(training_dataset)

In [159]:
df.head()

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01 00:00:00,61,40100,607,Employed,High School,39,15442,24,Widowed,...,3341.666667,0.895582,7,3561,0.186942,0.214766,797.117108,0.375895,0,50.0
1,2018-01-02 00:00:00,45,196328,662,Employed,Master,21,18447,108,Widowed,...,16360.666667,0.828974,8,8735,0.232447,0.185958,352.913372,0.037707,1,44.8
2,2018-01-03 00:00:00,41,69929,500,Employed,Master,14,55248,24,Divorced,...,5827.416667,0.883624,6,6961,0.280248,0.288369,3056.092148,0.658798,0,57.0
3,2018-01-04 00:00:00,18,178352,576,Employed,Doctorate,0,23964,60,Single,...,7870.833333,0.923136,4,580260,0.240964,0.225815,669.806992,0.10924,1,32.8
4,2018-01-05 00:00:00,36,69982,588,Self-Employed,Associate,9,15280,36,Single,...,5831.833333,0.847818,6,70933,0.20628,0.204616,571.459726,0.151146,1,47.2


In [160]:
df.shape

(100000, 36)

In [161]:
def clean_dataset(dataset, features_to_remove):
    dataset = dataset.drop(columns=features_to_remove)
    dataset = dataset.dropna()
    dataset['EducationLevel'] = dataset['EducationLevel'].astype('category').cat.codes
    dataset['EmploymentStatus'] = dataset['EmploymentStatus'].astype('category').cat.codes
    dataset['HomeOwnershipStatus'] = dataset['HomeOwnershipStatus'].astype('category').cat.codes
    return dataset

In [162]:
columns_to_remove = ['ApplicationDate', 'LoanPurpose', 'MaritalStatus', 'LoanApproved']
df = clean_dataset(df, columns_to_remove)
df

Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,NumberOfDependents,HomeOwnershipStatus,...,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,RiskScore
0,61,40100,607,0,3,39,15442,24,3,1,...,17841,3341.666667,0.895582,7,3561,0.186942,0.214766,797.117108,0.375895,50.0
1,45,196328,662,0,4,21,18447,108,0,0,...,28617,16360.666667,0.828974,8,8735,0.232447,0.185958,352.913372,0.037707,44.8
2,41,69929,500,0,4,14,55248,24,2,0,...,41550,5827.416667,0.883624,6,6961,0.280248,0.288369,3056.092148,0.658798,57.0
3,18,178352,576,0,2,0,23964,60,0,0,...,95768,7870.833333,0.923136,4,580260,0.240964,0.225815,669.806992,0.109240,32.8
4,36,69982,588,1,0,9,15280,36,3,3,...,29677,5831.833333,0.847818,6,70933,0.206280,0.204616,571.459726,0.151146,47.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,43,35498,457,0,3,22,22207,48,1,0,...,104602,2958.166667,0.964905,4,94710,0.288707,0.315079,819.178548,0.428028,54.0
99996,46,61922,508,0,3,21,10362,96,0,0,...,7607,5160.166667,0.884803,7,81172,0.291362,0.289410,278.132832,0.131223,56.0
99997,44,124415,596,0,1,23,12101,48,2,2,...,61567,10367.916667,0.782392,5,6594,0.209101,0.250921,401.856400,0.096245,40.8
99998,51,41873,623,0,0,31,19587,36,1,3,...,10506,3489.416667,0.714821,7,42662,0.193087,0.182292,710.371293,0.316778,50.0


In [163]:
X = df.drop(columns=['RiskScore'], axis=1)
y = df['RiskScore']

In [164]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Otimizando o modelo

In [165]:
xgboost_model = XGBRegressor()

In [166]:
# número de folds
k = 5
mape_scores = []
kf = KFold(n_splits=k, shuffle=True, random_state=42)

for train_index, val_index in kf.split(X):
  x_train, x_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  xgboost_model.fit(x_train, y_train)
  y_pred = xgboost_model.predict(x_val)

  mape = mean_absolute_percentage_error(y_val, y_pred)
  mape_scores.append(mape)

In [167]:
mape_scores

[0.029783314730107417,
 0.030529115814823524,
 0.030803860765414096,
 0.030527437453361454,
 0.030294396830117528]

Os valores estão muito próximos para cada fold, o que é um bom sinal. Vamos calcular a média:

In [168]:
mape_mean = np.mean(mape_scores)
print(f'MAPE médio: {mape_mean}')

MAPE médio: 0.030387625118764805


### Separando 10% para o teste final

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [170]:
xgboost_model = XGBRegressor()

In [171]:
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
mape_scores = []

for train_index, val_index in kf.split(X_train, y_train):
  x_train, x_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  xgboost_model.fit(x_train, y_train)
  y_pred = xgboost_model.predict(x_val)

  mape = mean_absolute_percentage_error(y_val, y_pred)
  mape_scores.append(mape)

In [172]:
print(mape_scores)
mape_mean = np.mean(mape_scores)
print(f'MAPE médio: {mape_mean}')

[0.030723913846194047, 0.03057872799735719, 0.03100711062851216, 0.030425204561300517, 0.030340627784394662]
MAPE médio: 0.030615116963551715


In [173]:
y_pred_2 = xgboost_model.predict(X_test)

In [174]:
mape_test = mean_absolute_percentage_error(y_test, y_pred_2)
mape_test

0.025054074384214507

In [175]:
def evaluate_real_data(model):
  df_test = pd.read_csv(test_dataset)
  df_test = clean_dataset(df_test, columns_to_remove)
  X = df_test.drop(columns=['RiskScore'])
  y = df_test['RiskScore']

  X = scaler.transform(X)

  _, X_test, _, y_test = train_test_split(X, y, test_size=0.9, random_state=42)
  y_pred = xgboost_model.predict(X_test)

  mse = mean_squared_error(y_test, y_pred, squared=True)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(y_test, y_pred)
  mape = mean_absolute_percentage_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}\nR2 Score: {r2}')

In [176]:
evaluate_real_data(xgboost_model)

MSE: 4.883951054623298
RMSE: 2.2099663016940547
MAE: 1.3907879536240189
MAPE: 0.029663829557368064
R2 Score: 0.9178399889708132




# Ajuste de hiperparâmetros

In [199]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [200]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1]
}

In [204]:
# Create the XGBoost model object
xgboost = XGBRegressor()

In [205]:
# Create the GridSearchCV object
grid_search = GridSearchCV(xgboost, param_grid)

In [206]:
grid_search.fit(X_train, y_train)

In [210]:
df_results = pd.DataFrame(grid_search.cv_results_)
df_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.160108,0.008334,0.002872,0.000273,0.1,3,0.5,"{'learning_rate': 0.1, 'max_depth': 3, 'subsam...",0.888224,0.884006,0.884161,0.889264,0.886034,0.886338,0.002116,7
1,0.188361,0.021718,0.003103,0.000306,0.1,3,0.7,"{'learning_rate': 0.1, 'max_depth': 3, 'subsam...",0.887929,0.884122,0.882055,0.889146,0.886055,0.885861,0.002554,8
2,0.14691,0.004803,0.002731,0.000289,0.1,3,1.0,"{'learning_rate': 0.1, 'max_depth': 3, 'subsam...",0.887731,0.883178,0.883282,0.887513,0.884344,0.885209,0.002013,9
3,0.229782,0.00795,0.00396,0.000248,0.1,5,0.5,"{'learning_rate': 0.1, 'max_depth': 5, 'subsam...",0.910807,0.906254,0.906458,0.911531,0.909783,0.908967,0.002203,4
4,0.233514,0.005438,0.003825,0.000309,0.1,5,0.7,"{'learning_rate': 0.1, 'max_depth': 5, 'subsam...",0.909619,0.905396,0.906814,0.910654,0.908835,0.908263,0.001908,5
5,0.22056,0.002652,0.004002,0.00035,0.1,5,1.0,"{'learning_rate': 0.1, 'max_depth': 5, 'subsam...",0.909857,0.905064,0.904813,0.910581,0.908367,0.907736,0.002395,6
6,0.41043,0.029995,0.005363,0.00028,0.1,7,0.5,"{'learning_rate': 0.1, 'max_depth': 7, 'subsam...",0.918714,0.914188,0.91556,0.918931,0.917637,0.917006,0.001847,2
7,0.388246,0.010018,0.005795,0.000276,0.1,7,0.7,"{'learning_rate': 0.1, 'max_depth': 7, 'subsam...",0.91873,0.91443,0.917114,0.91967,0.91817,0.917623,0.001798,1
8,0.422529,0.024613,0.005356,0.000233,0.1,7,1.0,"{'learning_rate': 0.1, 'max_depth': 7, 'subsam...",0.917836,0.914661,0.914811,0.919955,0.91691,0.916834,0.001978,3
9,0.171753,0.01111,0.002532,0.000402,0.01,3,0.5,"{'learning_rate': 0.01, 'max_depth': 3, 'subsa...",0.470845,0.470607,0.463832,0.469873,0.463457,0.467723,0.003348,16


In [211]:
df_results.query('rank_test_score == 1')['params']

7    {'learning_rate': 0.1, 'max_depth': 7, 'subsam...
Name: params, dtype: object