In [146]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [119]:
# Locations of the raw CSV inputs (relative paths): the large file is used for
# fitting the models, the second one as an external evaluation set.
training_dataset, test_dataset = (
    "data/raw/loan_data_large.csv",
    "data/raw/loan_data_test.csv",
)

In [120]:
# Load the training CSV; `df` is kept as the raw, unmodified frame.
df = pd.read_csv(filepath_or_buffer=training_dataset)

In [121]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01 00:00:00,61,40100,607,Employed,High School,39,15442,24,Widowed,...,3341.666667,0.895582,7,3561,0.186942,0.214766,797.117108,0.375895,0,50.0
1,2018-01-02 00:00:00,45,196328,662,Employed,Master,21,18447,108,Widowed,...,16360.666667,0.828974,8,8735,0.232447,0.185958,352.913372,0.037707,1,44.8
2,2018-01-03 00:00:00,41,69929,500,Employed,Master,14,55248,24,Divorced,...,5827.416667,0.883624,6,6961,0.280248,0.288369,3056.092148,0.658798,0,57.0
3,2018-01-04 00:00:00,18,178352,576,Employed,Doctorate,0,23964,60,Single,...,7870.833333,0.923136,4,580260,0.240964,0.225815,669.806992,0.10924,1,32.8
4,2018-01-05 00:00:00,36,69982,588,Self-Employed,Associate,9,15280,36,Single,...,5831.833333,0.847818,6,70933,0.20628,0.204616,571.459726,0.151146,1,47.2


In [122]:
# (rows, columns) of the raw training frame.
df.shape

(100000, 36)

# Limpando os dados

In [123]:
def limpar_dados(dataset, features_to_remove, categories=None):
    """Return a cleaned copy of ``dataset`` ready for modelling.

    The function drops the columns listed in ``features_to_remove``, drops
    every row that still contains a missing value, and replaces the three
    categorical columns (``EducationLevel``, ``EmploymentStatus``,
    ``HomeOwnershipStatus``) with integer codes. The input frame itself is
    not modified.

    Args:
        dataset: pandas DataFrame that contains the three categorical columns
            after ``features_to_remove`` has been dropped.
        features_to_remove: column labels to drop; pandas raises ``KeyError``
            if one of them is not present.
        categories: optional dict mapping a categorical column name to the
            ordered list of values it may take. When a column is listed, its
            codes are the positions in that list (values outside the list
            become ``-1``), so different files encoded with the same mapping
            get identical codes. Columns that are not listed keep the
            original behaviour: codes follow the sorted order of the values
            present in *this* frame, which can differ between files that do
            not contain the same set of values.

    Returns:
        A new DataFrame with the cleaned, integer-encoded data.
    """
    categorical_columns = ('EducationLevel', 'EmploymentStatus', 'HomeOwnershipStatus')
    fixed_categories = categories or {}

    # .copy() guarantees the column assignments below act on an independent
    # frame and never on (a view of) the caller's data.
    cleaned = dataset.drop(columns=features_to_remove).dropna().copy()

    for column in categorical_columns:
        if column in fixed_categories:
            # Explicit category list -> stable codes across datasets.
            cleaned[column] = pd.Categorical(
                cleaned[column], categories=list(fixed_categories[column])
            ).codes
        else:
            cleaned[column] = cleaned[column].astype('category').cat.codes
    return cleaned

# Ignorando colunas `ApplicationDate`, `LoanPurpose`, `MaritalStatus` e `LoanApproved`

In [124]:
# Columns excluded from the feature set before cleaning.
columns_to_remove = [
    'ApplicationDate',
    'LoanPurpose',
    'MaritalStatus',
    'LoanApproved',
]

In [125]:
# Cleaned training frame; the raw `df` is left as loaded.
df_tratado = limpar_dados(dataset=df, features_to_remove=columns_to_remove)

# Separar features (X) e alvo (y), padronizar e dividir em treino/teste

In [126]:
# Target is the RiskScore column; every remaining column is a feature.
y = df_tratado['RiskScore']
X = df_tratado.drop(labels=['RiskScore'], axis='columns')

In [127]:
# Display the feature frame (pandas abbreviates the rendering of large frames).
X

Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,NumberOfDependents,HomeOwnershipStatus,...,TotalAssets,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio
0,61,40100,607,0,3,39,15442,24,3,1,...,21402,17841,3341.666667,0.895582,7,3561,0.186942,0.214766,797.117108,0.375895
1,45,196328,662,0,4,21,18447,108,0,0,...,37352,28617,16360.666667,0.828974,8,8735,0.232447,0.185958,352.913372,0.037707
2,41,69929,500,0,4,14,55248,24,2,0,...,12213,41550,5827.416667,0.883624,6,6961,0.280248,0.288369,3056.092148,0.658798
3,18,178352,576,0,2,0,23964,60,0,0,...,676028,95768,7870.833333,0.923136,4,580260,0.240964,0.225815,669.806992,0.109240
4,36,69982,588,1,0,9,15280,36,3,3,...,100610,29677,5831.833333,0.847818,6,70933,0.206280,0.204616,571.459726,0.151146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,43,35498,457,0,3,22,22207,48,1,0,...,199312,104602,2958.166667,0.964905,4,94710,0.288707,0.315079,819.178548,0.428028
99996,46,61922,508,0,3,21,10362,96,0,0,...,88779,7607,5160.166667,0.884803,7,81172,0.291362,0.289410,278.132832,0.131223
99997,44,124415,596,0,1,23,12101,48,2,2,...,68161,61567,10367.916667,0.782392,5,6594,0.209101,0.250921,401.856400,0.096245
99998,51,41873,623,0,0,31,19587,36,1,3,...,53168,10506,3489.416667,0.714821,7,42662,0.193087,0.182292,710.371293,0.316778


In [128]:
# Learn per-column mean/variance from X, then standardise X with them.
# The fitted `scaler` is reused further down for the external test CSV.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [129]:
# Display the standardised feature matrix (a NumPy array, abbreviated by NumPy).
X_scaled

array([[ 1.84404301, -0.47168191,  0.69927757, ..., -0.57896041,
        -0.18130509, -0.08424702],
       [ 0.46259898,  3.42933659,  1.76695871, ..., -1.25797643,
        -0.84490473, -1.05815399],
       [ 0.11723797,  0.27314924, -1.37784756, ...,  1.15584087,
         3.19339646,  0.73045033],
       ...,
       [ 0.37625873,  1.63366652,  0.48574134, ...,  0.27319678,
        -0.77178834, -0.88957893],
       [ 0.98064049, -0.42741003,  1.00987572, ..., -1.34436847,
        -0.31089538, -0.25449125],
       [-0.31446329, -0.1120385 , -0.75665126, ...,  1.00961294,
         0.23569528, -0.16415857]])

y

In [130]:
# Split the *unscaled* features first and fit the scaler on the training rows
# only. Fitting it on all rows before splitting lets the test rows' mean and
# variance leak into the features the models are trained on.
# Same test_size / random_state as before, so the same rows land in each part.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Testing Linear Regression

In [131]:
# Baseline: ordinary linear regression fitted on the training split
# (`fit` returns the estimator itself, so it can be chained).
linear_regression_model = LinearRegression().fit(X_train, y_train)
y_pred = linear_regression_model.predict(X_test)

In [132]:
# Evaluate the current `y_pred` against the test targets.
# `squared=True` was the default of mean_squared_error and the keyword has
# been deprecated/removed in recent scikit-learn, so it is not passed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}\nR2 Score: {r2}')

MSE: 13.492314830829692
RMSE: 3.673188646234997
MAE: 2.8552312420715027
MAPE: 0.06045850610686629
R2 Score: 0.774389170664245




# K-NN regressor

In [133]:
# k-nearest-neighbours regressor (7 neighbours, Euclidean distance).
knn_model = KNeighborsRegressor(n_neighbors=7, metric='euclidean')
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

# `squared=True` was the default of mean_squared_error and the keyword has
# been deprecated/removed in recent scikit-learn, so it is not passed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}\nR2 Score: {r2}')

MSE: 14.212926106122447
RMSE: 3.770003462348867
MAE: 2.8479977142857145
MAPE: 0.05949460148662034
R2 Score: 0.7623395179926361




# Support Vector Regression (SVR)

In [134]:
# NOTE(review): this SVR experiment is disabled and the notebook does not record
# why (possibly the training time on a dataset of this size — confirm).
# If it is re-enabled, drop `squared=True`: it is the default of
# mean_squared_error and the keyword is no longer accepted by recent scikit-learn.
#model = SVR(kernel='linear', C=100)
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)

#mse = mean_squared_error(y_test, y_pred, squared=True)
#rmse = np.sqrt(mse)
#mae = mean_absolute_error(y_test, y_pred)
#mape = mean_absolute_percentage_error(y_test, y_pred)
#print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}')

# Árvore de regressão

In [135]:
# Regression tree. A fixed random_state makes the fitted tree (and therefore
# the metrics below) reproducible across runs.
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)
y_pred = decision_tree_model.predict(X_test)

# `squared=True` was the default of mean_squared_error and the keyword has
# been deprecated/removed in recent scikit-learn, so it is not passed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}\nR2 Score: {r2}')

MSE: 13.844943999999996
RMSE: 3.720879465932751
MAE: 1.7318400000000003
MAPE: 0.036679308086056794
R2 Score: 0.7684927058765494




# XGBoost

In [136]:
# Gradient-boosted trees with XGBoost's default hyper-parameters.
xgboost_model = XGBRegressor()
xgboost_model.fit(X_train, y_train)
y_pred = xgboost_model.predict(X_test)

# `squared=True` was the default of mean_squared_error and the keyword has
# been deprecated/removed in recent scikit-learn, so it is not passed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}\nR2 Score: {r2}')

MSE: 5.128910534207009
RMSE: 2.264709812361621
MAE: 1.433706327453613
MAPE: 0.03048534771600232
R2 Score: 0.9142372696071919




# Random Forest Regressor

In [137]:
# Random forest with default hyper-parameters. A fixed random_state makes the
# bootstrap sampling / feature selection (and the metrics) reproducible.
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

# `squared=True` was the default of mean_squared_error and the keyword has
# been deprecated/removed in recent scikit-learn, so it is not passed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}\nR2 Score: {r2}')

MSE: 6.16053017376
RMSE: 2.482041533447819
MAE: 1.3671993599999999
MAPE: 0.029138602402523774
R2 Score: 0.8969871116204552




# Testar com outros dados

In [138]:
# Load the separate evaluation CSV.
df_test = pd.read_csv(filepath_or_buffer=test_dataset)

In [139]:
# Clean a fresh read of the test CSV instead of feeding `df_test` back into
# itself: re-running the cell on the already-cleaned frame would raise KeyError
# because the columns to remove are no longer there.
# NOTE(review): limpar_dados derives the integer codes from the categories
# present in the frame it receives; they line up with the training codes only
# if both files contain the same category values — confirm.
df_test = limpar_dados(pd.read_csv(test_dataset), columns_to_remove)

In [140]:
# Display the cleaned evaluation frame (pandas abbreviates large frames).
df_test

Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,NumberOfDependents,HomeOwnershipStatus,...,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,RiskScore
0,36,17021,581,0,0,12,7332,60,4,0,...,5848,1418.416667,0.911812,9,58171,0.221832,0.223311,203.884443,0.259363,54.0
1,41,49421,559,0,4,16,15713,60,3,0,...,21838,4118.416667,0.926866,7,81104,0.241213,0.227904,441.066918,0.318342,45.6
2,43,30529,587,0,3,21,63870,48,0,3,...,105717,2544.083333,0.929530,5,1541,0.265370,0.298448,2293.941651,1.155993,51.0
3,28,50218,592,0,4,8,31051,24,2,2,...,51145,4184.833333,0.953527,8,8247,0.210051,0.168041,1532.310234,0.523631,59.0
4,46,79758,595,0,3,25,16971,48,0,2,...,11461,6646.500000,0.849103,10,9773,0.214471,0.221393,535.974330,0.096588,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,26,65143,588,0,1,5,69762,48,0,2,...,31537,5428.583333,0.822208,8,28429,0.270762,0.246649,2300.088670,0.506778,55.0
29996,19,55780,555,0,0,0,28513,84,3,0,...,2623,4648.333333,0.428198,3,100442,0.276013,0.270429,759.366093,0.221879,48.0
29997,18,37733,549,0,1,0,18899,36,3,3,...,9155,3144.416667,0.799783,1,51123,0.229399,0.245787,747.215800,0.332404,67.0
29998,18,27579,479,0,4,0,7787,108,0,0,...,47294,2298.250000,0.931287,6,1219,0.313287,0.250630,182.179595,0.258971,58.0


In [141]:
# From this cell on, X, y and X_scaled refer to the external test CSV and no
# longer to the training data. The features are standardised with the scaler
# that was fitted earlier (transform only, no re-fit).
y = df_test['RiskScore']
X = df_test.drop(labels=['RiskScore'], axis='columns')
X_scaled = scaler.transform(X)

In [142]:
# Keep 90% of the external CSV as X_test / y_test for the evaluation below.
# NOTE(review): the 10% "train" part is not used for fitting anywhere in the
# visible notebook — confirm whether the split is needed at all.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.9,
    random_state=42,
)

## Testando com XGBoost

In [144]:
# Score the already-trained XGBoost model on the rows held out from the
# external test CSV (no re-fitting happens here).
y_pred = xgboost_model.predict(X_test)
# `squared=True` was the default of mean_squared_error and the keyword has
# been deprecated/removed in recent scikit-learn, so it is not passed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nMAPE: {mape}\nR2 Score: {r2}')

MSE: 4.858985213502815
RMSE: 2.2043105982376474
MAE: 1.3965230373523851
MAPE: 0.029764359735733638
R2 Score: 0.9182599755265487




# Otimizando modelo

## KFold

In [147]:
# número de folds
k = 5
mape_scores = []
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [148]:
# Start from an empty list so re-running this cell does not append a second
# set of fold scores to the previous ones.
mape_scores = []

# NOTE(review): at this point X_scaled / y hold the external test CSV (they
# were reassigned above), so the folds are drawn from that file — confirm this
# is the intended data for cross-validation.
for train_index, val_index in kf.split(X_scaled):
    x_train, x_val = X_scaled[train_index], X_scaled[val_index]
    # KFold yields integer *positions*. `y` is a pandas Series, where plain
    # `y[...]` with integers is label-based and breaks (or silently selects
    # other rows) as soon as the index is not exactly 0..n-1, e.g. after
    # dropna() removed rows. `.iloc` is always positional.
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit an unfitted model with the same hyper-parameters for each fold, so
    # the `xgboost_model` trained on the training data is not overwritten.
    fold_model = XGBRegressor(**xgboost_model.get_params())
    fold_model.fit(x_train, y_train)
    y_pred = fold_model.predict(x_val)

    mape = mean_absolute_percentage_error(y_val, y_pred)
    mape_scores.append(mape)

In [149]:
# Per-fold MAPE values collected by the K-fold loop.
mape_scores

[0.032321898364028025,
 0.03163458280303649,
 0.03307262301588185,
 0.03246669336038574,
 0.031864119226151955]