In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

Formatting The Data

In [2]:
df = pd.read_csv("loan_data_ex.csv")
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22,female,Master,71948,0,RENT,35000,PERSONAL,16.02,0.49,3,561,No,1
1,21,female,High School,12282,0,OWN,1000,EDUCATION,11.14,0.08,2,504,Yes,0
2,25,female,High School,12438,3,MORTGAGE,5500,MEDICAL,12.87,0.44,3,635,No,1
3,23,female,Bachelor,79753,0,RENT,35000,MEDICAL,15.23,0.44,2,675,No,1
4,24,male,Master,66135,1,RENT,35000,MEDICAL,14.27,0.53,4,586,No,1


In [3]:
df['person_education'] = df['person_education'].replace(' ','_',regex=True)
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22,female,Master,71948,0,RENT,35000,PERSONAL,16.02,0.49,3,561,No,1
1,21,female,High_School,12282,0,OWN,1000,EDUCATION,11.14,0.08,2,504,Yes,0
2,25,female,High_School,12438,3,MORTGAGE,5500,MEDICAL,12.87,0.44,3,635,No,1
3,23,female,Bachelor,79753,0,RENT,35000,MEDICAL,15.23,0.44,2,675,No,1
4,24,male,Master,66135,1,RENT,35000,MEDICAL,14.27,0.53,4,586,No,1


In [4]:
df.dtypes

person_age                          int64
person_gender                      object
person_education                   object
person_income                       int64
person_emp_exp                      int64
person_home_ownership              object
loan_amnt                           int64
loan_intent                        object
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length          int64
credit_score                        int64
previous_loan_defaults_on_file     object
loan_status                         int64
dtype: object

In [5]:
inputs = df.drop('loan_status',axis=1).copy()
inputs.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
0,22,female,Master,71948,0,RENT,35000,PERSONAL,16.02,0.49,3,561,No
1,21,female,High_School,12282,0,OWN,1000,EDUCATION,11.14,0.08,2,504,Yes
2,25,female,High_School,12438,3,MORTGAGE,5500,MEDICAL,12.87,0.44,3,635,No
3,23,female,Bachelor,79753,0,RENT,35000,MEDICAL,15.23,0.44,2,675,No
4,24,male,Master,66135,1,RENT,35000,MEDICAL,14.27,0.53,4,586,No


In [6]:
output = df['loan_status'].copy()
output.head()

0    1
1    0
2    1
3    1
4    1
Name: loan_status, dtype: int64

In [7]:
inputs_encoded = pd.get_dummies(inputs, columns=['person_gender',
                                       'person_education',
                                       'person_home_ownership',
                                       'loan_intent',
                                       'previous_loan_defaults_on_file'],dtype=int)
inputs_encoded.head()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,person_gender_female,person_gender_male,...,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_No,previous_loan_defaults_on_file_Yes
0,22,71948,0,35000,16.02,0.49,3,561,1,0,...,0,1,0,0,0,0,1,0,1,0
1,21,12282,0,1000,11.14,0.08,2,504,1,0,...,1,0,0,1,0,0,0,0,0,1
2,25,12438,3,5500,12.87,0.44,3,635,1,0,...,0,0,0,0,0,1,0,0,1,0
3,23,79753,0,35000,15.23,0.44,2,675,1,0,...,0,1,0,0,0,1,0,0,1,0
4,24,66135,1,35000,14.27,0.53,4,586,0,1,...,0,1,0,0,0,1,0,0,1,0


In [8]:
output.unique()

array([1, 0])

Now Building The Actual ML Model

In [19]:
sum(output)/len(output)

0.2454952712792434

In [10]:
#24.5% of people got loans

In [11]:
inputs_train, inputs_test, output_train, output_test = train_test_split(inputs_encoded, output, random_state = 0, stratify = output)

In [12]:
sum(output_train)/len(output_train)

0.2454533386433028

In [13]:
sum(output_test)/len(output_test)

0.24562101910828024

In [14]:
datatrain = xgb.DMatrix(inputs_train, label=output_train)
datatest = xgb.DMatrix(inputs_test, label=output_test)

parameters = {
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'seed': 0
}

model = xgb.train(parameters,datatrain,num_boost_round=1000,evals=[(datatest, 'eval')],early_stopping_rounds=10,verbose_eval=True)

[0]	eval-aucpr:0.89118
[1]	eval-aucpr:0.89984
[2]	eval-aucpr:0.90700
[3]	eval-aucpr:0.91046
[4]	eval-aucpr:0.90987
[5]	eval-aucpr:0.91580
[6]	eval-aucpr:0.91744
[7]	eval-aucpr:0.91943
[8]	eval-aucpr:0.91991
[9]	eval-aucpr:0.92141
[10]	eval-aucpr:0.92253
[11]	eval-aucpr:0.92267
[12]	eval-aucpr:0.92320
[13]	eval-aucpr:0.92217
[14]	eval-aucpr:0.92294
[15]	eval-aucpr:0.92394
[16]	eval-aucpr:0.92415
[17]	eval-aucpr:0.92365
[18]	eval-aucpr:0.92356
[19]	eval-aucpr:0.92396
[20]	eval-aucpr:0.92422
[21]	eval-aucpr:0.92326
[22]	eval-aucpr:0.92258
[23]	eval-aucpr:0.92234
[24]	eval-aucpr:0.92117
[25]	eval-aucpr:0.92168
[26]	eval-aucpr:0.92164
[27]	eval-aucpr:0.92234
[28]	eval-aucpr:0.92358
[29]	eval-aucpr:0.92381


In [18]:
output_probability = model.predict(datatest)
output_prediction = np.where(output_probability >= 0.5, 1, 0)
output_prediction

array([0, 0, 0, ..., 0, 1, 1], shape=(2512,))