In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from scipy.stats import kstest
from scipy.stats import norm
from sklearn.pipeline import Pipeline
#model modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")

In [3]:
orig_data=pd.read_csv("datasets/loan_data.csv")

In [4]:
orig_data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [7]:
orig_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [8]:
numeric_cols={}
nonnumeric_cols={}
for i in orig_data.columns:
    if isinstance(orig_data[i][0], str):
        labeler=LabelEncoder()
        labels=labeler.fit_transform(orig_data[i])
        nonnumeric_cols[i]=labels
    else:
        numeric_cols[i]=orig_data[i]

In [9]:
dataset=pd.concat([pd.DataFrame.from_dict(nonnumeric_cols), pd.DataFrame.from_dict(numeric_cols)], axis=1)

In [10]:
#inspect multicollinearity
dataset.corr().where(abs(dataset.corr())>.79)

Unnamed: 0,person_gender,person_education,person_home_ownership,loan_intent,previous_loan_defaults_on_file,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
person_gender,1.0,,,,,,,,,,,,,
person_education,,1.0,,,,,,,,,,,,
person_home_ownership,,,1.0,,,,,,,,,,,
loan_intent,,,,1.0,,,,,,,,,,
previous_loan_defaults_on_file,,,,,1.0,,,,,,,,,
person_age,,,,,,1.0,,0.954412,,,,0.861985,,
person_income,,,,,,,1.0,,,,,,,
person_emp_exp,,,,,,0.954412,,1.0,,,,0.824272,,
loan_amnt,,,,,,,,,1.0,,,,,
loan_int_rate,,,,,,,,,,1.0,,,,


In [11]:
dataset.corr()

Unnamed: 0,person_gender,person_education,person_home_ownership,loan_intent,previous_loan_defaults_on_file,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
person_gender,1.0,-0.002267,-0.001172,-0.003107,0.000704,0.017528,0.010173,0.015078,0.010647,0.003662,-0.000367,0.011999,-0.000223,-0.000561
person_education,-0.002267,1.0,-0.00029,0.004356,-0.002669,-0.014649,0.000331,-0.012034,-0.002441,0.000336,-0.006984,-0.014576,0.012498,-0.001747
person_home_ownership,-0.001172,-0.00029,1.0,0.001828,-0.125974,-0.039008,-0.21987,-0.035884,-0.150243,0.130437,0.148933,-0.029549,-0.006421,0.233842
loan_intent,-0.003107,0.004356,0.001828,1.0,0.038469,0.032005,0.008639,0.03123,-0.001068,-0.005372,0.003269,0.030912,0.016244,-0.072158
previous_loan_defaults_on_file,0.000704,-0.002669,-0.125974,0.038469,1.0,-0.025917,0.061483,-0.029231,-0.059009,-0.1818,-0.203252,-0.02264,-0.183005,-0.543096
person_age,0.017528,-0.014649,-0.039008,0.032005,-0.025917,1.0,0.193698,0.954412,0.05075,0.013402,-0.043299,0.861985,0.178432,-0.021476
person_income,0.010173,0.000331,-0.21987,0.008639,0.061483,0.193698,1.0,0.185987,0.24229,0.00151,-0.234177,0.124316,0.035919,-0.135808
person_emp_exp,0.015078,-0.012034,-0.035884,0.03123,-0.029231,0.954412,0.185987,1.0,0.044589,0.016631,-0.039862,0.824272,0.186196,-0.020481
loan_amnt,0.010647,-0.002441,-0.150243,-0.001068,-0.059009,0.05075,0.24229,0.044589,1.0,0.146093,0.593011,0.042969,0.009074,0.107714
loan_int_rate,0.003662,0.000336,0.130437,-0.005372,-0.1818,0.013402,0.00151,0.016631,0.146093,1.0,0.125209,0.018008,0.011498,0.332005


In [12]:
dataset=dataset.drop(["person_emp_exp", "cb_person_cred_hist_length"], axis=1)

In [16]:
#test for normality
for i in dataset.columns:
    stat, p = kstest(dataset[i], 'norm', args=(dataset[i].mean(), dataset[i].std()))
    print(f'Statistic={stat:.3f}, p-value={p:.3f}')

Statistic=0.368, p-value=0.000
Statistic=0.258, p-value=0.000
Statistic=0.338, p-value=0.000
Statistic=0.174, p-value=0.000
Statistic=0.345, p-value=0.000
Statistic=0.155, p-value=0.000
Statistic=0.214, p-value=0.000
Statistic=0.135, p-value=0.000
Statistic=0.067, p-value=0.000
Statistic=0.106, p-value=0.000
Statistic=0.059, p-value=0.000
Statistic=0.481, p-value=0.000


In [17]:
#Standardize X values
scaler=StandardScaler()
scaled_dataset=pd.DataFrame(scaler.fit_transform(dataset))
scaled_dataset.columns=list(dataset.columns)

In [18]:
X=scaled_dataset.drop("loan_status", axis=1)
y=dataset["loan_status"]

In [21]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=.30, random_state=42)

In [22]:
def classifier_scoring (X_train, X_test, y_train, y_test):
	classifiers = [
	RandomForestClassifier(random_state=42),
	RidgeClassifier(random_state=42),
	GradientBoostingClassifier(random_state=42),
	LogisticRegression(random_state=42)
	]
	
	scores=[]
	for i in classifiers:
		pipe = Pipeline(steps=[('classifier', i)])
		pi=pipe.fit(X_train, y_train)
		scores.append(cross_val_score(pi, X_test, y_test, cv=5,
		verbose=True).mean())
	return ["Random Forest Score {}".format(scores[0]), "RidgeClassifier Score {}".format(scores[1]), "Gradient Boosting Score {}".format(scores[2]),
	"Logistic Regression Score {}".format(scores[3])]

In [23]:
#get best classifiers by score results. 
#raise exception if number of classifiers do not match number of score results
try:
	score_results=classifier_scoring(X_train, X_test, y_train, y_test)
	print("Best Scoring Classifier: {}".format(sorted(score_results, reverse=False)[0]))
except IndexError:
	print("No Results. Number of classifers need to match number of scores to return!")
else:
	pass

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.1s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


Best Scoring Classifier: Gradient Boosting Score 0.9191851851851853


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.0s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [32]:
gb=GradientBoostingClassifier(n_estimators=200, max_depth=3, min_samples_split=2,
                              min_samples_leaf=1, subsample=.9, random_state=42)

In [33]:
gb.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,200
,subsample,0.9
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [34]:
predictions=gb.predict(X_test)

In [35]:
accuracy_score(predictions, y_test)

0.9262222222222222

In [36]:
import joblib
joblib.dump(gb, 'loandata.pkl')

['loandata.pkl']