In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import pickle

In [2]:
# Load the leftover records used to test the saved models.
# The path is relative to the notebook's working directory.
leftovers_csv = "../resources/leftovers_to_test.csv"
df = pd.read_csv(leftovers_csv)

In [3]:
# Target dtype for every column that is cast before encoding.
# Columns cast to 'str' are treated as categories by the later one-hot step;
# the loan amount and income columns stay numeric.
retypes = dict(
    state_code='str',
    loan_type='str',
    loan_amount_000s='int64',
    action_taken='int8',
    applicant_ethnicity='str',
    co_applicant_ethnicity='str',
    applicant_race_1='str',
    co_applicant_race_1='str',
    applicant_sex='str',
    co_applicant_sex='str',
    applicant_income_000s='int64',
)

In [4]:
# Drop state code 78: it is not a FIPS code present in our dataset, and it was
# not part of the training or initial test of our models.
# The comparison is against the integer 78, so this must run before the
# columns are cast to strings.
keep_rows = df['state_code'] != 78
df = df[keep_rows]

In [5]:
# Cast each column to the dtype listed in `retypes`.
df = df.astype(dtype=retypes)

In [6]:
# Feature frame: everything except the label ('action_taken') and the
# leftover index column written by a previous CSV export ('Unnamed: 0').
X = df.drop(['action_taken', 'Unnamed: 0'], axis=1)
X

Unnamed: 0,loan_type,loan_amount_000s,state_code,applicant_ethnicity,co_applicant_ethnicity,applicant_race_1,co_applicant_race_1,applicant_sex,co_applicant_sex,applicant_income_000s
0,1,144,2,2,2,5,5,1,2,154
1,1,300,2,2,5,5,8,1,5,119
2,1,263,2,1,5,5,8,1,5,141
3,1,187,2,2,5,5,8,1,5,76
4,1,361,2,2,2,1,5,2,1,107
...,...,...,...,...,...,...,...,...,...,...
14258661,1,76,4,2,2,5,5,2,1,80
14258662,1,195,48,2,2,5,5,2,1,45
14258663,3,480,51,2,2,5,5,2,1,225
14258664,1,627,24,2,5,2,8,1,5,191


In [7]:
# One-hot encode the string-typed columns; the numeric loan amount and income
# columns pass through unchanged (see the printed column index).
# NOTE(review): the pickled models are scored on this frame directly, so its
# columns are assumed to match the layout used at training time. Confirm this if
# the test data could be missing a category.
X_dummies = pd.get_dummies(data=X)
print(X_dummies.columns)
X_dummies

Index(['loan_amount_000s', 'applicant_income_000s', 'loan_type_1',
       'loan_type_2', 'loan_type_3', 'loan_type_4', 'state_code_1',
       'state_code_10', 'state_code_11', 'state_code_12', 'state_code_13',
       'state_code_15', 'state_code_16', 'state_code_17', 'state_code_18',
       'state_code_19', 'state_code_2', 'state_code_20', 'state_code_21',
       'state_code_22', 'state_code_23', 'state_code_24', 'state_code_25',
       'state_code_26', 'state_code_27', 'state_code_28', 'state_code_29',
       'state_code_30', 'state_code_31', 'state_code_32', 'state_code_33',
       'state_code_34', 'state_code_35', 'state_code_36', 'state_code_37',
       'state_code_38', 'state_code_39', 'state_code_4', 'state_code_40',
       'state_code_41', 'state_code_42', 'state_code_44', 'state_code_45',
       'state_code_46', 'state_code_47', 'state_code_48', 'state_code_49',
       'state_code_5', 'state_code_50', 'state_code_51', 'state_code_53',
       'state_code_54', 'state_code_55', 's

Unnamed: 0,loan_amount_000s,applicant_income_000s,loan_type_1,loan_type_2,loan_type_3,loan_type_4,state_code_1,state_code_10,state_code_11,state_code_12,...,co_applicant_race_1_2,co_applicant_race_1_3,co_applicant_race_1_4,co_applicant_race_1_5,co_applicant_race_1_8,applicant_sex_1,applicant_sex_2,co_applicant_sex_1,co_applicant_sex_2,co_applicant_sex_5
0,144,154,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,300,119,1,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
2,263,141,1,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
3,187,76,1,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
4,361,107,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14258661,76,80,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
14258662,195,45,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
14258663,480,225,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
14258664,627,191,1,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1


In [8]:
# Label vector: the 'action_taken' column of the cleaned frame.
y = df.loc[:, 'action_taken']

In [9]:
# Set some individual examples like we'd get from our webform.
# Each list is one already-encoded feature row: two leading numeric values
# followed by 0/1 indicator flags. The three rows differ only in those two
# leading values; the indicator flags are identical.
# NOTE(review): the positions are assumed to follow the X_dummies column order
# (loan_amount_000s, applicant_income_000s, then the one-hot columns) - confirm
# against the column list the models were trained on.
# High Loan Low Income
form_info_1=[200,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0]
# Low Loan High Income
form_info_2=[10,200,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0]
# In the Middle
form_info_3=[200,100,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0]

In [10]:
# Test LogisticRegression vs dataset: load the pickled classifier and score it
# on the encoded features and labels.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded
# (the previous bare open() call left it open).
with open('../resources/lr_classifier.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

lr_result = loaded_model.score(X_dummies, y)
print(lr_result)



0.8907578436756964


In [11]:
# Test Logistic Regression vs extremes: run the three hand-built webform
# examples through the loaded model.
# np.newaxis turns each flat feature list into a single-row 2-D array so it is
# passed to predict / predict_proba as a batch of one.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# High Loan Low Income
lr_result1 = loaded_model.predict(model_input1)
lr_prob1 = loaded_model.predict_proba(model_input1)

# Low Loan High Income
lr_result2 = loaded_model.predict(model_input2)
lr_prob2 = loaded_model.predict_proba(model_input2)

# In the Middle
lr_result3 = loaded_model.predict(model_input3)
lr_prob3 = loaded_model.predict_proba(model_input3)

print(f"Logistic Regression High Loan Low Income: {lr_result1}")
print(f"Logistic Regression Predict Probability: {lr_prob1}")
print(f"Logistic Regression Low Loan High Income: {lr_result2}")
print(f"Logistic Regression Predict Probability: {lr_prob2}")
print(f"Logistic Regression Middle: {lr_result3}")
# Fixed label: this line prints the probabilities, not a second prediction.
print(f"Logistic Regression Predict Probability: {lr_prob3}")

Logistic Regression High Loan Low Income: [0]
Logistic Regression Predict Probability: [[0.51601937 0.48398063]]
Logistic Regression Low Loan High Income: [1]
Logistic Regression Predict Probability: [[0. 1.]]
Logistic Regression Middle: [1]
Logistic Regression Middle: [[3.87990307e-09 9.99999996e-01]]


In [12]:
# Test SVC vs dataset: load the pickled classifier and score it on the encoded
# features and labels.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded.
with open('../resources/svc_classifier.pkl', 'rb') as model_file:
    loaded_model3 = pickle.load(model_file)

svc_result = loaded_model3.score(X_dummies, y)
print(svc_result)



0.8908873790532668


In [13]:
# SVC vs examples: run the three hand-built webform examples through the
# loaded SVC model.
# np.newaxis turns each flat feature list into a single-row 2-D array.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# Only class predictions are collected for this model; predict_proba is not
# called here.
# NOTE(review): presumably the pickled SVC was fit without probability
# estimates enabled - confirm before adding predict_proba calls.
svc_result1 = loaded_model3.predict(model_input1)
svc_result2 = loaded_model3.predict(model_input2)
svc_result3 = loaded_model3.predict(model_input3)

print(
    f"SVC High Loan Low Income: {svc_result1}",
    f"SVC Low Loan High Income: {svc_result2}",
    f"SVC Middle: {svc_result3}",
    sep="\n",
)


SVC High Loan Low Income: [0]
SVC Low Loan High Income: [1]
SVC Middle: [1]


In [None]:
# Test AdaBoostClassifier vs dataset: load the pickled classifier and score it
# on the encoded features and labels.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded.
with open('../resources/ab_classifier.pkl', 'rb') as model_file:
    loaded_model2 = pickle.load(model_file)

ab_result = loaded_model2.score(X_dummies, y)
print(ab_result)



In [None]:
# Ada Boost vs examples: run the three hand-built webform examples through the
# loaded AdaBoost model.
# np.newaxis turns each flat feature list into a single-row 2-D array so it is
# passed to predict / predict_proba as a batch of one.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# High Loan Low Income
ab_result1 = loaded_model2.predict(model_input1)
ab_prob1 = loaded_model2.predict_proba(model_input1)

# Low Loan High Income
ab_result2 = loaded_model2.predict(model_input2)
ab_prob2 = loaded_model2.predict_proba(model_input2)

# In the Middle
ab_result3 = loaded_model2.predict(model_input3)
ab_prob3 = loaded_model2.predict_proba(model_input3)

print(f"Ada Boost High Loan Low Income: {ab_result1}")
print(f"Ada Boost Predict Probability: {ab_prob1}")
print(f"Ada Boost Low Loan High Income: {ab_result2}")
print(f"Ada Boost Predict Probability: {ab_prob2}")
print(f"Ada Boost Middle: {ab_result3}")
# Fixed label: this line prints the probabilities, not a second prediction.
print(f"Ada Boost Predict Probability: {ab_prob3}")


In [None]:
# Test Random Forest vs dataset: load the pickled classifier and score it on
# the encoded features and labels.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded.
with open('../resources/rf_classifier.pkl', 'rb') as model_file:
    loaded_model5 = pickle.load(model_file)

rf_result = loaded_model5.score(X_dummies, y)
print(rf_result)

In [None]:
# Test Random Forest vs examples: run the three hand-built webform examples
# through the loaded Random Forest model.
# np.newaxis turns each flat feature list into a single-row 2-D array so it is
# passed to predict / predict_proba as a batch of one.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# High Loan Low Income
rf_result1 = loaded_model5.predict(model_input1)
rf_prob1 = loaded_model5.predict_proba(model_input1)

# Low Loan High Income
rf_result2 = loaded_model5.predict(model_input2)
rf_prob2 = loaded_model5.predict_proba(model_input2)

# In the Middle
rf_result3 = loaded_model5.predict(model_input3)
rf_prob3 = loaded_model5.predict_proba(model_input3)

print(f"Random Forest High Loan Low Income: {rf_result1}")
print(f"Random Forest Predict Probability: {rf_prob1}")
print(f"Random Forest Low Loan High Income: {rf_result2}")
print(f"Random Forest Predict Probability: {rf_prob2}")
print(f"Random Forest Middle: {rf_result3}")
# Fixed label: this line prints the probabilities, not a second prediction.
print(f"Random Forest Predict Probability: {rf_prob3}")

In [None]:
# Test KNClassifier vs dataset: load the pickled classifier and score it on the
# encoded features and labels.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded.
with open('../resources/kn_classifier.pkl', 'rb') as model_file:
    loaded_model4 = pickle.load(model_file)

kn_result = loaded_model4.score(X_dummies, y)
print(kn_result)

In [None]:
# Test KNClassifier vs examples: run the three hand-built webform examples
# through the loaded K-neighbors model.
# np.newaxis turns each flat feature list into a single-row 2-D array so it is
# passed to predict / predict_proba as a batch of one.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# High Loan Low Income
kn_result1 = loaded_model4.predict(model_input1)
kn_prob1 = loaded_model4.predict_proba(model_input1)

# Low Loan High Income
kn_result2 = loaded_model4.predict(model_input2)
kn_prob2 = loaded_model4.predict_proba(model_input2)

# In the Middle
kn_result3 = loaded_model4.predict(model_input3)
kn_prob3 = loaded_model4.predict_proba(model_input3)

print(f"KNClassifier High Loan Low Income: {kn_result1}")
print(f"KNClassifier Predict Probability: {kn_prob1}")
print(f"KNClassifier Low Loan High Income: {kn_result2}")
print(f"KNClassifier Predict Probability: {kn_prob2}")
print(f"KNClassifier Middle: {kn_result3}")
# Fixed label: this line prints the probabilities, not a second prediction.
print(f"KNClassifier Predict Probability: {kn_prob3}")

In [None]:
# Side-by-side summary of the dataset scores computed in the cells above,
# printed one model per line.
print(
    f'Logistic Regression: {lr_result}',
    f'SVC: {svc_result}',
    f'K Neighbors: {kn_result}',
    f'Ada Boost: {ab_result}',
    f'Random Forest: {rf_result}',
    sep='\n',
)


In [None]:
# How much agreement on the middle?
# Print every model's prediction for the "In the Middle" example, plus its
# class probabilities where they were computed (none were for the SVC).
print(f"Random Forest Middle: {rf_result3}")
print(f"Random Forest Prob Middle: {rf_prob3}")

print(f"KNClassifier Middle: {kn_result3}")
print(f"KNClassifier Prob Middle: {kn_prob3}")

print(f"Ada Boost Middle: {ab_result3}")
# Fixed label: ab_prob3 holds the Ada Boost probabilities; the previous label
# attributed them to KNClassifier.
print(f"Ada Boost Prob Middle: {ab_prob3}")

print(f"SVC Middle: {svc_result3}")

print(f"Logistic Regression Middle: {lr_result3}")
print(f"Logistic Regression Prob Middle: {lr_prob3}")