In [54]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Load in data
train_df_X = pd.read_csv(Path('Resources/2019loans.csv'))
test_df_y = pd.read_csv(Path('Resources/2020Q1loans.csv'))
print(train_df_X.columns)

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

In [28]:
print(test_df_y.columns)

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

In [29]:
train_df_X.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [30]:
test_df_y.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [32]:
# drop the unname column
train_df_X = train_df_X.drop('Unnamed: 0', axis = 1).set_index('index')
test_df_y = test_df_y.drop('Unnamed: 0', axis = 1).set_index('index')

In [33]:
train_df_X.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,29.99,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,11.26,2.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,11.28,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,18.08,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,27.77,0.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


#### Convert categorical data to numeric and separate target feature for training data

In [44]:
# Convert categorical data to numeric and separate target feature for training data
# drop the 'loan_status' to create train data
X_train = train_df_X.drop('loan_status', axis=1)
# One-hot encoding the entire dataframe
X_train_dummies = pd.get_dummies(X_train)
# convert the output lables to 0 and 1
y_train_label = LabelEncoder().fit_transform(train_df_X['loan_status'])
y_train_label

array([1, 1, 1, ..., 0, 0, 0])

### Convert categorical data to numeric and separate target feature for testing data

In [45]:
# Convert categorical data to numeric and separate target feature for testing data
# drop the 'loan_status' to create train data
X_test = test_df_y.drop('loan_status', axis=1)
# One-hot encoding the entire dataframe
X_test_dummies = pd.get_dummies(X_test)
# convert the output lables to 0 and 1
y_test_label = LabelEncoder().fit_transform(test_df_y['loan_status'])
y_test_label

array([1, 1, 1, ..., 0, 0, 0])

In [46]:
# add missing dummy variables to testing set
# borrowed from https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data
# get missing columns in the training test
missing_columns = set(X_train_dummies.columns) - set(X_test_dummies.columns)
# add a missing column in test set with default value equal to 0
for c in missing_columns:
    X_test_dummies[c] = 0
X_test_dummies = X_test_dummies[X_train_dummies.columns]

### Logistic Regression Model

In [47]:
# Train the Logistic Regression model on the unscaled data and print the model score
# create logistic model
classifier = LogisticRegression()
# Fit train data into the model by using training data
classifier.fit(X_train_dummies, y_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [48]:
# # Make the predictions
print(f"Training Data Score: {classifier.score(X_train_dummies, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_test_dummies, y_test_label)}")

Training Data Score: 0.6507389162561577
Testing Data Score: 0.5161633347511697


In [49]:
# create DF
predictions = classifier.predict(X_test_dummies)
pd.DataFrame({"Prediction": predictions, "Actual": y_test_label})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,1
4,1,1
...,...,...
4697,0,0
4698,1,0
4699,1,0
4700,1,0


### Train a Random Forest Classifier model and print the model score

In [55]:
# Train a Random Forest Classifier model and print the model score
classifier = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_dummies, y_train_label)
print(f'Training Score: {classifier.score(X_train_dummies, y_train_label)}')
print(f'Testing Score: {classifier.score(X_test_dummies, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6433432581880051


In [57]:
# Scale the data
scaler = StandardScaler().fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

In [59]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression().fit(X_train_scaled, y_train_label)
print(f'Training Score: {classifier.score(X_train_scaled, y_train_label)}')
print(f'Testing Score: {classifier.score(X_test_scaled, y_test_label)}')

Training Score: 0.7078817733990148
Testing Score: 0.767333049766057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
# Train a Random Forest Classifier model on the scaled data and print the model score
classifier = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train_label)
print(f'Training Score: {classifier.score(X_train_scaled, y_train_label)}')
print(f'Testing Score: {classifier.score(X_test_scaled, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6420672054444917
