In [46]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [47]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [48]:
train_df.head()


Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,13375,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,29.99,0,...,100.0,50.0,0,0,577150,122018,32000,170200,N,N
1,21000,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,11.26,2,...,85.0,33.3,0,0,132750,27896,15900,35398,N,N
2,20000,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,11.28,0,...,85.7,33.3,0,0,628160,114043,22600,90340,N,N
3,3000,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,18.08,0,...,100.0,16.7,1,0,42006,20761,19900,15406,N,N
4,30000,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,27.77,0,...,100.0,66.7,0,0,283248,109056,79500,58778,N,N


In [49]:
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,40000,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,19.75,0,...,97.7,0.0,0,0,527975,70914,74600,99475,N,N
1,6000,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,11.52,2,...,66.7,0.0,0,0,34628,23460,5900,23628,N,N
2,3600,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,6.74,0,...,100.0,0.0,0,0,23100,19183,7300,15000,N,N
3,20000,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,12.13,0,...,100.0,50.0,0,0,56481,43817,13800,35981,N,N
4,3600,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,16.08,0,...,100.0,25.0,0,0,45977,32448,21000,24977,N,N


In [50]:
# Convert categorical data to numeric and separate target feature for training data
y_train = train_df["loan_status"]
x_train = train_df.drop(columns = ["loan_status"])
x_train = pd.get_dummies(x_train)
x_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375,0.1797,483.34,223000.0,29.99,0,0,15,0,39728,...,0,1,0,1,1,0,1,0,1,0
1,21000,0.1308,478.68,123000.0,11.26,2,0,16,0,9585,...,0,1,0,1,1,0,1,0,1,0
2,20000,0.124,448.95,197000.0,11.28,0,0,12,0,16708,...,0,1,0,1,1,0,1,0,1,0
3,3000,0.124,100.22,45000.0,18.08,0,0,12,1,8809,...,0,1,0,1,1,0,1,0,1,0
4,30000,0.1612,1056.49,133000.0,27.77,0,2,13,0,65420,...,0,1,0,1,1,0,1,0,1,0


In [51]:
# Convert categorical data to numeric and separate target feature for testing data
y_test = test_df["loan_status"]
x_test = test_df.drop(columns = ["loan_status"])
x_test = pd.get_dummies(x_test)
x_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000,0.0819,814.7,140000.0,19.75,0,1,18,0,9471,...,0,0,1,0,1,1,0,1,0,1
1,6000,0.1524,208.7,55000.0,11.52,2,0,8,0,1280,...,0,0,1,0,1,1,0,1,0,1
2,3600,0.1695,128.27,42000.0,6.74,0,0,6,0,4757,...,0,0,1,0,1,1,0,1,0,1
3,20000,0.1524,478.33,100000.0,12.13,0,2,7,0,12731,...,0,0,1,0,1,1,0,1,0,1
4,3600,0.124,120.27,50000.0,16.08,0,3,6,0,10413,...,0,0,1,0,1,1,0,1,0,1


In [52]:
# add missing dummy variables to testing set
for column in x_train.columns:
    if column not in x_test.columns:
        x_test[column] = 0

# MODEL PREDICTION:

I beleive the Random Forest Classifier model will perform better than the Logistic Regression overall. The Random Forest works with smaller subsets of data to build a strong classifier, while the Logistic Regression uses the dataset as a whole for its predictions

In [53]:
# Train the Logistic Regression model on the unscaled data and print the model score
reg = LogisticRegression().fit(x_train, y_train)
reg.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5168013611229264

In [54]:
# Train a Random Forest Classifier model and print the model score
randomforestclass = RandomForestClassifier(random_state=0)
randomforestclass.fit(x_train, y_train)
print("RandomForestClassifier score: ", randomforestclass.score(x_test, y_test))

RandomForestClassifier score:  0.6424925563589962


In [55]:
# Scale the data
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# SCALED MODEL PREDICITON:

I expect that the Linear Regression model will improve with the scaled data, but I would still expect the Random Forest model to perform better, as scaling may allow it to find more accurate trees with the subsets of data.

In [56]:
# Train the Logistic Regression model on the scaled data and print the model score
LogisticRegression_scaled = LogisticRegression(
    solver='lbfgs',
    max_iter=100,
    random_state=0
)
LogisticRegression_scaled.fit(x_train_scaled, y_train)
print("LogisticRegression scaled score: ", LogisticRegression_scaled.score(x_test_scaled, y_test))

LogisticRegression scaled score:  0.767333049766057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
# Train a Random Forest Classifier model on the scaled data and print the model score
randomforestclass_scaled = RandomForestClassifier(random_state=0)
randomforestclass_scaled.fit(x_train_scaled, y_train)
print("RandomForestClassifier scaled score: ", randomforestclass_scaled.score(x_test_scaled, y_test))

RandomForestClassifier scaled score:  0.6437686091025095


# MODEL RESULTS:
The Random Forest model performed better with the un-scaled data than the Logistic Regression model, however the Logistic Regression model with the scaled data did the best as a whole. The Random Forest model did about the same with both the scaled and unscaled data which I didn't expect. 