In [25]:
import numpy as np
import pandas as pd
from pathlib import Path

RETRIEVING THE DATA

In [26]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

PREPROCESSING: CONVERTING CATEGORICAL DATA TO NUMERICAL DATA

In [27]:
# View train data

train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [28]:
# Convert categorical data to numerical data and separate target feature for training data

train_df = pd.get_dummies(data=train_df, columns=['home_ownership','verification_status','loan_status','pymnt_plan','initial_list_status','application_type','hardship_flag','debt_settlement_flag'])

In [29]:
# Separate targert feature for training

X_train = train_df.drop('loan_status_high_risk', axis=1)
X_train = X_train.drop('loan_status_low_risk',axis=1)
y_train = train_df['loan_status_high_risk']

In [30]:
# View test data

test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [31]:
# Convert categorical data to numerical data and separate target feature for testing data

test_df = pd.get_dummies(data=test_df, columns=['home_ownership','verification_status','loan_status','pymnt_plan','initial_list_status','application_type','hardship_flag','debt_settlement_flag'])

In [33]:
# Separate targert feature for training

X_test = test_df.drop('loan_status_high_risk', axis=1)
X_test = X_test.drop('loan_status_low_risk',axis=1)
y_test = test_df['loan_status_high_risk']

In [34]:
# add missing dummy variables to testing set

X_test['debt_settlement_flag_Y']=0

X_test.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,1,0,1,1,0,1,0,1,0


CONSIDERING THE MODELS

I predict the logisitcs regression model will better perform compared to the random forest classifier model. This predicition is made based upon the noise within the data. There are many categories within the datasets that I believe will not influence risk. The additional noise within the data, will therefore decrease the accurancy of the random forest classifier. 

However, without scaling the data, the random forest classifier could potentially beat the logistics regression model. 

LOGISTIC REGRESSION MODEL AND RANDOM FOREST CLASSIFIER MODEL

In [35]:
# Train the Logistic Regression model on the unscaled data and print the model score

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [36]:
classifier.fit(X_train, y_train)
print(f"Training Score: {classifier.score(X_train, y_train)}")
print(f"Testing Score: {classifier.score(X_test, y_test)}")

Training Score: 0.6485221674876848
Testing Score: 0.5253083794130158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [37]:
# Train a Random Forest Classifier model and print the model score

from sklearn.ensemble import RandomForestClassifier
rfclassifier = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)

In [38]:
print(f'Training Score: {rfclassifier.score(X_train, y_train)}')
print(f'Testing Score: {rfclassifier.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6154827732879625


RECONSIDERING THE MODELS PRIOR TO SCALING

Random forest classifier appears to have a perfect training score of 1.0, but the testing score is significanlty lower. This gap leads to believe that the random forest classifier is overfitting.

I predict scaling will improve both models. I believe this to be true as unsclaed input variables tend to result in an unstable learning process.

REVISITING THE REPROCESSING: SCALING THE DATA

In [39]:
# Scale the data

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)

In [40]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
# Train the Logistic Regression model on the scaled data and print the model score

classifier.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [42]:
print(f"Training Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Score: {classifier.score(X_test_scaled, y_test)}")

Training Score: 0.713136288998358
Testing Score: 0.7201190982560612


In [43]:
# Train a Random Forest Classifier model on the scaled data and print the model score

rfclassifier = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)

In [44]:
print(f'Training Score: {rfclassifier.score(X_train_scaled, y_train)}')
print(f'Testing Score: {rfclassifier.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6165461505742237


OVERALL CONSIDERATIONS

While scaling the data improved the logistics regression model, scaling did not have an affect on the random forest classifier. This result could be due the effect scaling may have on important features for predicting credit risk. Scaling could potentially be lowering the bearing of these important features. 

Overall, the logisitics regression model appears to have outperformed the random forest classifier model. The logistics regression model, after scaling, has a training score of 0.71 and a very similar testing score of 0.72.