In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [3]:
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [4]:
# Convert categorical data to numeric and separate target feature for training data
# One-hot encoding
X_train = train_df.drop('loan_status', axis=1)
X_train_dummies = pd.get_dummies(X_train, drop_first=True)
y_train = LabelEncoder().fit_transform(train_df['loan_status'])

In [5]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test_df.drop('loan_status', axis=1)
X_test_dummies = pd.get_dummies(X_test, drop_first=True)
y_test = LabelEncoder().fit_transform(test_df['loan_status'])

In [6]:
# add missing dummy variables to testing set

missing_cols = set(X_train_dummies.columns) - set(X_test_dummies.columns)

for col in missing_cols:
    X_test_dummies[col] = 0

X_test_dummies = X_test_dummies[X_train_dummies.columns]

**Prediction:**
I think Random Forest Classifier model will give a more acurate result, because this model uses decision trees that try to select the best feauture at every split.

In [20]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier_LR = LogisticRegression().fit(X_train_dummies, y_train)

print(f"Training Data Score: {classifier_LR.score(X_train_dummies, y_train)}")
print(f"Testing Data Score: {classifier_LR.score(X_test_dummies, y_test)}")

Training Data Score: 0.6497536945812807
Testing Data Score: 0.5204168438962143


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Train a Random Forest Classifier model and print the model score
classifier_RF = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_dummies, y_train)

print(f"Training Data Score: {classifier_RF.score(X_train_dummies, y_train)}")
print(f"Testing Data Score: {classifier_RF.score(X_test_dummies, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.635048915355168


**Prediction:** 
I think scaling will give more accurate result for Logistic Regression model, because this model is sensitive to the range of the data points. I don't think the result will change that much for Random Forests as this model is not sensitive to the the variance in the data. <br>
(Scaling is mostly needed to be done for distance based algorithms. For tree based algorithms, scaling is not required.)

In [7]:
# Scale the data
scaler = StandardScaler().fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

In [8]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier_LR = LogisticRegression()

classifier_LR.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier_LR.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier_LR.score(X_test_scaled, y_test)}")

Training Data Score: 0.713136288998358
Testing Data Score: 0.7233092301148447


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Train a Random Forest Classifier model on the scaled data and print the model score
classifier_RF = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier_RF.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier_RF.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.6335601871544024


**How do the model scores compare to each other, and to the previous results on unscaled data? How does this compare to your prediction?**<br><br>
Scaling helped the Logistics Regression model to get a better score, but did not have an impact on the Random Forest Classifier model. For this dataset, the scaled Logistic Regression model gives a better result. My prediction for unscaled data was correct -- Random Forest Classifier is better than Logistic Regression (when not scaled). <br>
As for the scaled data, I also predicted correctly that the scaling would improve the Logistic Regression score and not so much the Random Forest. <br>
I wasn't trully sure about which model will show a better result for this data (it's hard to say ahead, you just have to test different models to see). Even though scaled Logistic Regression showed a better score, I think more exploratory analysis is needed here.