In [1]:
# dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### Retrieve the Data

In [2]:
# Load both loan CSVs into DataFrames: the 2019 file becomes the training set,
# the 2020 Q1 file becomes the testing set (read in that order).
train_df, test_df = (
    pd.read_csv(Path(csv_path))
    for csv_path in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [3]:
# show the (rows, columns) shape of the training frame, then the testing frame, one per line
print(train_df.shape, test_df.shape, sep='\n')

(12180, 86)
(4702, 86)


In [4]:
# list the column names of the training data (this prints the column Index, not the dtypes)
print(train_df.columns)

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

In [5]:
# list the column names of the testing data
print(test_df.columns)

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

In [6]:
# preview the first rows of the training data
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [7]:
# preview the first rows of the testing data
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


### Preprocessing: Convert categorical data to numeric

In [9]:
# Split the training frame into a numeric feature matrix and an integer-encoded target.
# NOTE(review): 'Unnamed: 0' and 'index' look like row identifiers but remain in the
# feature matrix — confirm that keeping them as model inputs is intended.
# target: encode the 'loan_status' labels as integers (0 and 1 in the output shown)
y_train_label = LabelEncoder().fit_transform(train_df['loan_status'])
# features: every column except the target
x_train = train_df.drop(columns='loan_status')
# one-hot encode the text columns of the feature frame
x_train_dummies = pd.get_dummies(x_train)
y_train_label

array([1, 1, 1, ..., 0, 0, 0])

In [10]:
# Convert categorical data to numeric and separate target feature for testing data
# drop 'loan_status' so only the feature columns remain in the test data
x_test = test_df.drop(columns='loan_status')
# one-hot encode the text columns of the feature frame
x_test_dummies = pd.get_dummies(x_test)
# Encode the test labels with an encoder fitted on the TRAINING labels, so each
# label string maps to the same integer in both sets; a test label that never
# occurs in the training data raises ValueError instead of being silently renumbered.
y_test_label = LabelEncoder().fit(train_df['loan_status']).transform(test_df['loan_status'])
y_test_label

array([1, 1, 1, ..., 0, 0, 0])

In [12]:
# Align the test features with the training features in one step:
# dummy columns missing from the test set are added and filled with 0, columns the
# training set never produced are dropped, and the training column order is applied.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)

### Consider the Models: Prediction
I predict that the Logistic Regression model will perform better than the Random Forest Classifier model because the Random Forest Classifier does not tend to generalize as well and we are working with a fairly large number of features.

### Fit a LogisticRegression model and RandomForestClassifier model

In [13]:
# train the Logistic Regression model on the unscaled data (the scores are printed in the next cell)
# create a logistic regression model with default settings
classifier = LogisticRegression()
# Fit the model on the one-hot encoded training features and the encoded labels.
# fit() is the cell's last expression, so the fitted estimator's repr is displayed;
# on this unscaled data the solver stops at its iteration limit (see the warning in the output).
classifier.fit(x_train_dummies, y_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [14]:
# report the score of the logistic regression fitted on unscaled data, for the training and testing sets
print(f"Training Data Score: {classifier.score(x_train_dummies, y_train_label)}")
print(f"Testing Data Score: {classifier.score(x_test_dummies, y_test_label)}")

Training Data Score: 0.6485221674876848
Testing Data Score: 0.5253083794130158


In [16]:
# Random forest on the unscaled features: build the estimator, fit it, then report both scores.
# `classifier` is rebound here, replacing the logistic regression from the cells above.
classifier = RandomForestClassifier(n_estimators=500, random_state=1)
classifier.fit(x_train_dummies, y_train_label)
print(f'Training Score: {classifier.score(x_train_dummies, y_train_label)}')
print(f'Testing Score: {classifier.score(x_test_dummies, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6180348787749894


### Revisit the Preprocessing: Scale the data

In [18]:
# Standardize the features: the scaler is fitted on the training features only,
# and that same fitted scaler is then applied to the testing features.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_dummies)
x_test_scaled = scaler.transform(x_test_dummies)

In [20]:
# train the Logistic Regression model on the scaled data (the scores are printed in the next cell)
# Use x_train_scaled exactly as the scaling cell names it: the capitalised
# X_train_scaled is not defined anywhere in the visible notebook and raises
# NameError on a fresh kernel.
# The solver still reports hitting its iteration limit here (see the warning in the output).
classifier = LogisticRegression().fit(x_train_scaled, y_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# report the score of the logistic regression fitted on scaled data, for the training and testing sets
print(f'Training Score: {classifier.score(x_train_scaled, y_train_label)}')
print(f'Testing Score: {classifier.score(x_test_scaled, y_test_label)}')

Training Score: 0.7132183908045977
Testing Score: 0.7201190982560612


In [22]:
# train a Random Forest Classifier model on the scaled data and print the model score
# Use the lowercase x_train_scaled / x_test_scaled produced by the scaling cell:
# the capitalised X_* names are not defined anywhere in the visible notebook and
# raise NameError on a fresh kernel.
classifier = RandomForestClassifier(random_state=1, n_estimators=500).fit(x_train_scaled, y_train_label)
print(f'Training Score: {classifier.score(x_train_scaled, y_train_label)}')
print(f'Testing Score: {classifier.score(x_test_scaled, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6193109315185028


### Conclusion
#### Logistic Regression Model Scores:
Unscaled Data: <br/>
Training Data Score: 0.6485221674876848 <br/>
Testing Data Score: 0.5253083794130158 <br/>
Scaled Data: <br/>
Training Score: 0.7132183908045977 <br/>
Testing Score: 0.7201190982560612
#### Random Forest Model Scores:
Unscaled Data: <br/>
Training Score: 1.0 <br/>
Testing Score: 0.6180348787749894 <br/>
Scaled Data: <br/>
Training Score: 1.0 <br/>
Testing Score: 0.6193109315185028

##### How do the model scores compare to each other, and to the previous results on unscaled data? 
Scaling the data did not have much of an effect on the Random Forest Classifier Model. However, scaling the data had a positive effect on the Logistic Regression Model. Both training and testing scores improved and also got closer together, leading me to believe this is a good model for generalizing. 
##### How does this compare to your prediction? Write down your results and thoughts.
My prediction that the Logistic Regression Model would perform better seems to be accurate, since the scores are better, at least with scaled data. Using the Logistic Regression model with scaled data is the best choice for evaluating credit risk.