# Credit Risk Evaluator

## Supervised machine learning challenge

In [83]:
# One-time environment setup for notebook widgets.
# The two pip commands below are intentionally left commented out; uncomment
# them only if ipywidgets or the Jupyter core/client packages need to be
# installed or upgraded in this environment.
# !pip install ipywidgets
# !pip install --upgrade jupyter_core jupyter_client
# Shell command (run via `!`) that enables the widgetsnbextension notebook extension.
!jupyter nbextension enable --py widgetsnbextension

Config option `kernel_spec_manager_class` not recognized by `EnableNBExtensionApp`.
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [84]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [85]:
# Load the 2019 loans (used for training) and the 2020 Q1 loans (used for
# testing) from the Resources directory, in that order.
train_df, test_df = (
    pd.read_csv(Path(csv_file))
    for csv_file in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [86]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [87]:
# Independent variables: every column except the 'loan_status' target,
# taken from the training and testing frames respectively.
# NOTE(review): 'Unnamed: 0' and 'index' remain in the feature set; they look
# like row identifiers carried over from the CSVs rather than real predictors
# -- confirm, and consider dropping them from both frames.
X_train, X_test = (
    frame.drop(columns=['loan_status'])
    for frame in (train_df, test_df)
)

In [88]:
# Convert categorical columns to numeric (one-hot encoding) for the training features.
X_train_dummies = pd.get_dummies(X_train)
print(X_train_dummies.columns)

# Encode the test features, then align them to the training columns instead of
# patching individual columns by hand:
#   * a dummy column present in training but absent from the test data
#     (e.g. 'debt_settlement_flag_Y') is added and filled with 0,
#   * a column that exists only in the test data is dropped,
#   * column order is forced to match the training frame.
# This keeps the test matrix compatible with models fitted on X_train_dummies
# even if the category values in the two files differ in other ways.
X_test_dummies = pd.get_dummies(X_test).reindex(
    columns=X_train_dummies.columns, fill_value=0
)
print(X_test_dummies.columns)

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc'

In [89]:
# Set up dependent variables.
# Fit ONE encoder on the training labels and reuse it for the test labels so
# both arrays share the same string -> integer mapping. Fitting a separate
# encoder per frame can silently assign different integers to the same class
# whenever the two frames do not contain exactly the same set of labels.
# transform() raises ValueError if the test data contains a label that was
# never seen in training, which surfaces the mismatch instead of hiding it.
label_encoder = LabelEncoder().fit(train_df['loan_status'])
y_train_label = label_encoder.transform(train_df['loan_status'])
y_test_label = label_encoder.transform(test_df['loan_status'])

## Prediction prior to running models (unscaled data)

My assumption is that the LogisticRegression model will be more accurate for this dataset.  The reason is that I believe the columns of data are highly correlated; for example, if one of the columns suggests a higher loan risk, then the other columns will suggest this as well.  
  
  My assumption is that LogisticRegression is better suited to binary, highly correlated datasets, while the RandomForest method is better suited to more complex datasets where correlations are not easily identified.

In [90]:
# Logistic Regression on the unscaled, one-hot-encoded features.
# LogisticRegression comes from the notebook's top-level import cell.
# max_iter is set far above scikit-learn's default, presumably so the solver
# can converge on features that have not been scaled.
classifier = LogisticRegression(max_iter=100_000)

# Fit the unscaled data
classifier.fit(X_train_dummies, y_train_label)

# Print the mean accuracy of the model on the training and test data sets
print(f"Training Data Score: {classifier.score(X_train_dummies, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_test_dummies, y_test_label)}")

Training Data Score: 0.7008210180623974
Testing Data Score: 0.5648660144619311


In [91]:
# Random Forest on the unscaled, one-hot-encoded features.
# RandomForestClassifier comes from the notebook's top-level import cell.
# random_state is fixed so the fitted forest is reproducible across runs.
clf = RandomForestClassifier(random_state=1, n_estimators=50)

# Fit the data (fit() returns the estimator itself, so `clf` is the fitted model)
clf.fit(X_train_dummies, y_train_label)

# Print the mean accuracy on the training and test data sets
print(f'Training Score: {clf.score(X_train_dummies, y_train_label)}')
print(f'Testing Score: {clf.score(X_test_dummies, y_test_label)}')

Training Score: 0.9999178981937603
Testing Score: 0.6352615908124203


## Post Model Results Analysis (unscaled data)
  
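  After running both the Logistic Regression and Random Forest models, it's clear that, for the unscaled data, the Random Forest model is superior to the Logistic Regression model in terms of testing accuracy (about 0.635 vs. 0.565). Note, however, that the Random Forest's near-perfect training score (about 0.9999) combined with its much lower testing score indicates that it is overfitting the training data.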

In [92]:
# Preprocessing: standardize the features.
# The scaler learns its statistics from the training features only; the same
# fitted scaler is then applied to both the training and the testing features.
scaler = StandardScaler()
scaler.fit(X_train_dummies)
X_train_scaled, X_test_scaled = (
    scaler.transform(features)
    for features in (X_train_dummies, X_test_dummies)
)

## Prediction prior to running models (scaled data)
  
  My prediction is that scaling the data will make the LogisticRegression model more accurate, as the model fitted on scaled data will be less complex and more regularized.  
  
  However, for the Random Forest model, I think the results will be unchanged, because convergence and precision issues aren't as important for RandomForest as they are for Logistic or Linear regression models.

In [93]:
# Logistic Regression on the standardized features.
# LogisticRegression is already imported earlier in the notebook, so it is not
# re-imported here. Note that this rebinds `classifier`, replacing the model
# that was fitted on the unscaled data.
classifier = LogisticRegression(max_iter=1000)

# Fit the scaled data
classifier.fit(X_train_scaled, y_train_label)

# Print the mean accuracy on the scaled training and test data sets
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test_label)}")

Training Data Score: 0.712807881773399
Testing Data Score: 0.7203317737133135


In [94]:
# Random Forest on the standardized features.
# RandomForestClassifier is already imported earlier in the notebook, so it is
# not re-imported here. Note that this rebinds `clf`, replacing the forest
# that was fitted on the unscaled data. The same random_state and
# n_estimators are used so the two runs differ only in the input scaling.
clf = RandomForestClassifier(random_state=1, n_estimators=50)

# Fit the scaled data (fit() returns the estimator itself)
clf.fit(X_train_scaled, y_train_label)

# Print the mean accuracy on the scaled training and test data sets
print(f'Training Score: {clf.score(X_train_scaled, y_train_label)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test_label)}')

Training Score: 0.9999178981937603
Testing Score: 0.6337728626116547


## Post Model Results Analysis (scaled data)
  
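  The Logistic Regression model is superior with scaled data: its testing score rose from about 0.565 to about 0.720, which is also higher than the Random Forest's testing score.  
  The results of Random Forest using the scaled data are essentially unchanged from the results of the unscaled data (testing score of about 0.634 vs. 0.635).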

In [95]:
# Scratch cell: everything below is commented out and does not run.
#
# This code was used to verify that the column names match between the two
# encoded dataframes and to identify any missing columns.
# NOTE: the names `train_df_dummies` / `test_df_dummies` used below do not
# match the variables defined in this notebook (`X_train_dummies` /
# `X_test_dummies`); rename them before re-enabling this code.
# train_dummies_column_names = list(train_df_dummies.columns)
# train_dummies_df_column_names = pd.DataFrame(train_dummies_column_names)
# train_dummies_df_column_names.to_csv('train_df_dummies_column_names.csv')

# test_dummies_column_names = list(test_df_dummies.columns)
# test_dummies_df_column_names = pd.DataFrame(test_dummies_column_names)
# test_dummies_df_column_names.to_csv('test_df_dummies_column_names.csv')

# This code was used to explore the datatypes of train_df and test_df.
# train_df.info()
# test_df.info()