#### Author: Meet K Sahni

#### In this notebook:

1) 2019loans and 2020Q1Loans are read into dataframes. &emsp;
2) After analyzing the columns and shape of dataframes, the training & testing datasets are broken into X (features) and y (label) values. &emsp;
3) The training & test numeric data (X) is converted to categorical data using pd.get_dummies.&emsp;
4) The y values (for both training & test datasets) are converted to numeric using LabelEncoder (since we do not want two different columns for labels)&emsp;
5) Since the training & test datasets are not equal, the missing column(s) are found and inserted in test dataset.&emsp;
6) Logistic Regression model is trained on the unscaled data and the model score is printed.
7) Random Forest Classifier model is trained on the unscaled data and the model score is printed.&emsp;
8) The scores of both the models are compared on unscaled data.&emsp;
9) The training & testing data is then scaled using StandardScaler() function. &emsp;
10) Both the models (Logical Regression & Random Forest Classifier) are re-applied to our training & testing datasets after they are scaled.&emsp;
11) The scores are calculated and the code is concluded.&emsp;

In [70]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [71]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
set(train_df.columns)

{'Unnamed: 0',
 'acc_now_delinq',
 'acc_open_past_24mths',
 'all_util',
 'annual_inc',
 'application_type',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'collection_recovery_fee',
 'collections_12_mths_ex_med',
 'debt_settlement_flag',
 'delinq_2yrs',
 'delinq_amnt',
 'dti',
 'hardship_flag',
 'home_ownership',
 'il_util',
 'index',
 'initial_list_status',
 'inq_fi',
 'inq_last_12m',
 'inq_last_6mths',
 'installment',
 'int_rate',
 'last_pymnt_amnt',
 'loan_amnt',
 'loan_status',
 'max_bal_bc',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_rcnt_il',
 'mths_since_recent_bc',
 'mths_since_recent_inq',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_rev_tl_bal_gt_0',
 'num_sats',
 'num_tl_120dpd_2m',
 'num_tl_30dpd',
 'num_tl_90g_dpd_24m',
 'num_tl_op_past_12m',
 'open_acc',
 'open_ac

In [4]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [5]:
print(train_df.shape)
print(test_df.shape)

(12180, 86)
(4702, 86)


In [6]:
train_df.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

### Convert categorical data to numeric and separate target feature for training data

In [28]:
y_candidate1 = train_df["loan_status"]  # define label
X_candidate1 = train_df.drop(columns=["loan_status"])  # drop label from features

In [29]:
#cols=[i for i in train_df.columns if i not in ["loan_status"]]
#for col in cols:
# One-hot encoding the X dataframe
X_train = pd.get_dummies(X_candidate1)
X_train.shape

(12180, 94)

In [33]:
#add LabelEncoder to y label in the training data
y_train = LabelEncoder().fit_transform(y_candidate1)
y_train

array([1, 1, 1, ..., 0, 0, 0])

#### Convert categorical data to numeric and separate target feature for testing data

In [27]:
test_df.shape

(4702, 86)

In [31]:
y_candidate2 = test_df["loan_status"]  # define label
X_candidate2 = test_df.drop(columns=["loan_status"])  # drop label from features

In [32]:
# One-hot encoding the X dataframe (test)
X_test = pd.get_dummies(X_candidate2)
X_test.shape

(4702, 93)

In [34]:
#add LabelEncoder to y label in the training data
y_test = LabelEncoder().fit_transform(y_candidate2)
y_test

array([1, 1, 1, ..., 0, 0, 0])

In [35]:
# add missing dummy variables to testing set
missing_cols = set(X_train.columns) - set(X_test.columns)
missing_cols

{'debt_settlement_flag_Y'}

In [38]:
cols = X_train.columns
for col in cols:
    if col in X_test.columns:
        print ("Column found")
    else:
        print(col)

Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found
Column found

In [39]:
#Add missing column in test dataset
X_test['debt_settlement_flag_Y'] = 0

In [41]:
#confirm if the column got added in X_test dataset
X_test.shape

(4702, 94)

In [67]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=2000,solver = 'lbfgs')
classifier

LogisticRegression(max_iter=2000)

In [68]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=2000)

In [54]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6982758620689655
Testing Data Score: 0.5723096554657593


### As seen above, the training & Testing Data Scores for Logical Regression Model on Unscaled Data are low. 

In [58]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500)
clf

RandomForestClassifier(n_estimators=500, random_state=1)

In [59]:
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=1)

In [60]:
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6180348787749894


### On the other hand, the Training Data Scores for Random Forest Classfier Model on Unscaled Data are 100%. The testing data scores are not good but they are a little better than Logical Regression Model.

In [72]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [63]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=2000)

In [64]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.7128899835796387
Testing Data Score: 0.7201190982560612


### After scaling the data, the Logical Regression Model scores improved considerably.

In [65]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf.fit(X_train_scaled, y_train)

RandomForestClassifier(n_estimators=500, random_state=1)

In [69]:
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6193109315185028


### However, the Random Forest Model scores remain the same for unscaled & scaled data.