In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
# Checking train_df for checking the data type and null values
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  12180 non-null  int64  
 1   index                       12180 non-null  int64  
 2   loan_amnt                   12180 non-null  float64
 3   int_rate                    12180 non-null  float64
 4   installment                 12180 non-null  float64
 5   home_ownership              12180 non-null  object 
 6   annual_inc                  12180 non-null  float64
 7   verification_status         12180 non-null  object 
 8   loan_status                 12180 non-null  object 
 9   pymnt_plan                  12180 non-null  object 
 10  dti                         12180 non-null  float64
 11  delinq_2yrs                 12180 non-null  float64
 12  inq_last_6mths              12180 non-null  float64
 13  open_acc                    121

In [4]:
# Checking test_df for checking the data type and null values
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  4702 non-null   int64  
 1   index                       4702 non-null   int64  
 2   loan_amnt                   4702 non-null   float64
 3   int_rate                    4702 non-null   float64
 4   installment                 4702 non-null   float64
 5   home_ownership              4702 non-null   object 
 6   annual_inc                  4702 non-null   float64
 7   verification_status         4702 non-null   object 
 8   loan_status                 4702 non-null   object 
 9   pymnt_plan                  4702 non-null   object 
 10  dti                         4702 non-null   float64
 11  delinq_2yrs                 4702 non-null   float64
 12  inq_last_6mths              4702 non-null   float64
 13  open_acc                    4702 

In [5]:
# Convert categorical data to numeric and separate target feature for training data
y_train = train_df["loan_status"]
X_train = train_df.drop(columns = ["loan_status"])
X_train = pd.get_dummies(X_train)
X_train.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0


In [6]:
# Convert categorical data to numeric and separate target feature for testing data
y_test = test_df["loan_status"]
X_test = test_df.drop(columns = ["loan_status"])
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,0,1,0,1,1,0,1,0,1


In [7]:
# add missing dummy variables to testing set
# Get the name of missing columns
missing_cols = set( X_train.columns ) - set( X_test.columns )
print(missing_cols)
# Add the missing columns in the testing set and filling them with 0
for col in missing_cols:
    X_test[col] = 0

# Align the order of the columns in the training and testing sets
X_train, X_test = X_train.align(X_test, axis=1)

# Confirm that the columns are aligned and none is missing
X_train.columns==X_test.columns

{'debt_settlement_flag_Y'}


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [8]:
# Train the Logistic Regression model on the unscaled data and print the model score
reg = LogisticRegression(solver='lbfgs', 
    max_iter=100,
    random_state=0
).fit(X_train, y_train)
reg.score(X_test, y_test)

print('Logisitc Regression Model - Unscaled Data')
print("------------------------------------------")
print(f'Training Score: {reg.score(X_train, y_train)}')
print(f'Testing Score: {reg.score(X_test, y_test)}')

Logisitc Regression Model - Unscaled Data
------------------------------------------
Training Score: 0.648440065681445
Testing Score: 0.5253083794130158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Train a Random Forest Classifier model and print the model score
randomForestClass = RandomForestClassifier(random_state=40,n_estimators=200).fit(X_train,y_train)
randomForestClass.fit(X_train, y_train)

print('Random Forest Classifier Model - Unscaled Data')
print("------------------------------------")
print(f'Training Score: {randomForestClass.score(X_train, y_train)}')
print(f'Testing Score: {randomForestClass.score(X_test, y_test)}')

Random Forest Classifier Model - Unscaled Data
------------------------------------
Training Score: 1.0
Testing Score: 0.6086771586558911


In [10]:
# Scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Train the Logistic Regression model on the scaled data and print the model score
logReg = LogisticRegression(
    solver='lbfgs',
    max_iter=100,
    random_state=0
)
logReg.fit(X_train_scaled, y_train)

print('Logisitc Regression Model - Scaled Data')
print("----------------------------------------")
print(f'Training Score: {logReg.score(X_train_scaled, y_train)}')
print(f'Testing Score: {logReg.score(X_test_scaled, y_test)}')

Logisitc Regression Model - Scaled Data
----------------------------------------
Training Score: 0.713136288998358
Testing Score: 0.7201190982560612


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Train a Random Forest Classifier model on the scaled data and print the model score

rndmForClass = RandomForestClassifier(random_state=1,n_estimators=500).fit(X_train_scaled,y_train)
rndmForClass.fit(X_train_scaled, y_train)
print('Random Forest Classifier Model - Scaled Data')
print("--------------------------------------------")
print("Training Score: ", rndmForClass.score(X_train_scaled, y_train))
print("Testing Score: ", rndmForClass.score(X_test_scaled, y_test))


Random Forest Classifier Model - Scaled Data
--------------------------------------------
Training Score:  1.0
Testing Score:  0.6193109315185028


# Results and Predictions

## Logisitc Regression Model - Unscaled Data

Training Score: 0.648440065681445 
<br>
<br>
Testing Score: 0.5253083794130158

## Random Forest Classifier Model - Unscaled Data

Training Score: 1.0 <br>
<br>
Testing Score: 0.6086771586558911

## Logisitc Regression Model - Scaled Data

Training Score: 0.713136288998358 <br>
<br>
Testing Score: 0.7201190982560612


## Random Forest Classifier Model - Scaled Data

Training Score:  1.0 <br>
<br>
Testing Score:  0.6193109315185028


Based on the result analysis, It shows that Logistic Regression Model does better than Random Forest Model. Also, random forest classifier did not benefit much from scaling but Logistic regression did. 