In [1]:
#Dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Import training and testing data separately
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
# Preview training data
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
# Preview Testing data
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [5]:
# Convert categorical data to numeric and separate target feature for training data
train_df_sample = pd.get_dummies(train_df)
train_df_x = train_df_sample.drop(['loan_status_low_risk', 'loan_status_high_risk'], axis=1)
train_df_y = train_df_sample['loan_status_low_risk'].values
print(train_df_x.head())
print(f"train_df_y array is {train_df_x}.")

   Unnamed: 0   index  loan_amnt  int_rate  installment  annual_inc    dti  \
0       57107   57107    13375.0    0.1797       483.34    223000.0  29.99   
1      141451  141451    21000.0    0.1308       478.68    123000.0  11.26   
2      321143  321143    20000.0    0.1240       448.95    197000.0  11.28   
3       11778   11778     3000.0    0.1240       100.22     45000.0  18.08   
4      169382  169382    30000.0    0.1612      1056.49    133000.0  27.77   

   delinq_2yrs  inq_last_6mths  open_acc  ...  verification_status_Verified  \
0          0.0             0.0      15.0  ...                             0   
1          2.0             0.0      16.0  ...                             0   
2          0.0             0.0      12.0  ...                             0   
3          0.0             0.0      12.0  ...                             0   
4          0.0             2.0      13.0  ...                             0   

   pymnt_plan_n  initial_list_status_f  initial_list_sta

In [6]:
# Check if there are any missing variables in the training data set
train_df_x.isnull().sum(axis=0)

Unnamed: 0                    0
index                         0
loan_amnt                     0
int_rate                      0
installment                   0
                             ..
application_type_Joint App    0
hardship_flag_N               0
hardship_flag_Y               0
debt_settlement_flag_N        0
debt_settlement_flag_Y        0
Length: 94, dtype: int64

In [7]:
# Convert categorical data to numeric and separate target feature for testing data
test_df_sample = pd.get_dummies(test_df)
test_df_x = test_df_sample.drop(['loan_status_low_risk', 'loan_status_high_risk'], axis=1)
test_df_y = test_df_sample['loan_status_low_risk'].values
print(test_df_x.head())
print(f"train_df_y array is {train_df_y}.")

   Unnamed: 0  index  loan_amnt  int_rate  installment  annual_inc    dti  \
0       67991  67991    40000.0    0.0819       814.70    140000.0  19.75   
1       25429  25429     6000.0    0.1524       208.70     55000.0  11.52   
2       38496  38496     3600.0    0.1695       128.27     42000.0   6.74   
3       19667  19667    20000.0    0.1524       478.33    100000.0  12.13   
4       37505  37505     3600.0    0.1240       120.27     50000.0  16.08   

   delinq_2yrs  inq_last_6mths  open_acc  ...  \
0          0.0             1.0      18.0  ...   
1          2.0             0.0       8.0  ...   
2          0.0             0.0       6.0  ...   
3          0.0             2.0       7.0  ...   
4          0.0             3.0       6.0  ...   

   verification_status_Source Verified  verification_status_Verified  \
0                                    0                             0   
1                                    0                             0   
2                         

In [8]:
# Check if there are any missing variables in the testing data set
test_df_x.isnull().sum(axis=0)

Unnamed: 0                     0
index                          0
loan_amnt                      0
int_rate                       0
installment                    0
                              ..
application_type_Individual    0
application_type_Joint App     0
hardship_flag_N                0
hardship_flag_Y                0
debt_settlement_flag_N         0
Length: 93, dtype: int64

In [9]:
# add missing dummy variables to testing set just in case
test_df_x['debt_settlement_flag_Y'] = 0

In [10]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier
classifier.fit(train_df_x, train_df_y)
classifier.fit(test_df_x, test_df_y)

print(f"The Logistic Regression model Score for the unscaled training data is  {classifier.score(train_df_x, train_df_y)}")
print(f"The Logistic Regression model Score for the unscaled testing data is  {classifier.score(test_df_x, test_df_y)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training Data Score: 0.5132183908045977
Testing Data Score: 0.8268821777966823


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
# Train a Random Forest Classifier model and print the model score
rlf_train = RandomForestClassifier(random_state=1, n_estimators=500).fit(train_df_x, train_df_y)
rlf_test = RandomForestClassifier(random_state=1, n_estimators=500).fit(test_df_x, test_df_y)
print(f'The Random Forest Classifier model Score for the unscaled training data is 
      {rlf_train.score(train_df_x, train_df_y)}')
print(f'The Random Forest Classifier model Score for the unscaled training data is 
      {rlf_test.score(test_df_x, test_df_y)}')

Training Score: 1.0
Testing Score: 1.0


In [12]:
# Scale the data
scaler = StandardScaler().fit(train_df_x)
X_train_scaled = scaler.transform(train_df_x)
X_test_scaled = scaler.transform(test_df_x)
reg_train = LinearRegression().fit(X_train_scaled, train_df_y)
reg_test = LinearRegression().fit(X_test_scaled, test_df_y)
print(f"Linear Regression Score for the scaled training data is {reg_train.score(X_train_scaled, train_df_y)}.")
print(f"Linear Regression Score for the testing data is {reg_test.score(X_test_scaled, test_df_y)}.")

Linear Regression Score for the training data is 0.15694581802378016.
Linear Regression Score for the testing data is 0.42057816541051896.


In [13]:
# Train the Logistic Regression model on the scaled data and print the model score
clf_train_scaled = LogisticRegression().fit(X_train_scaled, train_df_y)
clf_test_scaled = LogisticRegression().fit(X_test_scaled, test_df_y)
print(f'The Logistic Regression Score for the scaled data is 
      {clf_train_scaled.score(X_train_scaled, train_df_y)}')
print(f'The Logistic Regression Score for the scaled data is 
      {clf_test_scaled.score(X_test_scaled, test_df_y)}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training Score for the scaled data is 0.713136288998358
Testing Score for the scaled data is 0.893236920459379


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [14]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rlf_train_scaled = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, train_df_y)
rlf_test_scaled = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_test_scaled, test_df_y)
print(f'Random Forest Classifier model score for the scaled training data is 
      {rlf_train_scaled.score(X_train_scaled, train_df_y)}')
print(f'Random Forest Classifier model score for the scaled testing data is 
      {rlf_test_scaled.score(X_test_scaled, test_df_y)}')

Random Forest Classifier model score for the scaled training data is 1.0
Random Forest Classifier model score for the scaled testing data is 1.0


In [15]:
# Create a decision tree model
clf = DecisionTreeClassifier().fit(X_train_scaled, train_df_y)
y_pred = clf.predict(X_test_scaled)
confusion_matrix(test_df_y, y_pred)

array([[ 868, 1483],
       [ 424, 1927]])