In [146]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


In [27]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [102]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  4702 non-null   int64  
 1   index                       4702 non-null   int64  
 2   loan_amnt                   4702 non-null   float64
 3   int_rate                    4702 non-null   float64
 4   installment                 4702 non-null   float64
 5   home_ownership              4702 non-null   object 
 6   annual_inc                  4702 non-null   float64
 7   verification_status         4702 non-null   object 
 8   loan_status                 4702 non-null   object 
 9   pymnt_plan                  4702 non-null   object 
 10  dti                         4702 non-null   float64
 11  delinq_2yrs                 4702 non-null   float64
 12  inq_last_6mths              4702 non-null   float64
 13  open_acc                    4702 

In [5]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [28]:
train_df['loan_status'].value_counts()


low_risk     6090
high_risk    6090
Name: loan_status, dtype: int64

In [14]:
train_df['home_ownership'].value_counts()

MORTGAGE    5800
RENT        4944
OWN         1371
ANY           65
Name: home_ownership, dtype: int64

In [None]:
train_df['verification_status'].value_counts()

In [16]:
train_df['pymnt_plan'].value_counts()


n    12180
Name: pymnt_plan, dtype: int64

In [17]:
train_df['hardship_flag'].value_counts()

N    11832
Y      348
Name: hardship_flag, dtype: int64

In [18]:
train_df['debt_settlement_flag'].value_counts()

N    12175
Y        5
Name: debt_settlement_flag, dtype: int64

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  12180 non-null  int64  
 1   index                       12180 non-null  int64  
 2   loan_amnt                   12180 non-null  float64
 3   int_rate                    12180 non-null  float64
 4   installment                 12180 non-null  float64
 5   home_ownership              12180 non-null  object 
 6   annual_inc                  12180 non-null  float64
 7   verification_status         12180 non-null  object 
 8   loan_status                 12180 non-null  object 
 9   pymnt_plan                  12180 non-null  object 
 10  dti                         12180 non-null  float64
 11  delinq_2yrs                 12180 non-null  float64
 12  inq_last_6mths              12180 non-null  float64
 13  open_acc                    121

In [98]:
# Convert categorical data to numeric and separate target feature for training data
dummies_train = pd.get_dummies(train_df, columns=['loan_status', 'home_ownership','verification_status','initial_list_status'
                                                  'pymnt_plan','hardship_flag', 'application_type' 
                                                  ,'debt_settlement_flag'],drop_first=True)
result=[train_df,dummies_train]
merged_train_df=pd.concat(result, axis=1)
merged_train_df


Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,loan_status_low_risk,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,initial_list_status_w,hardship_flag_Y,application_type_Joint App,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,1,1,0,0,0,0,1,0,0,0
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,1,1,0,0,1,0,1,0,0,0
2,321143,321143,20000.0,0.1240,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,1,1,0,0,1,0,1,0,0,0
3,11778,11778,3000.0,0.1240,100.22,RENT,45000.0,Not Verified,low_risk,n,...,1,0,0,1,0,0,1,0,0,0
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,1,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,RENT,28000.0,Not Verified,high_risk,n,...,0,0,0,1,0,0,1,0,0,0
12176,354944,354944,15000.0,0.1774,540.34,RENT,50000.0,Verified,high_risk,n,...,0,0,0,1,0,1,1,0,0,0
12177,354973,354973,3600.0,0.1862,131.28,RENT,60000.0,Not Verified,high_risk,n,...,0,0,0,1,0,0,1,0,0,0
12178,355002,355002,15000.0,0.0881,475.68,MORTGAGE,62000.0,Source Verified,high_risk,n,...,0,1,0,0,1,0,1,0,1,0


In [99]:
#dropped all the categorical columns
final_train_df = merged_train_df.drop(['loan_status', 'home_ownership','verification_status','initial_list_status'
                                                  ,'pymnt_plan','hardship_flag', 'application_type' 
                                                  ,'debt_settlement_flag'], axis='columns')
final_train_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,loan_status_low_risk,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,initial_list_status_w,hardship_flag_Y,application_type_Joint App,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,1,1,0,0,0,0,1,0,0,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,1,1,0,0,1,0,1,0,0,0
2,321143,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,...,1,1,0,0,1,0,1,0,0,0
3,11778,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,...,1,0,0,1,0,0,1,0,0,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,1,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,...,0,0,0,1,0,0,1,0,0,0
12176,354944,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,...,0,0,0,1,0,1,1,0,0,0
12177,354973,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,...,0,0,0,1,0,0,1,0,0,0
12178,355002,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,...,0,1,0,0,1,0,1,0,1,0


In [100]:
#rename the Loan status dummies to loan status
final_train_df= final_train_df.rename(columns={'loan_status_low_risk': 'loan_status'})
print(final_train_df['loan_status'])

0        1
1        1
2        1
3        1
4        1
        ..
12175    0
12176    0
12177    0
12178    0
12179    0
Name: loan_status, Length: 12180, dtype: uint8


In [110]:
# Convert categorical data to numeric and separate target feature for testing data
dummies_test = pd.get_dummies(test_df, columns=['loan_status', 'home_ownership','verification_status'
                                                 ,'initial_list_status','pymnt_plan','hardship_flag'
                                                 ,'application_type','debt_settlement_flag'],drop_first=True)
result=[test_df,dummies_test]
merged_test_df=pd.concat(result, axis=1)
merged_test_df
#dropped all the categorical columns
final_test_df = merged_test_df.drop(['loan_status', 'home_ownership','verification_status','initial_list_status'
                                                  ,'pymnt_plan','hardship_flag', 'application_type' 
                                                  ,'debt_settlement_flag'], axis='columns')

In [111]:
final_test_df= final_test_df.rename(columns={'loan_status_low_risk': 'loan_status'})
print(final_test_df['loan_status'])

0       1
1       1
2       1
3       1
4       1
       ..
4697    0
4698    0
4699    0
4700    0
4701    0
Name: loan_status, Length: 4702, dtype: uint8


In [66]:
#standard correlation coeficient between loan status and all other data
corr_matrix=final_test_df.corr()
corr_matrix['loan_status'].sort_values(ascending=False).head(10)


loan_status        1.000000
total_rec_prncp    0.395376
total_rec_prncp    0.395376
total_pymnt_inv    0.377125
total_pymnt_inv    0.377125
total_pymnt        0.377104
total_pymnt        0.377104
last_pymnt_amnt    0.240257
last_pymnt_amnt    0.240257
out_prncp_inv      0.168154
Name: loan_status, dtype: float64

In [114]:
# add missing dummy variables to testing set
X=final_train_df.drop('loan_status',axis="columns")
y=final_train_df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [119]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
print(f"Training Score for Logistic Regression Model is: {model.score(X_train, y_train)}")

# for the test Sccore
X=final_test_df.drop('loan_status',axis="columns")
y=final_test_df['loan_status']

#print(f"Testing Data Score: {model.score(X, y)}")
#predictions = model.predict(X)

# linear logistic model does not appear to be a very good model since the  score is low 

Training Score for Logistic Regression Model is: 0.655719759168035


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [138]:
# Train a Random Forest Classifier model and print the model score
X = final_test_df.drop('loan_status',axis='columns')
y = final_test_df['loan_status']
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X, y)
print(f"Training Score for Random Forest Classifier is: {clf.score(X, y)}")

#perfect score so Random Forest Classifier model is a better model to predict 


Training Score for Random Forest Classifier is: 1.0


In [144]:
# Scale the data

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-0.65414693, -0.65414693, -1.21011981, ..., -0.1694586 ,
        -0.41552046, -0.01812499],
       [ 1.5929921 ,  1.5929921 , -1.327905  , ..., -0.1694586 ,
        -0.41552046, -0.01812499],
       [ 0.83663139,  0.83663139, -1.21011981, ..., -0.1694586 ,
        -0.41552046, -0.01812499],
       ...,
       [ 0.41312268,  0.41312268,  0.16404075, ..., -0.1694586 ,
        -0.41552046, -0.01812499],
       [ 1.67466527,  1.67466527, -0.22857655, ..., -0.1694586 ,
        -0.41552046, -0.01812499],
       [ 0.13666574,  0.13666574,  0.85112103, ..., -0.1694586 ,
         2.40662036, -0.01812499]])

In [151]:
# Train the Logistic Regression model on the scaled data and print the model score
model = LogisticRegression()
model.fit(X_train_scaled,y_train)
print(f"Scaled Training Data Score: {model.score(X_train_scaled, y_train)}")



Scaled Training Data Score: 1.0


In [147]:
# Train a Random Forest Classifier model on the scaled data and print the model score
X, y = make_classification(random_state=1, n_features=50, n_informative=5, n_redundant=0)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [148]:
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.76
