In [2]:
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import numpy as np

In [3]:
loan_data_inputs_train = pd.read_csv(r'D:\Datasets\LendingClub\loan_data_inputs_train.csv', index_col=0)
loan_data_targets_train = pd.read_csv(r'D:\Datasets\LendingClub\loan_data_targets_train.csv', index_col=0).values.reshape(-1,)
loan_data_inputs_test = pd.read_csv(r'D:\Datasets\LendingClub\loan_data_inputs_test.csv', index_col=0)
loan_data_targets_test = pd.read_csv(r'D:\Datasets\LendingClub\loan_data_targets_test.csv', index_col=0).values.reshape(-1,)

In [4]:
print(loan_data_inputs_train.shape)
print(loan_data_targets_train.shape)
print(loan_data_inputs_test.shape)
print(loan_data_targets_test.shape)

(373028, 322)
(373028,)
(93257, 322)
(93257,)


In [5]:
# logistic regression
# easy to understand and interpret
# logistic function
# P(X) = e**X / (1 - e**X)
# probability of event is exponential of linear combination of coefficients dividend by 1 + the same exponential
# P(of event occuring)/1-P(of event not occurring) = exponential of linear combination of coefficients
# log odds
# dice
# 1/6 (probability of getting a 1)
# 5/6 (probability of not getting a 1)
# 1:5 odds or 0.2
# log odds (P(Y=1)/1-P(Y=0)) = B0 + B1X1 + B2X2+ ... BmXm
# all the independent variables either 0 or 1
# the coefficient shows us the difference in log odds caused in the final estimation by changing the variable from 0 to 1
# to get the odds we take e**(coefficient[B1])
# eg. if e**(B1) = 2.25
# and B1 is higher having a mortgage and not being in default
# 2.25 times the odds of having a mortgage and being in default
# 125% more likely to not be in default if have a mortgage than not

### Selecting the Features

In [6]:
inputs_train_with_ref_cat = loan_data_inputs_train.loc[: , ['grade:A',
'grade:B',
'grade:C',
'grade:D',
'grade:E',
'grade:F',
'grade:G',
'home_ownership:RENT_OTHER_NONE_ANY',
'home_ownership:OWN',
'home_ownership:MORTGAGE',
'addr_state:ME_DC_WY_WV_VT_MS',
'addr_state:NY',
'addr_state:RI_NM_NC_OK_SD_VA_LA',
'addr_state:MD_NJ_MO',
'addr_state:CA',
'addr_state:ID_TN',
'addr_state:AZ_MI_UT_AR_PA_OH_MN_IN',
'addr_state:KY_MA_GA_WA_WI_DE',
'addr_state:OR_IL',
'addr_state:TX',
'addr_state:CT_SC_AK',
'addr_state:KS_CO_MT_NH',
'verification_status:Verified',
'verification_status:Source Verified',
'verification_status:Not Verified',
'purpose:edu_sb_re_m_h_other',
'purpose:med_wed_vac',
'purpose:debt_consolidation',
'purpose:hi_mp_car',
'purpose:credit_card',
'initial_list_status:f',
'initial_list_status:w',
'term:36',
'term:60',
'emp_length:0',
'emp_length:1',
'emp_length:2-4',
'emp_length:5-6',
'emp_length:7-9',
'emp_length:10',
'mths_since_issue_d:<38',
'mths_since_issue_d:38-39',
'mths_since_issue_d:<40-41',
'mths_since_issue_d:<42-48',
'mths_since_issue_d:<49-52',
'mths_since_issue_d:<53-64',
'mths_since_issue_d:<65-84',
'mths_since_issue_d:>84',
'int_rate:<9.548',
'int_rate:9.548-12.025',
'int_rate:12.025-15.74',
'int_rate:15.74-20.281',
'int_rate:>20.281',
'mths_since_earliest_cr_line:<142',
'mths_since_earliest_cr_line:142-266',
'mths_since_earliest_cr_line:266-355',
'mths_since_earliest_cr_line:>355',
'delinq_2yrs:0',
'delinq_2yrs:1-3',
'delinq_2yrs:>=4',
'inq_last_6mths:0',
'inq_last_6mths:1-2',
'inq_last_6mths:3-6',
'inq_last_6mths:>6',
'open_acc:0',
'open_acc:1-3',
'open_acc:4-12',
'open_acc:13-17',
'open_acc:18-22',
'open_acc:23-25',
'open_acc:26-30',
'open_acc:>=31',
'pub_rec:0-2',
'pub_rec:3-4',
'pub_rec:>=5',
'total_acc:<=25',
'total_acc:25-51',
'total_acc:>=51',
'acc_now_delinq:0',
'acc_now_delinq:>=1',
'total_rev_hi_lim:<=5K',
'total_rev_hi_lim:5K-10K',
'total_rev_hi_lim:10K-20K',
'total_rev_hi_lim:20K-30K',
'total_rev_hi_lim:30K-40K',
'total_rev_hi_lim:40K-55K',
'total_rev_hi_lim:55K-95K',
'total_rev_hi_lim:>95K',
'annual_inc:<20K',
'annual_inc:20K-30K',
'annual_inc:30K-40K',
'annual_inc:40K-50K',
'annual_inc:50K-60K',
'annual_inc:60K-70K',
'annual_inc:70K-80K',
'annual_inc:80K-90K',
'annual_inc:90K-100K',
'annual_inc:100K-120K',
'annual_inc:120K-140K',
'annual_inc:>140K',
'mths_since_last_delinq:Missing',
'mths_since_last_delinq:0-3',
'mths_since_last_delinq:4-30',
'mths_since_last_delinq:31-56',
'mths_since_last_delinq:>=57',
'dti:<=1.4',
'dti:1.4-3.5',
'dti:3.5-7.7',
'dti:7.7-10.5',
'dti:10.5-16.1',
'dti:16.1-20.3',
'dti:20.3-21.7',
'dti:21.7-22.4',
'dti:22.4-35',
'dti:>35',
'mths_since_last_record:Missing',
'mths_since_last_record:0-2',
'mths_since_last_record:3-20',
'mths_since_last_record:21-31',
'mths_since_last_record:32-80',
'mths_since_last_record:81-86',
'mths_since_last_record:>=86',
]]

In [7]:
# removing one dummy variable for k-1
ref_categories = ['grade:G',
                'home_ownership:RENT_OTHER_NONE_ANY',
                'addr_state:ND_NE_IA_NV_HI_FL_AL',
                'verification_status:Verified',
                'purpose:edu_sb_re_m_h_other',
                 'initial_list_status:f',
                 'term:60',
                 'emp_length:0',
                 'mths_since_issue_d:>84',
                 'int_rate:>20.281',
                 'mths_since_earliest_cr_line:<142',
                 'delinq_2yrs:>=4',
                 'inq_last_6mths:>6',
                 'open_acc:0',
                 'pub_rec:>=5',
                 'total_acc:<=25',
                 'acc_now_delinq:>=1',
                 'total_rev_hi_lim<=5K',
                 'annual_inc:<20K',
                 'mths_since_last_delinq:Missing',
                 'dti>35',
                 'mths_since_last_record:Missing']

In [8]:
# selecting the desires columns
sel_cols = [c for c in inputs_train_with_ref_cat.columns if c not in ref_categories]

In [9]:
# selecting columns that are not reference categories
inputs_train = loan_data_inputs_train[sel_cols]
# loan_data_targets_train
inputs_test = loan_data_inputs_test[inputs_train.columns]
# loan_data_targets_test

In [10]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression(max_iter=2000)

reg.fit(inputs_train, loan_data_targets_train)

In [11]:
reg.intercept_

array([-0.60606753])

In [12]:
reg.coef_

array([[ 1.10994621e+00,  8.87684270e-01,  6.88033165e-01,
         5.11485587e-01,  3.60316768e-01,  2.13199391e-01,
         7.45446463e-02,  1.11393565e-01,  4.91050661e-01,
         7.71636258e-02,  6.41510748e-02,  6.42106902e-02,
         8.30172696e-02,  3.40241894e-02,  1.41555550e-01,
         1.85759818e-01,  2.63607028e-01,  2.58482960e-01,
         3.28691239e-01,  3.64352046e-01, -2.37687445e-02,
         8.89005730e-02,  1.86771636e-01,  8.81655413e-02,
         1.65233262e-01,  1.99857882e-01,  4.86437348e-02,
         7.21710404e-02,  1.06338513e-01,  1.20397074e-01,
         8.16151825e-02,  8.51422286e-02,  1.31479499e-01,
         1.15250432e+00,  9.33648246e-01,  8.29513590e-01,
         6.07535183e-01,  4.32404896e-01,  1.80646642e-01,
        -7.07970879e-02,  9.51654571e-01,  5.97159221e-01,
         3.40111104e-01,  1.44764679e-01,  2.58795453e-02,
         9.18430478e-02,  1.11044453e-01,  8.15547064e-02,
         4.75084841e-02,  7.29885553e-01,  5.73362233e-0

In [13]:
feature_name = inputs_train.columns.values

In [14]:
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)
summary_table['Coefficients'] = np.transpose(reg.coef_)
# add the intercept
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,-0.606068
1,grade:A,1.109946
2,grade:B,0.887684
3,grade:C,0.688033
4,grade:D,0.511486
5,grade:E,0.360317
6,grade:F,0.213199
7,home_ownership:OWN,0.074545
8,home_ownership:MORTGAGE,0.111394
9,addr_state:ME_DC_WY_WV_VT_MS,0.491051


### Build a logistic modele with multivariate p-values

In [15]:
# P values for sklearn logistic regression.

# Class to display p-values for logistic regression in sklearn.

from sklearn import linear_model
import scipy.stats as stat

class LogisticRegression_with_p_values:
    
    def __init__(self,*args,**kwargs):#,**kwargs):
        self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args)

    def fit(self,X,y):
        self.model.fit(X,y)
        
        #### Get p-values for the fitted model ####
        denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
        denom = np.tile(denom,(X.shape[1],1)).T
        F_ij = np.dot((X / denom).T,X) ## Fisher Information Matrix
        Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates # z-score for eaach model coefficient
        p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores] ### two tailed test for p-values
        
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        self.p_values = p_values

In [16]:
reg = LogisticRegression_with_p_values(max_iter=2000)

In [17]:
reg.fit(inputs_train, loan_data_targets_train)

  sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))


In [18]:
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)
summary_table['Coefficients'] = np.transpose(reg.coef_)
# add the intercept
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,-0.606068
1,grade:A,1.109946
2,grade:B,0.887684
3,grade:C,0.688033
4,grade:D,0.511486
5,grade:E,0.360317
6,grade:F,0.213199
7,home_ownership:OWN,0.074545
8,home_ownership:MORTGAGE,0.111394
9,addr_state:ME_DC_WY_WV_VT_MS,0.491051


In [19]:
p_values = reg.p_values
# added nan for intercept
p_values = np.append(np.nan, np.array(p_values))

In [20]:
summary_table['p_values'] = p_values

In [21]:
summary_table

Unnamed: 0,Feature name,Coefficients,p_values
0,Intercept,-0.606068,
1,grade:A,1.109946,5.7267770000000005e-33
2,grade:B,0.887684,2.440938e-47
3,grade:C,0.688033,1.260455e-33
4,grade:D,0.511486,3.210966e-21
5,grade:E,0.360317,7.968438e-14
6,grade:F,0.213199,2.297782e-05
7,home_ownership:OWN,0.074545,0.0002232245
8,home_ownership:MORTGAGE,0.111394,3.374166e-18
9,addr_state:ME_DC_WY_WV_VT_MS,0.491051,1.116169e-19


In [22]:
# if one or few dummy variables representing one original indendent variable are
# statistically significant, it would be best to retain all the variables related that original independent variable
# if p < 0.05 = statistically significant
# remove 
#
# delinq_2yrs
# open_acc
# pub_rec
# total_rev_hi_lim
# total_acc
# 
# as not statistically significant

In [23]:
inputs_train_with_ref_cat = loan_data_inputs_train.loc[: , ['grade:A',
'grade:B',
'grade:C',
'grade:D',
'grade:E',
'grade:F',
'grade:G',
'home_ownership:RENT_OTHER_NONE_ANY',
'home_ownership:OWN',
'home_ownership:MORTGAGE',
'addr_state:ME_DC_WY_WV_VT_MS',
'addr_state:NY',
'addr_state:RI_NM_NC_OK_SD_VA_LA',
'addr_state:MD_NJ_MO',
'addr_state:CA',
'addr_state:ID_TN',
'addr_state:AZ_MI_UT_AR_PA_OH_MN_IN',
'addr_state:KY_MA_GA_WA_WI_DE',
'addr_state:OR_IL',
'addr_state:TX',
'addr_state:CT_SC_AK',
'addr_state:KS_CO_MT_NH',
'verification_status:Verified',
'verification_status:Source Verified',
'verification_status:Not Verified',
'purpose:edu_sb_re_m_h_other',
'purpose:med_wed_vac',
'purpose:debt_consolidation',
'purpose:hi_mp_car',
'purpose:credit_card',
'initial_list_status:f',
'initial_list_status:w',
'term:36',
'term:60',
'emp_length:0',
'emp_length:1',
'emp_length:2-4',
'emp_length:5-6',
'emp_length:7-9',
'emp_length:10',
'mths_since_issue_d:<38',
'mths_since_issue_d:38-39',
'mths_since_issue_d:<40-41',
'mths_since_issue_d:<42-48',
'mths_since_issue_d:<49-52',
'mths_since_issue_d:<53-64',
'mths_since_issue_d:<65-84',
'mths_since_issue_d:>84',
'int_rate:<9.548',
'int_rate:9.548-12.025',
'int_rate:12.025-15.74',
'int_rate:15.74-20.281',
'int_rate:>20.281',
'mths_since_earliest_cr_line:<142',
'mths_since_earliest_cr_line:142-266',
'mths_since_earliest_cr_line:266-355',
'mths_since_earliest_cr_line:>355',
#'delinq_2yrs:0',
#'delinq_2yrs:1-3',
#'delinq_2yrs:>=4',
'inq_last_6mths:0',
'inq_last_6mths:1-2',
'inq_last_6mths:3-6',
'inq_last_6mths:>6',
#'open_acc:0',
#'open_acc:1-3',
#'open_acc:4-12',
#'open_acc:13-17',
#'open_acc:18-22',
#'open_acc:23-25',
#'open_acc:26-30',
#'open_acc:>=31',
#'pub_rec:0-2',
#'pub_rec:3-4',
#'pub_rec:>=5',
#'total_acc:<=25',
#'total_acc:25-51',
#'total_acc:>=51',
'acc_now_delinq:0',
'acc_now_delinq:>=1',
#'total_rev_hi_lim:<=5K',
#'total_rev_hi_lim:5K-10K',
#'total_rev_hi_lim:10K-20K',
#'total_rev_hi_lim:20K-30K',
#'total_rev_hi_lim:30K-40K',
#'total_rev_hi_lim:40K-55K',
#'total_rev_hi_lim:55K-95K',
#'total_rev_hi_lim:>95K',
'annual_inc:<20K',
'annual_inc:20K-30K',
'annual_inc:30K-40K',
'annual_inc:40K-50K',
'annual_inc:50K-60K',
'annual_inc:60K-70K',
'annual_inc:70K-80K',
'annual_inc:80K-90K',
'annual_inc:90K-100K',
'annual_inc:100K-120K',
'annual_inc:120K-140K',
'annual_inc:>140K',
'mths_since_last_delinq:Missing',
'mths_since_last_delinq:0-3',
'mths_since_last_delinq:4-30',
'mths_since_last_delinq:31-56',
'mths_since_last_delinq:>=57',
'dti:<=1.4',
'dti:1.4-3.5',
'dti:3.5-7.7',
'dti:7.7-10.5',
'dti:10.5-16.1',
'dti:16.1-20.3',
'dti:20.3-21.7',
'dti:21.7-22.4',
'dti:22.4-35',
'dti:>35',
'mths_since_last_record:Missing',
'mths_since_last_record:0-2',
'mths_since_last_record:3-20',
'mths_since_last_record:21-31',
'mths_since_last_record:32-80',
'mths_since_last_record:81-86',
'mths_since_last_record:>=86',
]]

In [24]:
# selecting the desires columns
sel_cols = [c for c in inputs_train_with_ref_cat.columns if c not in ref_categories]

In [25]:
# selecting columns that are not reference categories
inputs_train = loan_data_inputs_train[sel_cols]
# loan_data_targets_train
inputs_test = loan_data_inputs_test[inputs_train.columns]
# loan_data_targets_test

In [26]:
reg2 = LogisticRegression_with_p_values(max_iter=2000)
reg2.fit(inputs_train, loan_data_targets_train)

In [27]:
feature_name = inputs_train.columns.values

In [28]:
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)
summary_table['Coefficients'] = np.transpose(reg2.coef_)
# add the intercept
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg2.intercept_[0]]
summary_table = summary_table.sort_index()
p_values = reg2.p_values
# added nan for intercept
p_values = np.append(np.nan, np.array(p_values))
summary_table['p_values'] = p_values
summary_table

Unnamed: 0,Feature name,Coefficients,p_values
0,Intercept,-0.568964,
1,grade:A,1.111614,3.753207e-33
2,grade:B,0.887381,1.946959e-47
3,grade:C,0.688519,9.294358e-34
4,grade:D,0.512656,2.323139e-21
5,grade:E,0.361757,5.995327e-14
6,grade:F,0.214676,1.995375e-05
7,home_ownership:OWN,0.075539,0.000179887
8,home_ownership:MORTGAGE,0.109237,1.027099e-17
9,addr_state:ME_DC_WY_WV_VT_MS,0.489534,1.407016e-19


### Saving the model

In [29]:
import pickle

In [30]:
pickle.dump(reg2, open('pd_model.sav', 'wb'))

## Interpreting coefficients

In [31]:
# base of natural logarithm to the exponent of the coefficient gives us the ratio of odds of default occurring given that coefficient being present to that coefficient not being present
# higher coefficients = greater odds of being a good borrower, or lower odds of default
# odds are always relative to the reference category
# AAA rating has coefficient 1.2
# CCC reference category 
# means AAA rating e**1.2 = 3.3 times greater odds of being a non-defaulting borrower than the reference category
# if BBB rating has coefficient 0.8 means e**0.8 = 2.2 times greater odds of being non-default than reference category
# can use this relationship to compare odds between categories
# 3.3/2.2 = 1.5, AAA 1.5 times greater odds of being non-defaulting borrower than BBB
# odds between categories is e**(B0 - B1) exponent to difference of coefficients

In [32]:
inputs_train.to_csv(r'D:\Datasets\LendingClub\inputs_train_with_ref_cat.csv')
inputs_test.to_csv(r'D:\Datasets\LendingClub\inputs_test_with_ref_cat.csv')