In [34]:
import pandas as pd
import pandas_profiling as pp
from sklearn.model_selection import train_test_split # to make the test-train split of the data
from sklearn.linear_model import LogisticRegression # for the Logistic Regression model
from sklearn.metrics import roc_auc_score # to calculate an ROC value of our model

In [35]:
# load the cleaned Lending Club loans CSV from disk
loans_data_clean = pd.read_csv(filepath_or_buffer="../data/clean_data/lending_club_loans_cleaned.csv")
# work on an independent (deep) copy so the loaded frame itself stays untouched
df = loans_data_clean.copy(deep=True)

In [36]:
# dimensions of the working copy as (rows, columns)
df.shape

(40869, 14)

In [37]:
# optional: uncomment to generate a pandas-profiling report of the working copy
# pp.ProfileReport(df)

In [38]:
# drop 'int_rate': it is highly correlated with fico
# drop 'sub_grade': it would create many dummy variables, and it is equivalent to fico
# The result is derived from the untouched loaded frame rather than by mutating `df`
# in place, so re-running this cell yields the same frame instead of raising a KeyError
# for columns that were already removed.
df = loans_data_clean.drop(columns=['int_rate', 'sub_grade'])

In [39]:
df.columns

Index(['defaulted', 'loan_amnt', 'term', 'dti', 'fico', 'pub_rec',
       'emp_length', 'home_ownership', 'verification_status', 'addr_state',
       'annual_inc', 'issue_yr'],
      dtype='object')

In [40]:
# one-hot encode the categorical columns; drop_first keeps k-1 dummies for a k-level column
df = pd.get_dummies(data=df, drop_first=True)

# preview the first rows of the encoded frame
df.head()

Unnamed: 0,defaulted,loan_amnt,dti,fico,pub_rec,annual_inc,issue_yr,term_60 months,emp_length_10+ years,emp_length_2 years,...,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY
0,False,5000.0,27.65,735.0,0,24000,2011,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,True,2500.0,1.0,740.0,0,30000,2011,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,False,2400.0,8.72,735.0,0,12252,2011,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,False,10000.0,20.0,690.0,0,49200,2011,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,False,5000.0,11.2,730.0,0,36000,2011,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# separate the outcome from the features: 'defaulted' is the response,
# every other column is a predictor
loans_response = df.loc[:, "defaulted"]
loans_predictors = df.drop("defaulted", axis="columns")

In [42]:
# we want a prediction model, so hold out 10% of the rows as a test set
# and train on the remaining 90%;
# random_state is fixed at 7 so the random selection is reproducible
(
    loans_pred_train,
    loans_pred_test,
    loans_resp_train,
    loans_resp_test,
) = train_test_split(
    loans_predictors,
    loans_response,
    test_size=0.1,
    random_state=7,
)

In [43]:
# the response ('defaulted') is dichotomous (binary), which is the case
# logistic regression is designed for

# create the estimator with its default settings ...
model = LogisticRegression()

# ... and fit it on the training split (the fitted estimator is displayed as the cell output)
model.fit(X=loans_pred_train, y=loans_resp_train)

LogisticRegression()

In [44]:
# accuracy of the fitted model on the data it was trained on
model.score(X=loans_pred_train, y=loans_resp_train)

0.8481322385949649

In [45]:
# accuracy of the fitted model on the held-out test data
model.score(X=loans_pred_test, y=loans_resp_test)

0.857841937851725

In [46]:
# to compute the ROC AUC we need the predicted probabilities of default, not hard class labels
# predict_proba returns a 2d array with one column per class, where each row sums to 1;
# we keep only column 1, the probability of the second class in model.classes_
# (presumably defaulted == True, since the response is boolean -- verify via model.classes_)
pred_test = model.predict_proba(loans_pred_test)[:, 1]
# area under the ROC curve on the held-out test data
roc_auc_score(y_true = loans_resp_test, y_score = pred_test)
# an AUC of about 0.64 (0.5 would be random guessing) indicates only modest predictive power

0.6419248831361629