In [611]:
import numpy as np
import pandas as pd
import pandas_profiling as pp
from sklearn.model_selection import train_test_split # to make the test-train split of the data
from sklearn.linear_model import LogisticRegression # for the Logistic Regression model
from sklearn.metrics import roc_auc_score # to calculate an ROC value of our model
from sklearn.metrics import confusion_matrix # to make confusion matrix

In [612]:
# read in the cleaned Lending Club loans data (relative path, resolved from the working directory)
loans_data_clean = pd.read_csv("../data/clean_data/lending_club_loans_cleaned.csv")
# make a working copy so that loans_data_clean itself is left unmodified by the cells below
df = loans_data_clean.copy()

In [613]:
# dimensions (rows, columns) of the working copy before any columns are dropped
df.shape

(40856, 14)

In [614]:
# Feature selection: remove every column that is not used by the model in one call.
# - 'int_rate' is dropped as it is highly correlated with fico
# - 'sub_grade' is dropped as it would create many dummy variables, and it is equivalent to fico
# - 'fico' and 'term' are deliberately absent from the list, so they are kept as predictors
df = df.drop(
    columns = [
        'int_rate',
        'sub_grade',
        'pub_rec',
        'addr_state',
        'dti',
        'home_ownership',
        'emp_length',
        'annual_inc',
        'loan_amnt',
        'verification_status',
        'issue_yr',
    ]
)

In [615]:
# dimensions after the drops: the row count is unchanged, only the column count shrinks
df.shape

(40856, 3)

In [616]:
# confirm which columns survived the drops
df.columns

Index(['defaulted', 'term', 'fico'], dtype='object')

In [617]:
#pp.ProfileReport(df)

In [618]:
# one-hot encode the remaining categorical columns; drop_first = True drops the first level
# of each, so 'term' ends up as the single indicator column 'term_60 months'
df = pd.get_dummies(df, drop_first = True)

# preview the first rows of the encoded frame
df.head()

Unnamed: 0,defaulted,fico,term_60 months
0,False,735.0,0
1,True,740.0,1
2,False,735.0,0
3,False,690.0,0
4,False,730.0,0


In [619]:
# separate the outcome from the inputs:
# the response is the 'defaulted' flag we want to predict...
loans_response = df.loc[:, "defaulted"]
# ...and the predictors are every other column
loans_predictors = df.drop("defaulted", axis = 1)

In [620]:
# we want a prediction model, so hold out part of the data for evaluation:
# 10% of the rows form the test set and the remaining 90% form the training set.
# 'random_state = 7' fixes the random selection so the split is reproducible between runs
(
    loans_pred_train,
    loans_pred_test,
    loans_resp_train,
    loans_resp_test,
) = train_test_split(
    loans_predictors,
    loans_response,
    test_size = 0.1,
    random_state = 7,
)

In [621]:
# Logistic regression is the appropriate regression analysis to conduct when
# the dependent variable (here 'defaulted') is dichotomous (binary).
# Build the Logistic Regression model with scikit-learn's default settings.

model = LogisticRegression()

# fit on the training split only; the test split is kept aside for evaluation
model.fit(loans_pred_train, loans_resp_train)

LogisticRegression()

In [622]:
# the model accuracy on the training data (share of loans classified correctly)
# NOTE(review): accuracy can look high simply because one class dominates - confirm the class balance
model.score(loans_pred_train, loans_resp_train)

0.8493608920315474

In [623]:
# the accuracy on the held-out test data; a value close to the training accuracy
# suggests the model is not overfitting the training split
model.score(loans_pred_test, loans_resp_test)

0.8463044542339696

In [624]:
# to prepare for the ROC AUC calculation, get the predicted probabilities of default from the model.
# predict_proba returns a 2d array in which each row sums to 1; we keep only the second column,
# i.e. the predicted probability of the positive class (the loan defaults)
pred_test = model.predict_proba(loans_pred_test)[:, 1]
# now calculate the area under the ROC curve on the test data
roc_auc_score(y_true = loans_resp_test, y_score = pred_test)
# an AUC of about 0.66 (0.5 would be no better than chance) indicates some predictive power of the model

0.6626333564189749

In [625]:

# Summarise the fitted model's coefficients, one row per predictor.
# The model was already fitted on the training data in the model-building cell, so it is
# not refitted here; 'fit' is kept as a name for the fitted estimator (model.fit() returns
# the estimator itself, so this is the same object the original refit produced).
fit = model

# "Standardised" multiplies each coefficient by the standard deviation of its predictor in
# the training data, which puts the coefficients on a comparable scale despite the
# predictors having different units (fico points vs. a 0/1 term indicator).
pd.DataFrame(
  {
  "Variable" : loans_pred_train.columns.values,
  "Coefficient" : fit.coef_[0],
  "Standardised": np.std(loans_pred_train, axis = 0)*fit.coef_[0]
  }
)

Unnamed: 0,Variable,Coefficient,Standardised
fico,fico,-0.01281,-0.462515
term_60 months,term_60 months,0.858591,0.371436


In [626]:
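# TODO: enable - confusion_matrix needs predicted labels for the test set, not the training predictors
# confusion_matrix(loans_resp_test, model.predict(loans_pred_test))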