In [235]:
# Data and Stats packages
import numpy as np
import pandas as pd
import re
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn import metrics, datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler



# Visualization packages
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.rcParams['figure.figsize'] = (13.0, 6.0)

# Other
import itertools

# Aesthetic settings
from IPython.display import display
pd.set_option('display.max_columns', 999)
pd.set_option('display.width', 500)
sns.set_style('whitegrid')
sns.set_context('talk')

#Suppress warnings
# NOTE(review): called with no category/module filter, so this hides every
# warning for the rest of the session - including any convergence or
# chained-assignment warnings the cells below may raise. Consider narrowing it.
import warnings
warnings.filterwarnings('ignore')


In [236]:
#import data
# Every export is read the same way: header=1 takes the column names from the
# second line of the file (the first line is skipped).
df_07to11, df_2016Q1, df_2016Q2, df_2016Q3, df_2016Q4 = (
    pd.read_csv(csv_path, header=1)
    for csv_path in (
        './Data/LoanStats3a.csv',
        './Data/LoanStats_2016Q1.csv',
        './Data/LoanStats_2016Q2.csv',
        './Data/LoanStats_2016Q3.csv',
        './Data/LoanStats_2016Q4.csv',
    )
)

In [237]:
#concat quarterly dfs for 2016
# Each quarterly frame was read with its own default 0..n-1 row labels, so a
# plain concat would repeat those labels once per quarter. ignore_index=True
# gives the combined frame a single unique 0..N-1 index instead, which keeps
# later label-based operations (drop, loc, aligned assignment) unambiguous.
df_16 = pd.concat(
    [df_2016Q1, df_2016Q2, df_2016Q3, df_2016Q4],
    ignore_index=True,
)

In [238]:
#figure out index when 2007 data begins
# The year is read from the last two characters of issue_d. Casting to str
# first means a missing issue_d (NaN) simply fails the comparison instead of
# raising when it is sliced.
issued_in_07 = df_07to11['issue_d'].astype(str).str[-2:].eq('07').to_numpy()
if not issued_in_07.any():
    # The original loop would run off the end of the column in this case.
    raise ValueError("no row of df_07to11 has an issue_d ending in '07'")

# argmax on a boolean array returns the position of the first True.
first_07_pos = int(issued_in_07.argmax())
print(first_07_pos)

39533


In [239]:
#drop all data points before this (2008-2011 data)
# 39533 is the position printed by the search cell above; the rows labelled
# 0..39532 are removed and everything from that label onwards is kept.
# NOTE(review): the number is hard-coded - re-check it if the CSV changes.
rows_before_07 = np.arange(39533)
df_07 = df_07to11.drop(index=rows_before_07)

In [240]:
#function to clean datasets
def clean(df):
    """Select the modelling columns of a raw loan table and make them numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan data. It must contain every column listed in ``preds``
        below; any other columns are ignored. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * ``term``, ``emp_length``, ``zip_code`` are floats,
        * ``int_rate`` and ``revol_util`` are fractions (percent / 100),
        * ``issue_d`` and ``earliest_cr_line`` are datetimes,
        * ``length_credit_history`` is the gap between those two dates in
          months (days / 30.44),
        * a missing ``mths_since_last_delinq`` is replaced by
          ``length_credit_history``,
        * ``home_ownership`` and ``addr_state`` are one-hot encoded,
        * ``loan_status`` is ``'paid'`` / ``'default'`` for the four
          "Fully Paid" / "Charged Off" variants and left unchanged otherwise,
        * every row that still contains a missing value has been dropped.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the required columns.
    """
    preds = ['loan_status'
         ,'loan_amnt'
         ,'funded_amnt'
         ,'term'
         ,'int_rate'
         ,'installment'
         ,'emp_length'
         ,'home_ownership'
         ,'annual_inc'
         ,'issue_d'
         ,'zip_code'
         ,'addr_state'
         ,'dti'
         ,'delinq_2yrs'
         ,'earliest_cr_line'
         ,'inq_last_6mths'
         ,'mths_since_last_delinq'
         ,'open_acc'
         ,'pub_rec'
         ,'revol_bal'
         ,'revol_util'
         ,'total_acc'
        ]
    days_per_month = 30.44

    # Explicit copy: the assignments below must not write into (or warn about)
    # a slice of the caller's frame.
    df = df[preds].copy()

    # Pull the number out of the text instead of stripping a character set:
    # str.strip/rstrip treat their argument as a set of characters, which only
    # happened to work for values such as ' 36 months' or '10+ years'.
    df['term'] = df['term'].str.extract(r'(\d+)', expand=False).astype('float')
    df['emp_length'] = df['emp_length'].str.extract(r'(\d+)', expand=False).astype('float')

    # Percent strings -> fractions.
    df['int_rate'] = df['int_rate'].str.rstrip('%').astype('float') / 100.0
    df['revol_util'] = df['revol_util'].str.rstrip('%').astype('float') / 100.0

    # Masked zip codes ('860xx') become a plain number (86000).
    df['zip_code'] = df['zip_code'].str.replace('xx', '00', regex=False).astype('float')

    df['issue_d'] = pd.to_datetime(df['issue_d'])
    df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'])

    # .dt.days works on every pandas version; casting a timedelta column to
    # 'timedelta64[D]' is rejected by pandas >= 2.0.
    credit_age = df['issue_d'] - df['earliest_cr_line']
    df['length_credit_history'] = credit_age.dt.days / days_per_month

    # No recorded delinquency: fall back to the length of the credit history.
    df['mths_since_last_delinq'] = np.where(
        df['mths_since_last_delinq'].isna(),
        df['length_credit_history'],
        df['mths_since_last_delinq'],
    )

    # Same column order as two consecutive get_dummies calls:
    # home_ownership dummies first, then addr_state dummies.
    df = pd.get_dummies(df, columns=['home_ownership', 'addr_state'])

    # Collapse the four terminal statuses; any other status passes through.
    status_map = {
        'Fully Paid': 'paid',
        'Charged Off': 'default',
        'Does not meet the credit policy. Status:Fully Paid': 'paid',
        'Does not meet the credit policy. Status:Charged Off': 'default',
    }
    df['loan_status'] = df['loan_status'].replace(status_map)

    return df.dropna()

In [241]:
#clean the dfs
# Both names are rebound to their cleaned versions, so this cell is meant to
# run once per kernel session.
df_07, df_16 = (clean(raw_frame) for raw_frame in (df_07, df_16))

In [242]:
#Make Loan Status Binary --> 1 for not default, 0 for default
# Every status other than 'default' maps to 1. The column is overwritten in
# place, so this cell is meant to run once per kernel session.
is_default_07 = df_07['loan_status'] == 'default'
df_07['loan_status'] = np.where(is_default_07, 0, 1)

In [243]:
# Same encoding for the 2016 data: 0 where the status is 'default', 1 otherwise.
is_default_16 = df_16['loan_status'] == 'default'
df_16['loan_status'] = np.where(is_default_16, 0, 1)

In [244]:
#Remove datetime columns and interest rates
cols_to_remove = ['int_rate', 'issue_d', 'earliest_cr_line']
df_07, df_16 = (
    frame.drop(columns=cols_to_remove)
    for frame in (df_07, df_16)
)

In [245]:
#Split data into train and test
# 80/20 split with a fixed random_state, applied identically to both datasets.
df_07_train, df_07_test = train_test_split(df_07, test_size=0.2, random_state=90)
df_16_train, df_16_test = train_test_split(df_16, test_size=0.2, random_state=90)


def _features_and_target(frame):
    """Return (X, y): all columns except loan_status, and the loan_status column."""
    return frame.drop(columns='loan_status'), frame['loan_status']


#Split to x and y
df_07_X_train, df_07_y_train = _features_and_target(df_07_train)
df_07_X_test, df_07_y_test = _features_and_target(df_07_test)

df_16_X_train, df_16_y_train = _features_and_target(df_16_train)
df_16_X_test, df_16_y_test = _features_and_target(df_16_test)

In [246]:
#Trivial Model in which all loans are accepted
# Each score is the sum of the labels divided by their count, i.e. the share
# of rows labelled 1 in that target vector.
def _share_of_ones(labels):
    """Return sum(labels) / len(labels)."""
    return np.sum(labels) / len(labels)


triv_mod_07_train = _share_of_ones(df_07_y_train)
triv_mod_07_test = _share_of_ones(df_07_y_test)
print('2007 Data\nTrain score: ', triv_mod_07_train, sep='')
print('Test score: ', triv_mod_07_test, sep='')

triv_mod_16_train = _share_of_ones(df_16_y_train)
triv_mod_16_test = _share_of_ones(df_16_y_test)
print('2016 Data\nTrain score: ', triv_mod_16_train, sep='')
print('Test score: ', triv_mod_16_test, sep='')

2007 Data
Train score: 0.7292912040990607
Test score: 0.7406143344709898
2016 Data
Train score: 0.8649847885797337
Test score: 0.8630602667849093


In [260]:
#Logistic regression on 2007 data
# Default LogisticRegressionCV, fitted on the raw (unscaled) training features.
log_mod07 = LogisticRegressionCV()
log_mod07.fit(df_07_X_train, df_07_y_train.to_numpy())

# .score() reports mean accuracy on the given features/labels.
log_mod07_train_score = log_mod07.score(df_07_X_train, df_07_y_train.to_numpy())
log_mod07_test_score = log_mod07.score(df_07_X_test, df_07_y_test.to_numpy())

for message, score in (
    ("The accuracy of Logistic Regression Model on Training Set is ", log_mod07_train_score),
    ("The accuracy of Logistic Regression Model on Testing Set is ", log_mod07_test_score),
):
    print(message, score)

The accuracy of Logistic Regression Model on Training Set is  0.7301451750640479
The accuracy of Logistic Regression Model on Testing Set is  0.7406143344709898


In [259]:
#Logistic regression on 2016 data
# Default LogisticRegressionCV, fitted on the raw (unscaled) training features.
log_mod16 = LogisticRegressionCV()
log_mod16.fit(df_16_X_train, df_16_y_train.to_numpy())

# .score() reports mean accuracy on the given features/labels.
log_mod16_train_score = log_mod16.score(df_16_X_train, df_16_y_train.to_numpy())
log_mod16_test_score = log_mod16.score(df_16_X_test, df_16_y_test.to_numpy())

for message, score in (
    ("The accuracy of Logistic Regression Model on Training Set is ", log_mod16_train_score),
    ("The accuracy of Logistic Regression Model on Testing Set is ", log_mod16_test_score),
):
    print(message, score)

The accuracy of Logistic Regression Model on Training Set is  0.864981709344862
The accuracy of Logistic Regression Model on Testing Set is  0.8630479498454224


In [261]:
#More models