In [1]:
# Import relevant libraries
from scipy import stats
import scipy as sp
import pandas as pd
import numpy as np
import random

%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt

# Set seed for any random operations
random.seed(1)

In [2]:
# Read in train dataset from hazard model pre-processing because we want the duration instead of gross approval date
# (TODO: replace with final train data.)
train_data = pd.read_csv("random_hazard_train0314.csv")
test_data = pd.read_csv("random_hazard_test0314.csv")
print "Number of train points:", train_data.shape[0]

# Only want defaulted loans
train_default = train_data[train_data['Default?'] == 1].copy()
print "Number of defaulted train points:", train_default.shape[0]

test_default = test_data[test_data['Default?'] == 1].copy()
print "Number of defaulted test points:", test_default.shape[0]

train_default.head()

Number of train points: 43844
Number of defaulted train points: 7177
Number of defaulted test points: 848


Unnamed: 0,BorrCity,BorrState,BorrZip,CDC_City,CDC_State,CDC_Zip,ThirdPartyLender_City,ThirdPartyLender_State,GrossApproval,ApprovalDate,...,Log_Yearly_Unemployment_Rate_Norm,Log_HPI_Norm,ThirdPartyDollars_Norm,TermMultipleYear,RepeatBorrower,BankStateneqBorrowerState,ProjectStateneqBorrowerState,2DigitNaics,DaysToDefault,Default?
0,ANDERSON,CA,96007,Denver,CO,MISSING,MISSING,MISSING,658000.0,1999-08-26,...,0.093654,0.565254,-0.377878,True,0,True,False,MI,4907,1
7,FAIRHOPE,AL,36532,Mobile,AL,36693.0,MISSING,MISSING,192000.0,2001-01-04,...,0.025228,-0.72509,-0.377878,True,0,False,False,72,798,1
8,LEWISVILLE,TX,MISSING,Fort Worth,TX,MISSING,GRAPEVINE,TX,1338000.0,2006-11-22,...,-0.576027,-0.225936,2.300576,True,0,False,False,71,728,1
20,SEBASTOPOL,CA,95472,San Francisco,CA,94133.0,SANTA ROSA,CA,571000.0,2006-04-28,...,-0.115744,2.748243,0.547837,True,0,False,False,33,2580,1
21,BELLEFONTAINE,OH,43311,Columbus,OH,43204.0,SALT LAKE CITY,UT,540000.0,2005-01-18,...,0.538692,-0.11179,0.890075,True,0,False,False,71,2257,1


In [3]:
# Create new column with just loss ratio
loss_ratios = []
for index, row in train_default.iterrows():
    loss_ratio = row['GrossChargeOffAmount'] / row['GrossApproval']
    if loss_ratio >= 1.0:    # Cap at 1.0 because it doesn't really make sense to lose more than what was loaned
        loss_ratio = 1.0
    loss_ratios.append(loss_ratio)

train_default['LossRatio'] = pd.Series(loss_ratios).values
train_default['LossRatio'] = train_default['LossRatio'].astype(float)

loss_ratios = []
for index, row in test_default.iterrows():
    loss_ratio = row['GrossChargeOffAmount'] / row['GrossApproval']
    if loss_ratio >= 1.0:    # Cap at 1.0 because it doesn't really make sense to lose more than what was loaned
        loss_ratio = 1.0
    loss_ratios.append(loss_ratio)

test_default['LossRatio'] = pd.Series(loss_ratios).values
test_default['LossRatio'] = test_default['LossRatio'].astype(float)

In [4]:
train_default.head()

Unnamed: 0,BorrCity,BorrState,BorrZip,CDC_City,CDC_State,CDC_Zip,ThirdPartyLender_City,ThirdPartyLender_State,GrossApproval,ApprovalDate,...,Log_HPI_Norm,ThirdPartyDollars_Norm,TermMultipleYear,RepeatBorrower,BankStateneqBorrowerState,ProjectStateneqBorrowerState,2DigitNaics,DaysToDefault,Default?,LossRatio
0,ANDERSON,CA,96007,Denver,CO,MISSING,MISSING,MISSING,658000.0,1999-08-26,...,0.565254,-0.377878,True,0,True,False,MI,4907,1,0.578868
7,FAIRHOPE,AL,36532,Mobile,AL,36693.0,MISSING,MISSING,192000.0,2001-01-04,...,-0.72509,-0.377878,True,0,False,False,72,798,1,0.0
8,LEWISVILLE,TX,MISSING,Fort Worth,TX,MISSING,GRAPEVINE,TX,1338000.0,2006-11-22,...,-0.225936,2.300576,True,0,False,False,71,728,1,0.997633
20,SEBASTOPOL,CA,95472,San Francisco,CA,94133.0,SANTA ROSA,CA,571000.0,2006-04-28,...,2.748243,0.547837,True,0,False,False,33,2580,1,0.966783
21,BELLEFONTAINE,OH,43311,Columbus,OH,43204.0,SALT LAKE CITY,UT,540000.0,2005-01-18,...,-0.11179,0.890075,True,0,False,False,71,2257,1,0.715426


In [5]:
# Filter columns we don't want first
c = train_default.columns.tolist()
c.remove('LoanStatus')
c.remove('ChargeOffDate')
c.remove('GrossChargeOffAmount')
c.remove('BorrZip')
c.remove('CDC_Zip')
c.remove('BorrCity')

c.remove('BorrState')
c.remove('CDC_State')
c.remove('ThirdPartyLender_State')
c.remove('ProjectState')

c.remove('CDC_City')
c.remove('ProjectCounty')
c.remove('ThirdPartyLender_City')
c.remove('ApprovalDate')

c.remove('GrossApproval')
c.remove('Log_GrossApproval_Norm')
c.remove('Default?')
print c

['ApprovalFiscalYear', 'DeliveryMethod', 'subpgmdesc', 'TermInMonths', 'BusinessType', 'SP500_Yearly_Return', 'CPI', 'Yearly_Unemployment_Rate', 'Log_Yearly_Unemployment_Rate_Norm', 'Log_HPI_Norm', 'ThirdPartyDollars_Norm', 'TermMultipleYear', 'RepeatBorrower', 'BankStateneqBorrowerState', 'ProjectStateneqBorrowerState', '2DigitNaics', 'DaysToDefault', 'LossRatio']


In [6]:
# Get only columns/features we want
x_train = train_default[c]
print x_train.columns.tolist()
x_train = pd.get_dummies(x_train)
print x_train.shape

x_test = test_default[c]
print x_test.columns.tolist()
x_test = pd.get_dummies(x_test)
print x_test.shape

['ApprovalFiscalYear', 'DeliveryMethod', 'subpgmdesc', 'TermInMonths', 'BusinessType', 'SP500_Yearly_Return', 'CPI', 'Yearly_Unemployment_Rate', 'Log_Yearly_Unemployment_Rate_Norm', 'Log_HPI_Norm', 'ThirdPartyDollars_Norm', 'TermMultipleYear', 'RepeatBorrower', 'BankStateneqBorrowerState', 'ProjectStateneqBorrowerState', '2DigitNaics', 'DaysToDefault', 'LossRatio']
(7177, 50)
['ApprovalFiscalYear', 'DeliveryMethod', 'subpgmdesc', 'TermInMonths', 'BusinessType', 'SP500_Yearly_Return', 'CPI', 'Yearly_Unemployment_Rate', 'Log_Yearly_Unemployment_Rate_Norm', 'Log_HPI_Norm', 'ThirdPartyDollars_Norm', 'TermMultipleYear', 'RepeatBorrower', 'BankStateneqBorrowerState', 'ProjectStateneqBorrowerState', '2DigitNaics', 'DaysToDefault', 'LossRatio']
(848, 45)


In [16]:
# To get consistent feature dimensions for both train and test dataset
def add_missing_dummy_columns(d1, d2):
    missing_cols = set(d1.columns) - set(d2.columns)
    for c in missing_cols:
        d2[c] = 0
    return d2

print 'before fix columns: '
print x_train.shape
print x_test.shape

def fix_columns(x_train, x_test):  

    x_test = add_missing_dummy_columns(x_train, x_test)

    extra_cols = set(x_test.columns) - set(x_train.columns)
    x_test = x_test[x_train.columns]
    return x_test

x_test = fix_columns(x_train, x_test)
print 'after fix columns: '
print x_train.shape
print x_test.shape

print x_train.columns.tolist()

before fix columns: 
(7177, 50)
(848, 50)
after fix columns: 
(7177, 50)
(848, 50)
['ApprovalFiscalYear', 'TermInMonths', 'SP500_Yearly_Return', 'CPI', 'Yearly_Unemployment_Rate', 'Log_Yearly_Unemployment_Rate_Norm', 'Log_HPI_Norm', 'ThirdPartyDollars_Norm', 'TermMultipleYear', 'RepeatBorrower', 'BankStateneqBorrowerState', 'ProjectStateneqBorrowerState', 'DaysToDefault', 'LossRatio', 'DeliveryMethod_504', 'DeliveryMethod_504REFI', 'DeliveryMethod_ALP', 'DeliveryMethod_PCLP', 'subpgmdesc_504 Refinance', 'subpgmdesc_Sec. 504 - Delta loans, funded 9/26/95', 'subpgmdesc_Sec. 504 - Loan Guarantees - Private Sector Financed', 'subpgmdesc_Sec. 504 - Premier Certified Lender Program', 'BusinessType_CORPORATION', 'BusinessType_INDIVIDUAL', 'BusinessType_MISSING', 'BusinessType_PARTNERSHIP', '2DigitNaics_11', '2DigitNaics_21', '2DigitNaics_22', '2DigitNaics_23', '2DigitNaics_31', '2DigitNaics_32', '2DigitNaics_33', '2DigitNaics_42', '2DigitNaics_44', '2DigitNaics_45', '2DigitNaics_48', '2DigitN

In [8]:
from pandas.core import datetools
import statsmodels.api as sm

#  Split the explanatory variables and the variable we want to predict
X = x_train.drop(columns=['LossRatio'])
Y = x_train[['LossRatio']].copy()

# Fit to a baseline linear regression model
loss_model = sm.OLS(Y, X.astype(float)).fit()

  if __name__ == '__main__':


In [9]:
# Print out the statistics
loss_model.summary()

0,1,2,3
Dep. Variable:,LossRatio,R-squared:,0.428
Model:,OLS,Adj. R-squared:,0.425
Method:,Least Squares,F-statistic:,124.1
Date:,"Sat, 17 Mar 2018",Prob (F-statistic):,0.0
Time:,14:06:33,Log-Likelihood:,-634.07
No. Observations:,7177,AIC:,1356.0
Df Residuals:,7133,BIC:,1659.0
Df Model:,43,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ApprovalFiscalYear,0.0662,0.002,29.738,0.000,0.062,0.071
TermInMonths,0.0006,0.000,4.536,0.000,0.000,0.001
SP500_Yearly_Return,-0.0905,0.024,-3.772,0.000,-0.138,-0.043
CPI,-0.2258,0.053,-4.277,0.000,-0.329,-0.122
Yearly_Unemployment_Rate,-0.0254,0.012,-2.031,0.042,-0.050,-0.001
Log_Yearly_Unemployment_Rate_Norm,0.0395,0.018,2.144,0.032,0.003,0.076
Log_HPI_Norm,-0.0083,0.004,-1.887,0.059,-0.017,0.000
ThirdPartyDollars_Norm,0.0025,0.002,1.166,0.244,-0.002,0.007
TermMultipleYear,-0.2239,0.078,-2.881,0.004,-0.376,-0.072

0,1,2,3
Omnibus:,1356.914,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2537.326
Skew:,-1.172,Prob(JB):,0.0
Kurtosis:,4.729,Cond. No.,1.39e+16


In [10]:
X_test = x_test.drop(columns=['LossRatio'])
Y_test = x_test[['LossRatio']].copy()

In [12]:
# Make predictions
predictions = loss_model.predict(X_test.astype(float))
print(predictions)
num_vals = Y_test.shape[0]
test_vals = np.asarray(Y_test['LossRatio'].values.tolist())
pred_vals = np.asarray(predictions.values.tolist())
print "Test error:", abs(sum(test_vals - pred_vals))

predictions = loss_model.predict(X.astype(float))
num_vals = Y.shape[0]
train_vals = np.asarray(Y['LossRatio'].values.tolist())
pred_vals = np.asarray(predictions.values.tolist())
print "Train error:", abs(sum(train_vals - pred_vals))

6       0.847891
12      0.511862
20      0.881422
60      0.738839
77      0.890252
78      0.745231
86      0.874031
118     0.225872
127     0.797291
130     0.796150
134     0.336018
159     0.873301
163     0.417976
169     0.469790
171     0.387112
187    -0.001151
189     0.792153
194     0.438429
204     0.857606
218     0.641160
219     0.798641
223     0.802576
229     0.793886
234     0.667496
235     0.902497
237     0.929195
240     0.754016
249     0.858677
251     0.895985
274     0.769703
          ...   
5322    0.885228
5324    0.412295
5332    0.787874
5333    0.831845
5336    0.827864
5339    0.793184
5347    0.745392
5354    0.490295
5356    0.863526
5358   -0.163313
5359    0.761898
5363    0.916092
5365    0.236968
5381    0.768511
5391    0.436100
5401    0.677926
5402    0.835956
5404    0.792331
5410    0.728989
5419    0.682914
5436    0.864622
5438    0.809975
5448    0.678083
5451    0.886041
5453    0.895771
5465    0.812983
5468    0.859190
5473    0.8340

In [13]:
# Store model
%store loss_model

Stored 'loss_model' (RegressionResultsWrapper)


In [None]:
# LINK: https://towardsdatascience.com/simple-and-multiple-linear-regression-in-python-c928425168f9

In [None]:
# NOTE: When this is a good model, move this code into the other loss notebook in order to run 
# everything to completion

In [56]:
# Try Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 500, random_state = 42)

# Train the model on training data
print "Fitting model..."
rf.fit(X, Y.values.ravel())

# Predict
print "Making predictions..."
predictions_rf = rf.predict(X_test)
errors = abs(predictions - Y_test)

# Print out the mean absolute error (mae)
print 'Mean Absolute Error:', np.mean(errors)

Fitting model...
Making predictions...
Mean Absolute Error: 6           NaN
12          NaN
20          NaN
60          NaN
77          NaN
78          NaN
86          NaN
118         NaN
127         NaN
130         NaN
134         NaN
159         NaN
163         NaN
169         NaN
171         NaN
187         NaN
189         NaN
194         NaN
204         NaN
218         NaN
219         NaN
223         NaN
229         NaN
234         NaN
235         NaN
237         NaN
240         NaN
249         NaN
251         NaN
274         NaN
             ..
5324        NaN
5332        NaN
5333        NaN
5336        NaN
5339        NaN
5347        NaN
5354        NaN
5356        NaN
5358        NaN
5359        NaN
5363        NaN
5365        NaN
5381        NaN
5391        NaN
5401        NaN
5402        NaN
5404        NaN
5410        NaN
5419        NaN
5436        NaN
5438        NaN
5448        NaN
5451        NaN
5453        NaN
5465        NaN
5468        NaN
5473        NaN
5474        