In [22]:
# Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso

%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt

# Set seed for any random operations
random.seed(1)

In [23]:
# hyperparameters
alpha = 0.05

In [25]:
# load data
train = pd.read_csv("data/sorted_train0314.csv")
test = pd.read_csv("data/sorted_test0314.csv")
val = pd.read_csv("data/sorted_val0314.csv")

train = train[train.LoanStatus == "CHGOFF"]
test = test[test.LoanStatus == "CHGOFF"]
val = val[val.LoanStatus == "CHGOFF"]

# filtering for the columns you don't want to include for your model
c = train.columns.tolist()
dropped_columns = ['LoanStatus', 'GrossChargeOffAmount', 'ChargeOffDate', 'BorrZip', 'CDC_Zip', 'BorrCity',
                   'CDC_City', 'ThirdPartyLender_City', 'ProjectCounty', 'ApprovalDate']
for col in dropped_columns:
    c.remove(col)

# seperate data into covariates and targets
x_train = train[c]
x_test = test[c]
x_val = val[c]

x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)
x_val = pd.get_dummies(x_val)

def add_missing_columns(d1, d2):
    missing_cols_2 = (set(d1.columns) - set(d2.columns))
    missing_cols_1 = (set(d2.columns) - set(d1.columns))
    for c in missing_cols_2:
        d2[c] = 0
    for c in missing_cols_1:
        d1[c] = 0
    return d1, d2

x_train, x_test = add_missing_columns(x_train, x_test)
x_train, x_val = add_missing_columns(x_train, x_val)
x_test, x_val = add_missing_columns(x_test, x_val)

print x_train.shape

(5340, 264)


In [29]:
# create a new column for the loss ratio
y_train = np.minimum(train.GrossChargeOffAmount.values/train.GrossApproval.values,1.)
y_test = np.minimum(test.GrossChargeOffAmount.values/test.GrossApproval.values,1.)
y_val = np.minimum(val.GrossChargeOffAmount.values/val.GrossApproval.values,1.)

print y_train[:6]

[0.         0.74289104 0.80586228 1.         0.75090533 0.91868092]


In [39]:
# hyperparameter search
alpha_range = [2**i for i in range(-5,5)]
tol_range = [10**i for i in range(-8, -1)]
loss_results = {}
for alpha in alpha_range:
    for tol in tol_range:
        model = Lasso(alpha = alpha, tol=tol)
        model.fit(x_train, y_train)
        loss_results[np.sum((model.predict(x_test) - y_test)**2)/len(y_test)] = (alpha, tol)
opt_loss = np.min(loss_results.keys())
opt_setting = loss_results[opt_loss]

print opt_setting, ":", opt_loss

(0.25, 0.01) : 0.05625732820492848


In [40]:
opt_alpha, opt_tol = opt_setting

# Fit to a baseline linear regression model
model = Lasso(alpha = opt_alpha, tol = opt_tol)
model.fit(x_train, y_train)

# Test validity
print(model.score(x_train,y_train))
l2_loss = np.sum((model.predict(x_test) - y_test)**2)/len(y_test)
print(loss)

0.3496702319833837
0.05885626762883776


In [None]:
# LINK: https://towardsdatascience.com/simple-and-multiple-linear-regression-in-python-c928425168f9

In [None]:
# NOTE: When this is a good model, move this code into the other loss notebook in order to run 
# everything to completion

In [16]:
# # Read in train dataset from hazard model pre-processing because we want the duration instead of gross approval date
# # (TODO: replace with final train data.)
# print "Number of train points:", train_data.shape[0]

# # Only want defaulted loans
# train_data = train_data[train_data.LoanStatus == 'CHGOFF']
# print "Number of defaulted train points:", train_data.shape[0]

# train_data.head()

Number of train points: 43844
Number of defaulted train points: 5340


Unnamed: 0,BorrCity,BorrState,BorrZip,CDC_City,CDC_State,CDC_Zip,ThirdPartyLender_City,ThirdPartyLender_State,GrossApproval,ApprovalDate,...,Yearly_Unemployment_Rate,Log_GrossApproval_Norm,Log_Yearly_Unemployment_Rate_Norm,Log_HPI_Norm,ThirdPartyDollars_Norm,TermMultipleYear,RepeatBorrower,BankStateneqBorrowerState,ProjectStateneqBorrowerState,2DigitNaics
9,WINCHESTER,NH,3470,Portsmouth,NH,3801.0,Peterborough,NH,117000.0,2005-09-09,...,3.6,-1.326186,-1.202145,1.205491,-0.069791,True,0,False,False,44
32,PRESCOTT VALLEY,AZ,86314,Tuscon,AZ,85711.0,MISSING,MISSING,670000.0,2002-03-08,...,6.1,0.910007,0.656163,-0.044474,-0.377878,True,0,False,False,72
51,NORTH TONAWANDA,NY,14120,Albany,NY,12207.0,MISSING,MISSING,167000.0,1996-09-30,...,6.3,-0.870235,0.769845,-0.406093,-0.377878,True,0,False,False,71
54,PITTSBURGH,PA,15203,Pittsburgh,PA,15203.0,MISSING,MISSING,127000.0,2000-08-11,...,4.1,-1.221093,-0.74386,-0.487009,-0.377878,True,0,False,False,MI
56,COLONIE,NY,12205,Albany,NY,12207.0,MISSING,MISSING,750000.0,1997-09-29,...,6.4,1.054544,0.82534,-0.380881,-0.377878,True,0,False,False,MI
