In [1]:
import pandas as pd
import numpy as np

In [19]:
# Load the three data splits (train / test / val) from the "random" split files.
# The "sorted" variant of the same split lives alongside them as
#   output/sorted_train0310.csv, output/sorted_test0310.csv, output/sorted_val0310.csv
# -- swap the three paths below to use it instead.
train, test, val = map(
    pd.read_csv,
    (
        "output/random_train0310.csv",
        "output/random_test0310.csv",
        "output/random_val0310.csv",
    ),
)

In [16]:
# Class-balance summary: for train first, then test, print the number of
# charged-off loans and their share of all rows in that split.
# (The printed "Percentage" value is a fraction in [0, 1], not multiplied by 100.)
for split_frame in (train, test):
    chargeoff_count = (split_frame['LoanStatus'] == "CHGOFF").sum()
    default_share = float(chargeoff_count) / len(split_frame)
    print('%s %s' % ('Chargeoffs:', chargeoff_count))
    print('%s %s' % ('Percentage of loans that defaulted:', default_share))

Chargeoffs: 7124
Percentage of loans that defaulted: 0.162481468811
Chargeoffs: 891
Percentage of loans that defaulted: 0.162561576355


In [21]:
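# If the data was imported from the sorted split, the train set needs to be
# downsampled so that the classes are more balanced.
# The function below can be called several times in a row: each call drops about
# half of the remaining non-CHGOFF rows, so the majority class shrinks geometrically.
# Every non-CHGOFF sample (e.g. PIF) is dropped with 50% probability.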

def drop_half(train, drop_prob=0.5, keep_label='CHGOFF', random_state=None):
    """Randomly downsample the rows whose 'LoanStatus' differs from `keep_label`.

    Each row with ``train['LoanStatus'] != keep_label`` is dropped independently
    with probability `drop_prob`; rows equal to `keep_label` are always kept.
    The row counts before and after are printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a 'LoanStatus' column. It is not modified.
    drop_prob : float, default 0.5
        Probability of dropping each candidate row (a uniform draw in [0, 1)
        is compared with ``<``).
    keep_label : default 'CHGOFF'
        'LoanStatus' value that is never dropped.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With None the global
        ``np.random`` generator is used, one draw per candidate row in row
        order, exactly as the previous row-by-row loop consumed it.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows in their original order.
    """
    n_before = train.shape[0]
    print('%s %s' % ('before drop: ', n_before))

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Positional boolean array: True where the row is eligible for dropping.
    is_candidate = (train['LoanStatus'] != keep_label).values
    draws = rng.rand(int(is_candidate.sum()))

    # Select by position rather than by index label: dropping labels would also
    # remove kept rows that happen to share a label when the index is not unique.
    drop_mask = np.zeros(n_before, dtype=bool)
    drop_mask[is_candidate] = draws < drop_prob

    train = train[~drop_mask]
    print('%s %s' % ('after drop: ', train.shape[0]))
    return train
    
# Seed NumPy's global generator so the random downsampling below gives the same
# rows on every Restart & Run All.
DOWNSAMPLE_SEED = 42
np.random.seed(DOWNSAMPLE_SEED)

# NOTE: this overwrites `train`, so re-running the cell downsamples the already
# reduced frame again; reload the CSVs first to start from the full data.
train = drop_half(train)
# The matching call for the test split is disabled:
# test = drop_half(test)
# NOTE(review): the saved output under this cell shows two before/after pairs,
# so it appears to come from a run where `test` was downsampled too -- re-run
# the notebook to refresh the outputs.

# Class balance after downsampling, train first, then test.
for split_frame in (train, test):
    chargeoff_count = (split_frame['LoanStatus'] == "CHGOFF").sum()
    print('%s %s' % ('Chargeoffs:', chargeoff_count))
    print('%s %s' % ('Percentage of loans that defaulted:',
                     float(chargeoff_count) / len(split_frame)))

before drop:  43845
after drop:  25383
before drop:  5481
after drop:  3186
Chargeoffs: 7124
Percentage of loans that defaulted: 0.280660284442
Chargeoffs: 891
Percentage of loans that defaulted: 0.279661016949


In [22]:
# Build the list of feature columns `c`: every column of `train` except the
# ones named below (the target 'LoanStatus', the charge-off date/amount columns
# and the zip/city columns).
# list.remove raises ValueError if any of these columns is absent from `train`.
excluded_columns = (
    'LoanStatus',
    'ChargeOffDate',
    'GrossChargeOffAmount',
    'BorrZip',
    'CDC_Zip',
    'BorrCity',
    'CDC_City',
    'ThirdPartyLender_City',
)
c = train.columns.tolist()
for column_name in excluded_columns:
    c.remove(column_name)
print(c)

['BorrState', 'CDC_State', 'ThirdPartyLender_State', 'ApprovalDate', 'ApprovalFiscalYear', 'DeliveryMethod', 'subpgmdesc', 'TermInMonths', 'ProjectCounty', 'ProjectState', 'BusinessType', 'SP500_Yearly_Return', 'CPI', 'Log_GrossApproval_Norm', 'Log_HPI_Norm', 'ThirdPartyDollars_Norm', 'TermMultipleYear', 'RepeatBorrower', 'BankStateneqBorrowerState', 'ProjectStateneqBorrowerState', '2DigitNaics']


In [23]:
# Classification target for the logit model: the raw 'LoanStatus' labels.
y_train_logit, y_test_logit = train['LoanStatus'], test['LoanStatus']

# Other candidate targets, currently unused:
#   hazard_target = train['ChargeOffDate']
#   amount_target = train['GrossChargeOffAmount']

# Feature matrices: keep only the columns listed in `c` and one-hot encode the
# categorical ones. Train and test are encoded independently, so their dummy
# columns can differ at this point. (`val` is not encoded here.)
x_train = pd.get_dummies(train[c])
x_test = pd.get_dummies(test[c])

In [24]:
# to get consistent feature dimensions for both train and test dataset
def add_missing_dummy_columns(d1, d2):
    """Return `d2` extended with an all-zero column for every column only in `d1`.

    Parameters
    ----------
    d1 : pandas.DataFrame
        Reference frame; only its column labels are read.
    d2 : pandas.DataFrame
        Frame to extend. It is NOT modified: when columns are missing, a new
        frame is returned (earlier versions inserted the columns into `d2`
        in place, one at a time).

    Returns
    -------
    pandas.DataFrame
        `d2` itself when nothing is missing; otherwise a new frame with d2's
        columns first, followed by the missing columns in the order they
        appear in `d1`, each filled with the integer 0.
    """
    present = set(d2.columns)
    # Walk d1's columns (not a set difference) so the added columns come out in
    # a deterministic order.
    missing_cols = [col for col in d1.columns if col not in present]
    if not missing_cols:
        return d2

    # Build all zero columns at once and attach them with a single concat
    # instead of thousands of individual column insertions.
    zeros = pd.DataFrame(0, index=d2.index, columns=missing_cols)
    return pd.concat([d2, zeros], axis=1)

# Shapes of the encoded train/test matrices before their columns are aligned.
print('before fix columns: ')
for feature_frame in (x_train, x_test):
    print(feature_frame.shape)

def fix_columns(x_train, x_test):
    """Return `x_test` with exactly the columns of `x_train`, in the same order.

    Columns present in `x_train` but absent from `x_test` are added and filled
    with 0; columns present only in `x_test` are discarded.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Reference frame; only its column labels are read.
    x_test : pandas.DataFrame
        Frame to align. It is not modified. Its column labels must be unique
        (``reindex`` raises on duplicate labels).

    Returns
    -------
    pandas.DataFrame
        New frame with x_test's rows and x_train's columns.
    """
    # One reindex adds the missing columns, drops the extra ones and reorders
    # the rest, without touching the caller's frame.
    return x_test.reindex(columns=x_train.columns, fill_value=0)

# Align the test matrix to the training columns, then show both shapes again.
x_test = fix_columns(x_train, x_test)

print('after fix columns: ')
for feature_frame in (x_train, x_test):
    print(feature_frame.shape)

before fix columns: 
(25383, 6519)
(3186, 3197)
after fix columns: 
(25383, 6519)
(3186, 6519)


In [25]:
# x_train and x_test now have identical columns and are ready to be used for model fitting.

In [26]:
# Model fitting. Below is an example that fits an L1-penalised logistic
# regression on the one-hot encoded features.
from sklearn.linear_model import LogisticRegression

solver = 'saga'

# NOTE(review): max_iter=10 is very small for saga and the optimiser has
# probably not converged; the saved classification report further down shows
# 0.00 recall on CHGOFF. Consider raising max_iter and/or scaling features.
logit_params = dict(
    solver=solver,
    multi_class='multinomial',
    C=1,
    penalty='l1',
    fit_intercept=True,
    max_iter=10,
    random_state=42,
)
lr = LogisticRegression(**logit_params)

# Kept as the last expression so the fitted estimator's repr is displayed.
lr.fit(x_train, y_train_logit)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10, multi_class='multinomial',
          n_jobs=1, penalty='l1', random_state=42, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [27]:
# Accuracy on the training data (a low value here would point to underfitting).
y_pred_train = lr.predict(x_train)
train_hits = np.sum(y_pred_train == y_train_logit)
print('%s %s' % ('train: ', train_hits * 1. / y_train_logit.shape[0]))

# Accuracy on the test data (a large gap to the training value would point to
# overfitting).
y_pred_test = lr.predict(x_test)
test_hits = np.sum(y_pred_test == y_test_logit)
print('%s %s' % ('test: ', test_hits * 1. / y_test_logit.shape[0]))

train:  0.719221526218
test:  0.720652856246


In [28]:
from sklearn import metrics

# Per-class precision / recall / F1 of the test-set predictions.
test_report = metrics.classification_report(y_test_logit, y_pred_test)
print(test_report)

             precision    recall  f1-score   support

     CHGOFF       1.00      0.00      0.00       891
        PIF       0.72      1.00      0.84      2295

avg / total       0.80      0.72      0.60      3186

