In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np
import random

# Set seeds for any random operations. Python's `random` module and NumPy keep
# separate generator state, so both are seeded to keep reruns reproducible.
random.seed(1)
np.random.seed(1)

In [2]:
# Read in train dataset: load train.csv (a path relative to the notebook's
# working directory) into the DataFrame the preprocessing cells operate on.
train_data = pd.read_csv("train.csv")

In [3]:
# What's the size of our dataset?
# A single pre-formatted string in parentheses prints the same text whether
# `print` is the Python 2 statement or the Python 3 function.
print("Number of samples: %d" % train_data.shape[0])

# Observe the first few rows.
train_data.head()

Number of samples: 43845


Unnamed: 0.1,Unnamed: 0,BorrName,BorrStreet,BorrCity,BorrState,BorrZip,CDC_Name,CDC_Street,CDC_City,CDC_State,...,ProjectState,BusinessType,LoanStatus,ChargeOffDate,GrossChargeOffAmount,Default?,ThirdPartyDollars_Norm,GrossApproval_Norm,TermInMonths_Norm,GrossChargeOffAmount_Norm
0,66986,PSYCHIATRIC MANAGE. & COUNSEL,BEAVER BROOK OFFICE PK #B-LL/,CHELMSFORD (CHELMSFORD CENTER),MA,MISSING,Granite State Economic Develop,One Cate Street,Portsmouth,NH,...,MA,CORPORATION,PIF,,0.0,False,-1.192669,-0.775136,0.237443,-0.243725
1,82447,HOLIDAY INN,2640 N CHERRY ROAD,ROCK HILL,SC,29730,Certified Development Corporat,111 Executive Center Drive,Columbia,SC,...,SC,CORPORATION,CHGOFF,2010-09-08 00:00:00,1228386.0,True,1.853541,2.697379,0.237443,6.981681
2,45549,COUNTRY INNS & SUITES BY CARLS,2260 LINCOLN HIGHWAY,LANCASTER,PA,17602,South Eastern Economic Develop,737 Constitution Drive,Exton,PA,...,PA,CORPORATION,PIF,,0.0,False,-2.265977e-16,-0.401511,-3.890653,-0.243725
3,23121,"SANYOU TECHNOLOGY, INC.",2056 BERING DRIVE,SAN JOSE,CA,95131,"Capital Access Group, Inc.",150 California Street,San Francisco,CA,...,CA,CORPORATION,PIF,,0.0,False,-2.265977e-16,-0.20057,0.237443,-0.243725
4,47452,"COUNTRY CARE, LLC",533 SOUTH 950 WEST,FARMINGTON,UT,84025,Mountain West Small Business F,2595 East 3300 South,Salt Lake City,UT,...,UT,CORPORATION,PIF,,0.0,False,-2.265977e-16,-0.404651,0.237443,-0.243725


In [4]:
# Define functions to do some additional pre-processing
def label_as_categorical(df):
    """Cast the code-like columns of ``df`` to strings.

    NaicsCode, BorrZip, CDC_Zip and ApprovalFiscalYear are converted with
    ``astype(str)``. The conversion is written back into ``df`` itself, and
    that same frame is returned.
    """
    code_like_cols = ['NaicsCode', 'BorrZip', 'CDC_Zip', 'ApprovalFiscalYear']
    as_text = df[code_like_cols].astype(str)
    df[code_like_cols] = as_text
    return df
    
def drop_redundant_cols(df):
    """Return a copy of ``df`` without the columns treated as redundant.

    Removes "Unnamed: 0", "BorrName", "BorrStreet", "NaicsDescription" and
    "LoanStatus". ``df`` itself is not modified; a KeyError is raised if any
    of the columns is absent.
    """
    redundant = ["Unnamed: 0", "BorrName", "BorrStreet", "NaicsDescription", "LoanStatus"]
    trimmed = df.drop(columns=redundant)
    return trimmed
    
def drop_norm_cols(df):
    """Return a copy of ``df`` without the four ``*_Norm`` columns.

    Removes "GrossApproval_Norm", "TermInMonths_Norm",
    "GrossChargeOffAmount_Norm" and "ThirdPartyDollars_Norm". ``df`` itself
    is not modified; a KeyError is raised if any of the columns is absent.
    """
    norm_cols = [
        "GrossApproval_Norm",
        "TermInMonths_Norm",
        "GrossChargeOffAmount_Norm",
        "ThirdPartyDollars_Norm",
    ]
    without_norms = df.drop(columns=norm_cols)
    return without_norms

def drop_interest_col(df):
    """Return a copy of ``df`` without the "InitialInterestRate" column.

    ``df`` itself is not modified; a KeyError is raised if the column is
    absent.
    """
    interest_cols = ["InitialInterestRate"]
    without_interest = df.drop(columns=interest_cols)
    return without_interest

In [5]:
# Define function to calculate the relative time to a default
from datetime import datetime

def calculate_days_to_default(approval, chargeoff, terms):
    """Return the number of days between loan approval and charge-off.

    Parameters
    ----------
    approval : str
        Approval date whose first whitespace-separated token is formatted
        as ``YYYY-MM-DD`` (e.g. ``"2010-09-08 00:00:00"``).
    chargeoff : str or null
        Charge-off date in the same format, or a null value (``None``/NaN)
        when the loan was never charged off.
    terms : number
        Loan term in months.

    Returns
    -------
    Days from ``approval`` to ``chargeoff``. When ``chargeoff`` is null, the
    loan term converted to days at 365 days per 12 months, rounded down.

    Raises
    ------
    ValueError
        If a non-null date does not match ``YYYY-MM-DD``.
    """
    # No default occurred, return TermInMonths in days.
    if pd.isnull(chargeoff):
        # Multiply before dividing and use floor division: `terms / 12 * 365`
        # drops the partial year under Python 2 integer division and yields a
        # float under Python 3. This form gives the same whole-day result on
        # both.
        return terms * 365 // 12

    approval_day = datetime.strptime(approval.split()[0], "%Y-%m-%d")
    chargeoff_day = datetime.strptime(chargeoff.split()[0], "%Y-%m-%d")
    return (chargeoff_day - approval_day).days

def add_relative_time_col(df):
    """Return a new frame with a ``DaysToDefault`` column added.

    ``DaysToDefault`` is computed for each row by
    ``calculate_days_to_default`` from the "ApprovalDate", "ChargeOffDate"
    and "TermInMonths" columns. The two date columns are then dropped and
    "Default?" is cast to int (True/False become 1/0).

    The input ``df`` is left unmodified; use the returned frame.
    """
    # Walk the three input columns in lockstep instead of using iterrows(),
    # which builds a full Series object for every row.
    days_to_default = [
        calculate_days_to_default(approval, chargeoff, terms)
        for approval, chargeoff, terms in zip(
            df["ApprovalDate"], df["ChargeOffDate"], df["TermInMonths"]
        )
    ]

    # assign() returns a new frame, so the caller's frame does not silently
    # gain a DaysToDefault column. The list is assigned positionally, so the
    # frame's index does not matter.
    result = df.assign(DaysToDefault=days_to_default)

    # Remove ApprovalDate, ChargeOffDate -- we no longer need these columns
    result = result.drop(columns=["ApprovalDate", "ChargeOffDate"])

    # Convert True/False to 0/1s for Default? column
    result["Default?"] = result["Default?"].astype(int)
    return result

In [6]:
# Run the preprocessing functions in order: cast the code-like columns to
# strings, drop the redundant and normalised columns, then add DaysToDefault.
train_data = (
    train_data
    .pipe(label_as_categorical)
    .pipe(drop_redundant_cols)
    .pipe(drop_norm_cols)
    .pipe(add_relative_time_col)
)

train_data.head()

Unnamed: 0,BorrCity,BorrState,BorrZip,CDC_Name,CDC_Street,CDC_City,CDC_State,CDC_Zip,ThirdPartyLender_Name,ThirdPartyLender_City,...,DeliveryMethod,subpgmdesc,TermInMonths,NaicsCode,ProjectCounty,ProjectState,BusinessType,GrossChargeOffAmount,Default?,DaysToDefault
0,CHELMSFORD (CHELMSFORD CENTER),MA,MISSING,Granite State Economic Develop,One Cate Street,Portsmouth,NH,MISSING,Enterprise Bank and Trust Comp,LOWELL,...,ALP,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,624110.0,MIDDLESEX,MA,CORPORATION,0.0,0,7300
1,ROCK HILL,SC,29730,Certified Development Corporat,111 Executive Center Drive,Columbia,SC,29210.0,Federal Deposit Insurance Corp,WASHINGTON,...,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,721110.0,YORK,SC,CORPORATION,1228386.0,1,1575
2,LANCASTER,PA,17602,South Eastern Economic Develop,737 Constitution Drive,Exton,PA,19341.0,MISSING,MISSING,...,ALP,Sec. 504 - Loan Guarantees - Private Sector Fi...,120,721110.0,LANCASTER,PA,CORPORATION,0.0,0,3650
3,SAN JOSE,CA,95131,"Capital Access Group, Inc.",150 California Street,San Francisco,CA,94111.0,MISSING,MISSING,...,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,334411.0,SANTA CLARA,CA,CORPORATION,0.0,0,7300
4,FARMINGTON,UT,84025,Mountain West Small Business F,2595 East 3300 South,Salt Lake City,UT,84109.0,MISSING,MISSING,...,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,623110.0,DAVIS,UT,CORPORATION,0.0,0,7300


In [7]:
# Save output before fitting model just in case: checkpoint the preprocessed
# training frame to hazard_train.csv in the working directory.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra unnamed column -- confirm the reader of this
# file expects that.
train_data.to_csv("hazard_train.csv")

In [8]:
# Read in test dataset and run it through the same preprocessing functions,
# in the same order, as the training data.
test_data = pd.read_csv("test_with_missing.csv")
test_data = (
    test_data
    .pipe(label_as_categorical)
    .pipe(drop_redundant_cols)
    .pipe(drop_norm_cols)
    .pipe(add_relative_time_col)
)

# Persist the processed test frame, then preview it.
test_data.to_csv("hazard_test.csv")
test_data.head()

Unnamed: 0,BorrCity,BorrState,BorrZip,CDC_Name,CDC_Street,CDC_City,CDC_State,CDC_Zip,ThirdPartyLender_Name,ThirdPartyLender_City,...,DeliveryMethod,subpgmdesc,TermInMonths,NaicsCode,ProjectCounty,ProjectState,BusinessType,GrossChargeOffAmount,Default?,DaysToDefault
0,FORT COLLINS,CO,80525,"Colorado Lending Source, Ltd.",518 17th Street,Denver,CO,80202.0,Advantage Bank,LOVELAND,...,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,624410.0,LARIMER,CO,CORPORATION,18174.0,1,1689
1,MENTOR,OH,44060,Mentor Economic Assistance Cor,8500 Civic Center Boulevard,Mentor,OH,44060.0,MISSING,MISSING,...,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,MISSING,LAKE,OH,CORPORATION,0.0,0,7300
2,SHERIDAN,CO,80223,Denver Urban Economic Developm,140 East 19th Avenue,Denver,CO,80203.0,MISSING,MISSING,...,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,MISSING,ARAPAHOE,CO,CORPORATION,0.0,0,7300
3,HUNTINGTON,NY,11746,Long Island Development Corpor,45 Seaman Avenue,Bethpage,NY,11714.0,MISSING,MISSING,...,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,MISSING,SUFFOLK,NY,CORPORATION,0.0,0,7300
4,ELK GROVE VILLAGE,IL,60007,"SomerCor 504, Inc.",601 S. LaSalle Street,Chicago,IL,60605.0,MISSING,MISSING,...,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,333514.0,COOK,IL,CORPORATION,0.0,0,7300


In [183]:
# Build a hazard model with the data we generated/processed in R 