In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime

In [3]:
# Read the data file
df=pd.read_csv('LoanStats3b_securev1.csv',header=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Choose some meaningful and available features for predictions
df=df[['loan_amnt','term','int_rate','installment','sub_grade','emp_title','emp_length','home_ownership','annual_inc','verification_status',
        'issue_d','desc','purpose','title','zip_code','addr_state','dti','delinq_2yrs','earliest_cr_line','fico_range_low','fico_range_high','inq_last_6mths',
        'open_acc','revol_bal','revol_util','total_acc','acc_open_past_24mths','mort_acc','percent_bc_gt_75','loan_status']]


In [11]:
# Drop rows that are completely empty
df.dropna(how='all', inplace=True)

In [12]:
# Change the format of int_rate to float
df['int_rate']=df['int_rate'].apply(lambda x: float(x[:-1]))

In [13]:
# Change the format of revol_util to float
def process_revol_util(x):
    if type(x) == str:
        x=float(x[:-1])
    return x
df['revol_util']=df['revol_util'].apply(lambda x: process_revol_util(x))


In [14]:
# Fill in missing values with the mean
df['revol_util'].fillna(value=df['revol_util'].mean(), inplace=True)
df['acc_open_past_24mths'].fillna(value=df['acc_open_past_24mths'].mean(), inplace=True)
df['mort_acc'].fillna(value=df['mort_acc'].mean(), inplace=True)
df['percent_bc_gt_75'].fillna(value=df['percent_bc_gt_75'].mean(), inplace=True)

In [15]:
# Change emp_length and sub_grade from strings to integers
mapping_dict={"emp_length": {"10+ years": 10, "9 years": 9, "8 years": 8, "7 years": 7, "6 years": 6, "5 years": 5, "4 years": 4, 
                               "3 years": 3, "2 years": 2, "1 year": 1, "< 1 year": 0}, 
                "sub_grade":{"G5": 0, "G4": 1, "G3": 2, "G2": 3, "G1": 4, "F5": 5, "F4": 6, "F3": 7, "F2": 8, "F1": 9, "E5": 10,
                             "E4": 11, "E3": 12, "E2": 13, "E1": 14, "D5": 15, "D4": 16, "D3": 17, "D2": 18, "D1": 19, "C5": 20, 
                             "C4": 21, "C3": 22, "C2": 23, "C1": 24, "B5": 25, "B4": 26, "B3": 27, "B2": 28, "B1": 29, "A5": 30, 
                             "A4": 31, "A3": 32, "A2": 33, "A1": 34}}
df=df.replace(mapping_dict)


In [17]:
# Fill in missing values of emp_length based on emp_title
for i in df.index:
    if np.isnan(df.loc[i,'emp_length']):
        title=df.loc[i,'emp_title']
        if type(title)==str:
            average=df[df['emp_title']==title]['emp_length'].mean()
            df.loc[i,'emp_length']=average

In [23]:
# Drop the rows with NaN in either emp_title or emp_length
df.dropna(subset=['emp_length'], inplace=True)

In [24]:
# Change issue_d to datetime format
df['issue_d']=df['issue_d'].apply(lambda x: datetime.datetime.strptime(x, '%b-%Y'))

In [25]:
# Change earliest_cr_line to datetime format
df['earliest_cr_line']=df['earliest_cr_line'].apply(lambda x: datetime.datetime.strptime(x, '%b-%Y'))

In [26]:
# Calculate the differences in months from earliest_cr_line to issue_d
def diff_month(d1,d2):
    return (d1.year-d2.year)*12+(d1.month-d2.month)

for i in df.index:
    df.loc[i,'earliest_cr_line']=diff_month(df.loc[i,'issue_d'],df.loc[i,'earliest_cr_line'])

In [27]:
# Creat dummy variables for the rest categorical variables
dummy_df=pd.get_dummies(df[['term','home_ownership','verification_status','purpose','addr_state']])
df=pd.concat([df, dummy_df], axis=1)
df=df.drop(['term','home_ownership','verification_status','purpose','addr_state'], axis=1)

In [28]:
# Change the target variable to numbers: 1 indicates defaulted, 0 indicates paid off
def flag(x):
    if x=='Fully Paid':
        return 0
    elif x=='Charged Off':
        return 1

df['loan_status']=df['loan_status'].apply(flag)

# Drop loans with other unknown status
df.dropna(subset=['loan_status'], inplace=True)

In [29]:
# Reset the index to integers starting from 0
df.reset_index(drop=True, inplace=True)

In [30]:
df

Unnamed: 0,loan_amnt,int_rate,installment,sub_grade,emp_title,emp_length,annual_inc,issue_d,desc,title,...,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY
0,12000.0,7.62,373.94,32,Systems Engineer,3.0,96500.0,2013-12-01,Borrower added on 12/31/13 > Bought a new ho...,Debt Consolidation and Credit Transfer,...,0,0,1,0,0,0,0,0,0,0
1,27050.0,10.99,885.46,28,Team Leadern Customer Ops & Systems,10.0,55000.0,2013-12-01,Borrower added on 12/31/13 > Combining high ...,Debt Consolidation,...,0,0,0,0,0,0,0,0,0,0
2,12000.0,11.99,398.52,27,LTC,10.0,130000.0,2013-12-01,,Debt consolidation,...,0,0,0,0,0,0,0,0,0,0
3,28000.0,7.62,872.52,32,Area Sales Manager,5.0,325000.0,2013-12-01,,Pay off other Installment loan,...,0,0,0,0,0,0,0,0,0,0
4,27600.0,19.97,730.78,15,Street Operations Supervisor,6.0,73000.0,2013-12-01,Borrower added on 12/31/13 > I had some wate...,Consolidation of debt and home improve.,...,0,0,0,0,0,0,0,0,0,0
5,12000.0,10.99,392.81,28,Project Manager,4.0,60000.0,2013-12-01,Borrower added on 12/31/13 > I would like to...,No Regrets,...,0,0,0,0,0,0,0,0,0,0
6,11100.0,14.98,384.68,22,Teacher,10.0,90000.0,2013-12-01,,Other,...,0,0,0,0,0,0,0,0,0,0
7,12000.0,13.53,407.40,25,On road manager,10.0,40000.0,2013-12-01,,Debt consolidation,...,0,0,0,0,0,0,0,0,0,0
8,9750.0,13.98,333.14,24,Medical Assistant,1.0,26000.0,2013-12-01,Borrower added on 12/31/13 > While being in ...,Debt Consilation,...,0,0,0,0,0,0,0,0,0,0
9,4800.0,10.99,157.13,28,Surgical Technician,2.0,39600.0,2013-12-01,Borrower added on 12/31/13 > Just bought a h...,For The House,...,0,0,1,0,0,0,0,0,0,0


In [31]:
df.dtypes

loan_amnt                         float64
int_rate                          float64
installment                       float64
sub_grade                           int64
emp_title                          object
emp_length                        float64
annual_inc                        float64
issue_d                    datetime64[ns]
desc                               object
title                              object
zip_code                           object
dti                               float64
delinq_2yrs                       float64
earliest_cr_line                    int64
fico_range_low                    float64
fico_range_high                   float64
inq_last_6mths                    float64
open_acc                          float64
revol_bal                         float64
revol_util                        float64
total_acc                         float64
acc_open_past_24mths              float64
mort_acc                          float64
percent_bc_gt_75                  

In [32]:
df.isnull().sum()

loan_amnt                       0
int_rate                        0
installment                     0
sub_grade                       0
emp_title                    3895
emp_length                      0
annual_inc                      0
issue_d                         0
desc                       102308
title                           6
zip_code                        0
dti                             0
delinq_2yrs                     0
earliest_cr_line                0
fico_range_low                  0
fico_range_high                 0
inq_last_6mths                  0
open_acc                        0
revol_bal                       0
revol_util                      0
total_acc                       0
acc_open_past_24mths            0
mort_acc                        0
percent_bc_gt_75                0
loan_status                     0
term_ 36 months                 0
term_ 60 months                 0
home_ownership_MORTGAGE         0
home_ownership_NONE             0
home_ownership

In [33]:
df.describe()

Unnamed: 0,loan_amnt,int_rate,installment,sub_grade,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,...,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY
count,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,...,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0,180309.0
mean,14476.932654,14.286212,447.134435,23.247897,6.090532,73233.9,17.012586,0.24148,184.132107,696.823453,...,0.00218,0.010709,0.078105,0.008008,0.031074,0.001608,0.023948,0.01224,0.004692,0.002451
std,8121.452157,4.442508,242.805641,6.419024,3.565751,52378.76,7.578693,0.706113,82.139195,29.886771,...,0.046635,0.102931,0.268337,0.089131,0.173519,0.040072,0.152887,0.109956,0.068337,0.04945
min,1000.0,6.0,4.93,0.0,0.0,5000.0,0.0,0.0,36.0,660.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8000.0,11.14,273.92,19.0,3.0,45000.0,11.3,0.0,129.0,675.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,12500.0,14.09,399.97,24.0,6.0,63059.4,16.73,0.0,169.0,690.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,20000.0,17.27,581.18,28.0,10.0,88033.0,22.5,0.0,225.0,710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,35000.0,26.06,1408.13,34.0,10.0,7141778.0,34.99,29.0,750.0,845.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
df.to_csv('2012-2013_cleaned.csv')

In [3]:
df=pd.read_csv('2012-2013_cleaned.csv', index_col=0)

In [8]:
corrdf=df.iloc[:,:22].corr()

In [10]:
corrdf[corrdf>0.4]

Unnamed: 0,loan_amnt,int_rate,installment,sub_grade,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,open_acc,revol_bal,revol_util,total_acc,acc_open_past_24mths,mort_acc,percent_bc_gt_75
loan_amnt,1.0,,0.95415,,,,,,,,,,,,,,,,
int_rate,,1.0,,,,,,,,,,,,,,,,,
installment,0.95415,,1.0,,,,,,,,,,,,,,,,
sub_grade,,,,1.0,,,,,,0.519093,0.519089,,,,,,,,
emp_length,,,,,1.0,,,,,,,,,,,,,,
annual_inc,,,,,,1.0,,,,,,,,,,,,,
dti,,,,,,,1.0,,,,,,,,,,,,
delinq_2yrs,,,,,,,,1.0,,,,,,,,,,,
earliest_cr_line,,,,,,,,,1.0,,,,,,,,,,
fico_range_low,,,,0.519093,,,,,,1.0,1.0,,,,,,,,
