In [44]:
import pandas as pd
import numpy as np
import os.path
import sqlite3
import re

In [45]:
# feature engineering in separate notebook

In [46]:
# we estimate ~400 values of revol_util and ~70k values of tot_coll_amt
# must compare prediction results later when we use or exclude these features

In [47]:
# select from sqlite database. function is specific to the file used in this project for simplicity

def select(query,path):
    """Run ``query`` against the SQLite database at ``path`` and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    path : str
        Filesystem path of the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        One row per result row, with columns named after the result columns
        reported by the cursor. A query that matches no rows returns an empty
        frame that still carries the column names.
    """
    conn = sqlite3.connect(path)
    try:
        cursor = conn.cursor()
        rows = cursor.execute(query).fetchall()
        # cursor.description has one entry per result column; item 0 is the column name.
        # It is None for statements that return no result set.
        columns = [col[0] for col in (cursor.description or [])]
    finally:
        # release the database handle even when the query raises
        conn.close()

    # passing the names to the constructor (instead of assigning .columns afterwards)
    # keeps a zero-row result from failing with a length mismatch
    return pd.DataFrame(rows, columns=columns)

# os.path.join avoids the invalid '\d' / '\l' escape sequences of a hand-written
# backslash path and produces the right separator on every OS
loans = select('SELECT * FROM LOAN',
               os.path.join(os.getcwd(), 'data', 'lending-club-loan-data', 'database.sqlite'))

In [48]:
# preview the first five raw rows
loans.head(5)

Unnamed: 0,index,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,...,,,,,,,,,,
1,1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,...,,,,,,,,,,
2,2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,...,,,,,,,,,,
3,3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,...,,,,,,,,,,
4,4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,...,,,,,,,,,,


In [49]:
# term has 1 NaN value. remove the datapoint - negligible reduction in total data
# strip the 'months' suffix from term and convert it to an integer column

loans = loans[loans['term'].notnull()]
# Pull the digits out with a vectorised extract: unlike the earlier \D(\d{2})\D pattern
# it does not need a non-digit character on both sides of the number, and plain column
# assignment (rather than .loc[:, 'term'] = ...) lets the column take an integer dtype.
loans['term'] = loans['term'].str.extract(r'(\d+)', expand=False).astype(int)

In [50]:
# remove % sign from int_rate and convert to float
# .str methods pass missing values through as NaN and tolerate surrounding whitespace;
# plain column assignment (rather than .loc[:, 'int_rate'] = ...) lets the dtype become float

loans['int_rate'] = loans['int_rate'].str.strip().str.rstrip('%').astype(float)

In [51]:
# encode the letter grades on a linear numeric scale (A -> 7 ... G -> 1).
# **important note** this alters the underlying data: it imposes an even spacing between
# grades that may not be completely justified, so be careful about conclusions drawn
# from this column.
# alternatives: leave the column categorical, or write the encoding to a new column

grade_map = {letter: rank for rank, letter in enumerate('GFEDCBA', start=1)}
loans['grade'] = loans['grade'].apply(grade_map.__getitem__)

In [52]:
# sub_grade tracks the interest rate, so it carries redundant information: drop it

loans = loans.drop(columns=['sub_grade'])

In [53]:
# map the emp_length labels to whole years of employment.
# 'n/a' is replaced with the mean of the data set (6.009, rounded to 6 to save the storage
# cost of a float); dropping those rows instead would delete about 40k vectors.
# every label is spelled out so the mapping no longer depends on the sorted order of
# np.unique lining up with a positional list, which silently mislabels rows as soon as a
# category is absent or a new one appears.
# NOTE(review): label spellings are inferred from the sort order the previous positional
# list implied -- confirm against loans['emp_length'].unique()

EMP_LENGTH_FILL = 6
mappings = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
    'n/a': EMP_LENGTH_FILL,
}

# fail loudly instead of producing wrong numbers when the data holds a label we do not know
unexpected = set(loans['emp_length'].unique()) - set(mappings)
if unexpected:
    raise ValueError('unmapped emp_length labels: %r' % sorted(unexpected, key=str))

loans['emp_length'] = loans['emp_length'].map(mappings).astype(int)

In [54]:
# annual_inc has 4 NaN values: drop those rows, then store the column as int.
# !! make sure the column is written to and read from the SQL DB as int;
#    otherwise do the dtype conversion after processing

loans = loans.dropna(subset=['annual_inc'])
loans['annual_inc'] = loans['annual_inc'].astype(int)

In [55]:
# parse issue_d as datetime.
# also check whether the dtype is preserved through the SQL DB; remove this step if not

loans['issue_d'] = pd.to_datetime(loans['issue_d'])

In [56]:
# 25 null values: drop those rows, then store the column as int

loans = loans.dropna(subset=['delinq_2yrs'])
loans['delinq_2yrs'] = loans['delinq_2yrs'].astype(int)

In [57]:
# if the datatype is not maintained between reads/writes to the SQL database,
# do this conversion after processing instead

loans['inq_last_6mths'] = loans['inq_last_6mths'].astype(int)

In [58]:
# NaN values are left alone here; they likely imply the loan never went delinquent?

loans.mths_since_last_delinq.isna().sum()

454284

In [59]:
# the column has too many NaN values to be useful: drop it

loans = loans.drop(columns=['mths_since_last_record'])

In [60]:
# store pub_rec as int
loans['pub_rec'] = loans['pub_rec'].astype(int)

In [61]:
# store revol_bal as int
loans['revol_bal'] = loans['revol_bal'].astype(int)

In [62]:
# remove percentage sign and convert to float
# ~450 NaN values

def f(x):
    """Strip the percent sign from a percentage string.

    Parameters
    ----------
    x : str or None or float
        A value such as ``'54.3%'``; ``None`` / NaN mark a missing entry.

    Returns
    -------
    str or float
        The text without surrounding whitespace or trailing ``'%'``, or
        ``np.nan`` when ``x`` is missing.
    """
    # pd.isnull covers both None and float NaN; slicing a float NaN would raise TypeError
    if pd.isnull(x):
        # np.nan is the spelling that exists in every NumPy version (np.NaN was removed in 2.0)
        return np.nan
    # rstrip only removes an actual '%', so a value without the sign keeps its last digit
    return x.strip().rstrip('%')

# strip the percent sign from every entry, then cast the column to float
revol_util_text = loans['revol_util'].apply(f)
loans['revol_util'] = revol_util_text.astype(float)

In [63]:
# store total_acc as int
loans['total_acc'] = loans['total_acc'].astype(int)

In [64]:
# 17.6k null values. these can't be filled from the values of other loans, so drop the
# rows for now. check at the end that we didn't lose too many data points for a single
# target output, i.e. hope the loss is evenly spread, in which case it represents
# about a 2% loss of the dataset

loans = loans.dropna(subset=['last_pymnt_d'])

In [65]:
# "completed" loans have no next payment day. remove this column for those

loans['next_pymnt_d'].isna().sum()

252445

In [66]:
# "in progress" loans (neither 'Fully Paid' nor 'Charged Off') have no null values

loans.loc[~loans['loan_status'].isin(['Fully Paid', 'Charged Off']), 'next_pymnt_d'].isna().sum()

0

In [67]:
# 51 null values: drop those rows

loans = loans.dropna(subset=['last_credit_pull_d'])

In [68]:
# 115 null values: drop those rows

loans = loans.dropna(subset=['collections_12_mths_ex_med'])

In [69]:
# NaN likely implies a bad rating was never reached. leave the values as null;
# decide later how to handle NaN

loans['mths_since_last_major_derog'].isna().sum()

652962

In [70]:
# store policy_code as int
loans['policy_code'] = loans['policy_code'].astype(int)

In [71]:
# drop the three joint-application columns in one call
loans = loans.drop(['annual_inc_joint', 'dti_joint', 'verification_status_joint'], axis=1)

In [72]:
# store acc_now_delinq as int
loans['acc_now_delinq'] = loans['acc_now_delinq'].astype(int)

In [73]:
# FILLING 70K MISSING VALUES FOR THIS COLUMN

# loans = loans.drop('tot_coll_amt',axis=1)

In [74]:
# about 70k null values. the description seems very similar to revol_bal,
# so the column is dropped rather than filled

loans = loans.drop(columns=['tot_cur_bal'])

In [75]:
# each of these columns has about 860k null values and cannot reasonably be estimated,
# so they are all dropped together

loans = loans.drop(
    [
        'open_acc_6m',
        'open_il_6m',
        'open_il_12m',
        'open_il_24m',
        'mths_since_rcnt_il',
        'total_bal_il',
        'il_util',
        'open_rv_12m',
        'open_rv_24m',
        'max_bal_bc',
        'all_util',
        'inq_fi',
        'total_cu_tl',
        'inq_last_12m',
    ],
    axis=1,
)

In [76]:
# not in the data dictionary and about 70k null values. dropped because we don't know
# how to interpret the data anyway
# (in practice, leave it. this is for the sake of illustration within the project)

loans = loans.drop(columns=['total_rev_hi_lim'])

In [77]:
# the stored 'index' column is unnecessary: the DataFrame carries its own index.
# drop=True stops reset_index() from re-inserting the old row labels as a fresh
# 'index' column, which would undo the drop on the line above

loans = loans.drop('index',axis=1)
loans = loans.reset_index(drop=True)

In [78]:
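# leaving database.sqlite untouched as the raw database provided by lending club;
# the cleaned table is written to database2.sqlite instead
# (alternatively, could write the table to the same SQLite db)
# NOTE: the next cell reads LOAN_CLEAN from database2.sqlite, so the lines below must have
# been run once (uncommented) before that cell can succeed on a fresh checkout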

# conn = sqlite3.connect(os.getcwd()+'\data\lending-club-loan-data\database2.sqlite')
# loans.to_sql('loan_clean',conn,index=False)
# conn.close()

In [79]:
# os.path.join avoids the invalid '\d' / '\l' escape sequences of a hand-written
# backslash path and produces the right separator on every OS
loans_clean = select('SELECT * FROM LOAN_CLEAN',
                     os.path.join(os.getcwd(), 'data', 'lending-club-loan-data', 'database2.sqlite'))

In [80]:
# features left as NaN are either unimportant for modeling (e.g. emp_title)
# or meaningful as NaN in themselves (e.g. mths_since_last_delinq)

loans_clean.isna().sum()

index                               0
id                                  0
member_id                           0
loan_amnt                           0
funded_amnt                         0
funded_amnt_inv                     0
term                                0
int_rate                            0
installment                         0
grade                               0
emp_title                       50193
emp_length                          0
home_ownership                      0
annual_inc                          0
verification_status                 0
issue_d                             0
loan_status                         0
pymnt_plan                          0
url                                 0
desc                           743762
purpose                             0
title                              28
zip_code                            0
addr_state                          0
dti                                 0
delinq_2yrs                         0
earliest_cr_

In [81]:
# seems the datetime conversion is not maintained through write/read to SQL db, but float to int is
# datetime conversion must be done in next reads (during feature engineering and analyses steps)

In [82]:
# inspect the dtype issue_d comes back with after the SQL round trip
loans_clean['issue_d'].head()

0    2011-12-01 00:00:00
1    2011-12-01 00:00:00
2    2011-12-01 00:00:00
3    2011-12-01 00:00:00
4    2011-12-01 00:00:00
Name: issue_d, dtype: object

In [83]:
# inspect the dtype inq_last_6mths comes back with after the SQL round trip
loans_clean['inq_last_6mths'].head()

0    1
1    5
2    2
3    1
4    0
Name: inq_last_6mths, dtype: int64