In [1]:
import pandas as pd
import numpy as np
import os.path
import sqlite3
import re

In [35]:
# feature engineering in separate notebook

In [2]:
# select from sqlite database. function is specific to the file used in this project for simplicity

def select(query, db_path=None):
    """Run a SQL query against the project's SQLite database.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    db_path : str, optional
        Path to the SQLite file. Defaults to
        ``<cwd>/data/lending-club-loan-data/database.sqlite``.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with columns named after the result set.
        A query that matches no rows yields an empty frame that still carries
        the column names.
    """
    if db_path is None:
        # os.path.join keeps the path portable and avoids the invalid
        # '\d' / '\l' escape sequences of a hand-written backslash path
        db_path = os.path.join(os.getcwd(), 'data', 'lending-club-loan-data', 'database.sqlite')

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        rows = cursor.execute(query).fetchall()
        # cursor.description is None for statements that produce no result set
        columns = [col[0] for col in (cursor.description or [])]
    finally:
        # release the connection even when the query raises
        conn.close()

    # naming the columns in the constructor also works for zero rows
    return pd.DataFrame(rows, columns=columns)

# pull every column of the LOAN table into memory
loans = select(query='SELECT * FROM LOAN')

In [3]:
# preview the first five rows of the raw table
loans.head(n=5)

Unnamed: 0,index,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,...,,,,,,,,,,
1,1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,...,,,,,,,,,,
2,2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,...,,,,,,,,,,
3,3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,...,,,,,,,,,,
4,4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,...,,,,,,,,,,


In [4]:
# term has 1 NaN value. remove the datapoint - negligible reduction in total data
# strip the 'months' suffix from term and convert to integers, e.g. '36 months' -> 36

# dropna returns an independent frame, so the column assignment below does not
# raise SettingWithCopyWarning
loans = loans.dropna(subset=['term'])

# plain [] assignment replaces the column, so the new dtype is int; writing
# through .loc[:, 'term'] sets values into the existing object column in place
# on pandas >= 2.0 and leaves the dtype as object.
# (\d+) takes the first run of digits, so it also matches when the digits are
# the very first characters of the string.
loans['term'] = loans['term'].str.extract(r'(\d+)', expand=False).astype(int)

In [5]:
# remove % sign from int_rate and convert to float, e.g. '10.65%' -> 10.65

# the .str accessor propagates missing values instead of raising on them, and
# plain [] assignment replaces the column so its dtype becomes float (writing
# through .loc[:, 'int_rate'] keeps the object dtype on pandas >= 2.0)
loans['int_rate'] = loans['int_rate'].str.strip().str.rstrip('%').astype(float)

In [6]:
# encode the letter grades on a linear numeric scale (A -> 7 down to G -> 1).
# **important note** this reinterprets the underlying data: the equal spacing between
# grades is an assumption that may not be completely justified, so be careful about
# conclusions drawn from this column

grade_map = dict(zip('ABCDEFG', range(7, 0, -1)))
# direct dict lookup per value: an unexpected grade raises KeyError
loans['grade'] = loans['grade'].map(grade_map.__getitem__)

In [7]:
# sub_grade is dropped: sub_grades correspond to interest rates, so the column is redundant.

loans = loans.drop(columns='sub_grade')

In [None]:
# map the emp_length labels to whole years of employment. 'n/a' is replaced with the mean of
# the data set, since dropping those rows would del 40k vectors.
# mean is 6.009. round as 6 to save storage cost on float

emp_length_fill = 6

# explicit label -> years table. the previous version zipped the sorted unique labels with a
# positional list of values, which silently mis-maps every label as soon as one category is
# missing from (or added to) the data.
mappings = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
    'n/a': emp_length_fill,
}

# fail loudly on labels the table does not know instead of imputing them by accident
unknown = set(loans['emp_length'].dropna().unique()) - set(mappings)
if unknown:
    raise KeyError('unmapped emp_length labels: %s' % sorted(map(str, unknown)))

# missing values (None/NaN) get the same imputed value as 'n/a'
loans['emp_length'] = loans['emp_length'].map(mappings).fillna(emp_length_fill).astype(int)

In [None]:
# annual_inc is missing for 4 rows; discard them and store the column as int.
# !! confirm the column is written to and read from the SQL DB as int. otherwise redo the dtype conversion after processing

loans = (
    loans.loc[loans['annual_inc'].notnull()]
         .assign(annual_inc=lambda frame: frame['annual_inc'].astype(int))
)

In [40]:
# parse issue_d into datetimes. also check whether the dtype survives the DB round trip; remove this step if not

loans = loans.assign(issue_d=pd.to_datetime(loans['issue_d']))

In [None]:
# delinq_2yrs has 25 null values; discard those rows and store the column as int

loans = (
    loans.loc[loans['delinq_2yrs'].notnull()]
         .assign(delinq_2yrs=lambda frame: frame['delinq_2yrs'].astype(int))
)

In [8]:
# display the column labels of the loans frame
loans.columns

Index(['index', 'id', 'member_id', 'loan_amnt', 'funded_amnt',
       'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url',
       'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti',
       'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint',
       'verification_status_joint', '