# Data Wrangling for Loan Datasets
Perform data wrangling for 2018 loan data collected from LendingClub. The datasets include four csv files containing loan data for each quarter in 2018. The datasets can be found [here](https://github.com/nphan20181/Loan-Default-Prediction/tree/master/data).

Step-by-step data wrangling:
- [Step 1: Import Data](#Step-1:-Import-Data)
- [Step 2: Inspect Data](#Step-2:-Inspect-Data)
- [Step 3: Change Data Type](#Step-3:-Change-Data-Type)
- [Step 4: Handle Missing Values](#Step-4:-Handle-Missing-Values)
- [Step 5: Handle Outliers](#Step-5:-Handle-Outliers)
- [Step 6: Add New Columns](#Step-6:-Add-New-Columns)
- [Step 7: Change Object Column Type to Category](#Step-7:-Change-Object-Column-Type-to-Category)
- [Step 8: Export Data](#Step-8:-Export-Data)

In [1]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import zscore

import modules.global_vars as gv            # load user-defined variables
pd.set_option('display.max_columns', 150)   # show max 150 columns

## Step 1: Import Data

In [2]:
# get a list of csv files to be read
# glob returns matches in an arbitrary, OS-dependent order; sorting keeps the
# quarters in the same order on every machine so the combined frame is reproducible
csv_file = sorted(glob.glob('data/*.csv'))
csv_file

['data\\LoanStats_2018Q1.csv',
 'data\\LoanStats_2018Q2.csv',
 'data\\LoanStats_2018Q3.csv',
 'data\\LoanStats_2018Q4.csv']

In [3]:
# columns that contain no values / are duplicated / are not needed
unused_columns = ['id', 'member_id', 'url', 'desc', 'zip_code', 'funded_amnt',
                  'emp_title', 'purpose']

list_data = []
for filename in csv_file:
    # the first row of each file is a general note, so it is skipped
    data = pd.read_csv(filename, skiprows=[0], low_memory=False)
    data = data.drop(columns=unused_columns)
    # the last 2 rows contain the total amount funded in policy code 1 and 2
    data = data.iloc[:-2]
    list_data.append(data)

# stack the per-file frames under a fresh 0..n-1 index
df = pd.concat(list_data, ignore_index=True)

# keep only loans with status Fully Paid / Charged Off / Default,
# grouped in that order, again under a fresh index
df = pd.concat(
    [df[df['loan_status'] == status] for status in ('Fully Paid', 'Charged Off', 'Default')],
    ignore_index=True,
)

## Step 2: Inspect Data
- [Date Columns](#Date-Columns)
- [Numerical Columns](#Numerical-Columns)
- [Non-numerical Columns](#Non-numerical-Columns)
- [Duplicates](#Duplicates)

In [4]:
# show number of rows and columns
df.shape

(93853, 136)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93853 entries, 0 to 93852
Columns: 136 entries, loan_amnt to settlement_term
dtypes: float64(103), object(33)
memory usage: 97.4+ MB


### Date Columns

In [6]:
# Names of the columns that hold dates. They are still plain strings (object dtype)
# at this point, so describe() reports count / unique / top / freq for them.
date_cols = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 
             'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 
             'debt_settlement_flag_date', 'settlement_date']
df[date_cols].describe()

Unnamed: 0,issue_d,earliest_cr_line,last_pymnt_d,next_pymnt_d,last_credit_pull_d,hardship_start_date,hardship_end_date,payment_plan_start_date,debt_settlement_flag_date,settlement_date
count,93853,93853,93217,786,93851,114,114,114,1202,1202
unique,12,636,19,3,21,12,10,11,13,15
top,Jan-2018,Aug-2006,Feb-2019,Aug-2019,Jul-2019,Mar-2019,Apr-2019,Mar-2019,Jun-2019,May-2019
freq,11613,836,8964,783,38489,27,27,26,263,224


### Numerical Columns

In [7]:
# numeric subset of the frame, used by the inspection cells that follow
num_cols_df = df.select_dtypes(include=np.number)
num_cols_df.columns

Index(['loan_amnt', 'funded_amnt_inv', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc',
       ...
       'deferral_term', 'hardship_amount', 'hardship_length', 'hardship_dpd',
       'orig_projected_additional_accrued_interest',
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=103)

In [8]:
num_cols_df.head()

Unnamed: 0,loan_amnt,funded_amnt_inv,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,annual_inc_joint,dti_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
0,3000.0,3000.0,93.1,52000.0,0.58,0.0,0.0,26.0,,7.0,0.0,141.0,30.0,0.0,0.0,3011.577285,3011.58,3000.0,11.58,0.0,0.0,0.0,614.03,0.0,,1.0,,,0.0,0.0,150592.0,0.0,0.0,1.0,2.0,7.0,0.0,,0.0,1.0,141.0,1.0,31000.0,1.0,2.0,2.0,3.0,25099.0,30359.0,0.5,0.0,0.0,132.0,242.0,18.0,7.0,4.0,18.0,,7.0,,0.0,1.0,1.0,4.0,15.0,7.0,6.0,19.0,1.0,7.0,0.0,0.0,0.0,1.0,96.7,0.0,0.0,0.0,191216.0,141.0,30500.0,0.0,,,,,,,,,,,,,,,,,,,,
1,5000.0,5000.0,166.03,55000.0,14.18,0.0,0.0,74.0,82.0,14.0,1.0,11449.0,24.0,0.0,0.0,5013.306667,5013.31,5000.0,13.31,0.0,0.0,0.0,5019.97,0.0,74.0,1.0,,,0.0,0.0,28880.0,1.0,1.0,0.0,0.0,33.0,17431.0,63.0,2.0,2.0,4829.0,47.0,33800.0,0.0,1.0,1.0,2.0,2222.0,10551.0,52.0,0.0,0.0,77.0,199.0,3.0,3.0,0.0,3.0,,12.0,,1.0,3.0,3.0,4.0,7.0,6.0,13.0,18.0,3.0,14.0,0.0,0.0,0.0,2.0,95.7,33.3,1.0,0.0,61551.0,28880.0,22000.0,27751.0,,,,,,,,,,,,,,,,,,,,
2,7000.0,7000.0,232.44,40000.0,20.25,0.0,0.0,60.0,,13.0,0.0,5004.0,29.0,0.0,0.0,7693.314943,7693.31,7000.0,693.31,0.0,0.0,0.0,5364.25,0.0,60.0,1.0,,,0.0,0.0,131726.0,1.0,6.0,0.0,2.0,16.0,126722.0,102.0,2.0,2.0,3944.0,90.0,13900.0,2.0,1.0,4.0,4.0,10977.0,4996.0,50.0,0.0,0.0,122.0,132.0,1.0,1.0,0.0,10.0,64.0,5.0,60.0,3.0,2.0,2.0,3.0,4.0,19.0,7.0,10.0,2.0,13.0,0.0,0.0,0.0,2.0,89.7,33.3,0.0,0.0,132817.0,131726.0,10000.0,118917.0,,,,,,,,,,,,,,,,,,,,
3,10000.0,10000.0,326.92,52320.0,12.87,0.0,1.0,,97.0,6.0,1.0,1692.0,26.0,0.0,0.0,11036.679274,11036.68,10000.0,1036.68,0.0,0.0,0.0,7125.75,0.0,,1.0,,,0.0,0.0,74220.0,0.0,3.0,2.0,3.0,7.0,72528.0,113.0,1.0,1.0,1692.0,94.0,4400.0,2.0,1.0,4.0,4.0,12370.0,608.0,73.6,0.0,0.0,125.0,48.0,7.0,7.0,0.0,45.0,,0.0,,0.0,1.0,1.0,1.0,1.0,23.0,3.0,3.0,1.0,6.0,0.0,0.0,0.0,3.0,100.0,0.0,1.0,0.0,72124.0,74220.0,2300.0,67724.0,,,,,,,,,,,,,,,,,,,,
4,28000.0,28000.0,915.36,103000.0,30.76,0.0,0.0,73.0,,12.0,0.0,21266.0,30.0,0.0,0.0,30922.184697,30922.18,28000.0,2922.18,0.0,0.0,0.0,19929.38,0.0,73.0,1.0,,,0.0,0.0,76475.0,3.0,3.0,2.0,3.0,4.0,55209.0,91.0,1.0,3.0,10750.0,41.0,80300.0,0.0,0.0,0.0,6.0,6373.0,59034.0,26.5,0.0,0.0,134.0,184.0,2.0,2.0,1.0,2.0,,,,1.0,4.0,4.0,9.0,15.0,12.0,9.0,17.0,4.0,12.0,0.0,0.0,0.0,3.0,96.7,11.1,0.0,0.0,153679.0,76475.0,80300.0,73379.0,,,,,,,,,,,,,,,,,,,,


In [9]:
num_cols_df.describe()

Unnamed: 0,loan_amnt,funded_amnt_inv,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,annual_inc_joint,dti_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,93853.0,93853.0,93853.0,93853.0,93608.0,93853.0,93853.0,42283.0,14067.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,22997.0,93853.0,11884.0,11884.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,90727.0,93853.0,78660.0,93853.0,93853.0,93853.0,93825.0,93853.0,93853.0,93853.0,93853.0,93853.0,93843.0,92324.0,92256.0,93853.0,93853.0,90727.0,93853.0,93853.0,93853.0,93853.0,92430.0,19460.0,85724.0,27916.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,91306.0,93853.0,93853.0,93853.0,93852.0,92322.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,11884.0,11884.0,11884.0,11884.0,11667.0,11884.0,11884.0,11884.0,11884.0,4101.0,114.0,114.0,114.0,114.0,27.0,114.0,114.0,1202.0,1202.0,1202.0
mean,15013.870627,15010.414155,449.354976,79370.07,18.779895,0.235869,0.538928,36.955443,82.586834,11.412187,0.159739,14422.5,23.844288,125.130223,125.104645,13279.857748,13276.668482,12099.066341,1041.804696,1.067089,137.919634,24.481143,9226.026747,0.0174,46.220246,1.0,125562.0,18.932321,6.4e-05,325.8723,151175.7,1.055672,2.654119,0.846664,1.819132,18.150716,36092.37,70.944317,1.372178,2.880004,5318.037005,52.883901,37425.83,1.31869,1.713147,2.339499,5.035641,14763.01137,15898.812638,45.469279,0.008524,3.415213,121.242805,172.092805,13.670932,7.59313,1.495605,23.385935,40.723227,6.669334,37.939819,0.49975,3.324497,4.963304,4.75402,7.324145,8.750493,8.093902,13.393775,4.916337,11.386956,0.0,6.4e-05,0.063301,2.375928,94.694915,29.53681,0.144524,0.015066,191900.5,50737.44,25879.506526,45421.15,33006.718445,0.710451,1.673847,11.28551,55.071235,2.907523,12.738219,0.048889,0.077415,37.376981,3.0,189.372368,3.0,14.719298,600.062222,14709.654035,210.405789,7203.206165,50.737762,18.604825
std,10021.202609,10020.594091,290.944311,72081.05,20.558877,0.748662,0.790688,21.864641,23.66663,5.91401,0.444475,21260.69,12.628098,1600.009813,1599.734891,10692.0442,10690.961871,10241.071621,1131.678131,7.99586,710.72691,126.949127,9462.304789,0.14505,21.49535,0.0,68993.98,8.204615,0.007995,26608.1,169606.3,1.213321,2.883313,1.039662,1.7074,23.317944,45142.36,22.837397,1.573458,2.696749,5557.874003,21.976649,36379.19,1.676191,2.924269,2.600783,3.447436,18258.463651,20421.735126,29.82289,0.109494,325.721023,54.657124,97.398013,17.206229,8.40579,1.805258,31.943845,22.179328,5.713125,22.140755,1.412741,2.30111,3.275671,3.183376,4.684539,7.632749,4.924931,8.188911,3.189622,5.90662,0.0,0.007995,0.417321,2.028524,8.827529,34.225894,0.358137,0.2555,188988.0,52140.85,25451.954725,47175.23,28718.986542,1.061306,1.845999,6.533546,26.857467,3.124979,8.342257,0.423038,0.376314,24.167238,0.0,136.104738,0.0,8.496234,389.848291,8731.848087,213.438277,4871.309799,8.096342,6.83609
min,1000.0,725.0,30.12,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9000.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.95,3.0,0.0,149.82,424.11,0.07,307.0,29.92,1.0
25%,7000.0,7000.0,230.75,46500.0,10.57,0.0,0.0,19.0,68.0,7.0,0.0,4261.0,15.0,0.0,0.0,5087.824609,5084.37,4000.0,275.38,0.0,0.0,0.0,912.72,0.0,30.0,1.0,84500.0,12.84,0.0,0.0,27806.0,0.0,1.0,0.0,1.0,6.0,8560.0,58.0,0.0,1.0,1805.0,38.0,15800.0,0.0,0.0,1.0,3.0,3006.0,3119.0,19.9,0.0,0.0,84.0,105.0,4.0,3.0,0.0,6.0,23.0,2.0,20.0,0.0,2.0,3.0,3.0,4.0,4.0,5.0,8.0,3.0,7.0,0.0,0.0,0.0,1.0,92.3,0.0,0.0,0.0,54300.0,18815.0,9250.0,15000.0,14215.25,0.0,0.0,7.0,34.95,1.0,7.0,0.0,0.0,16.0,3.0,88.74,3.0,8.0,301.995,7625.8325,46.84,3611.15,45.0,16.0
50%,12000.0,12000.0,368.45,66000.0,16.88,0.0,0.0,35.0,86.0,10.0,0.0,9270.0,22.0,0.0,0.0,10453.269147,10448.5,10000.0,669.1,0.0,0.0,0.0,6519.63,0.0,46.0,1.0,112000.0,18.395,0.0,0.0,86513.0,1.0,2.0,1.0,1.0,11.0,23544.0,74.0,1.0,2.0,3954.0,54.0,27900.0,1.0,1.0,2.0,4.0,7976.0,8891.5,42.8,0.0,0.0,128.0,155.0,8.0,5.0,1.0,13.0,39.0,5.0,35.0,0.0,3.0,4.0,4.0,6.0,7.0,7.0,12.0,4.0,10.0,0.0,0.0,0.0,2.0,100.0,16.7,0.0,0.0,128472.0,36995.0,18500.0,33795.0,25636.5,0.0,1.0,10.0,57.2,2.0,11.0,0.0,0.0,36.0,3.0,143.805,3.0,16.0,380.4,13558.925,151.09,5871.84,45.02,20.0
75%,20000.0,20000.0,614.99,95000.0,24.17,0.0,1.0,54.0,101.0,14.0,0.0,17499.0,31.0,0.0,0.0,19121.947116,19110.52,18000.0,1403.16,0.0,0.0,0.0,14132.55,0.0,63.0,1.0,150000.0,24.53,0.0,0.0,231771.0,2.0,3.0,1.0,3.0,21.0,47243.0,87.0,2.0,4.0,7091.0,68.0,47500.0,2.0,2.0,3.0,7.0,20744.5,20700.0,70.1,0.0,0.0,153.0,220.0,17.0,10.0,2.0,28.0,57.0,10.0,54.0,0.0,4.0,6.0,6.0,10.0,12.0,10.0,17.0,6.0,14.0,0.0,0.0,0.0,3.0,100.0,50.0,0.0,0.0,281431.0,65053.0,34000.0,61631.0,42411.25,1.0,3.0,15.0,76.65,4.0,17.0,0.0,0.0,58.0,3.0,272.835,3.0,22.0,842.28,21190.94,311.46,9893.25,60.0,24.0
max,40000.0,40000.0,1670.15,9300000.0,999.0,19.0,5.0,226.0,127.0,86.0,52.0,1113293.0,148.0,39060.72,39060.72,53094.482179,53094.48,40000.0,13969.48,320.7,33122.07,5961.9726,41353.67,8.0,226.0,1.0,1837000.0,39.98,1.0,6214661.0,4535114.0,13.0,49.0,8.0,16.0,383.0,1378570.0,309.0,22.0,35.0,389468.0,175.0,1680300.0,32.0,48.0,67.0,36.0,513930.0,371701.0,158.6,9.0,65000.0,822.0,806.0,368.0,260.0,24.0,551.0,190.0,25.0,190.0,36.0,50.0,72.0,59.0,66.0,105.0,72.0,116.0,65.0,82.0,0.0,1.0,18.0,23.0,100.0,100.0,5.0,52.0,4819402.0,1569179.0,460900.0,1380346.0,384404.0,6.0,17.0,58.0,212.6,35.0,79.0,20.0,11.0,153.0,3.0,649.97,3.0,29.0,1369.86,32300.26,1072.99,28503.0,98.24,36.0


### Non-numerical Columns

In [10]:
# non-numerical columns with the date columns taken out
non_num_cols = df.select_dtypes(exclude=np.number).drop(columns=date_cols)
non_num_cols.columns

Index(['term', 'int_rate', 'grade', 'sub_grade', 'emp_length',
       'home_ownership', 'verification_status', 'loan_status', 'pymnt_plan',
       'title', 'addr_state', 'revol_util', 'initial_list_status',
       'application_type', 'verification_status_joint',
       'sec_app_earliest_cr_line', 'hardship_flag', 'hardship_type',
       'hardship_reason', 'hardship_status', 'hardship_loan_status',
       'debt_settlement_flag', 'settlement_status'],
      dtype='object')

In [11]:
non_num_cols.head()

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,loan_status,pymnt_plan,title,addr_state,revol_util,initial_list_status,application_type,verification_status_joint,sec_app_earliest_cr_line,hardship_flag,hardship_type,hardship_reason,hardship_status,hardship_loan_status,debt_settlement_flag,settlement_status
0,36 months,7.34%,A,A4,9 years,RENT,Source Verified,Fully Paid,n,Major purchase,WA,0.5%,w,Individual,,,N,,,,,N,
1,36 months,11.98%,B,B5,10+ years,OWN,Not Verified,Fully Paid,n,Other,GA,33.9%,w,Individual,,,N,,,,,N,
2,36 months,11.98%,B,B5,< 1 year,MORTGAGE,Verified,Fully Paid,n,Home improvement,TX,36%,w,Individual,,,N,,,,,N,
3,36 months,10.90%,B,B4,< 1 year,RENT,Source Verified,Fully Paid,n,Debt consolidation,WA,38.5%,w,Individual,,,N,,,,,N,
4,36 months,10.90%,B,B4,6 years,MORTGAGE,Source Verified,Fully Paid,n,Debt consolidation,NC,26.5%,w,Individual,,,N,,,,,N,


In [12]:
non_num_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93853 entries, 0 to 93852
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   term                       93853 non-null  object
 1   int_rate                   93853 non-null  object
 2   grade                      93853 non-null  object
 3   sub_grade                  93853 non-null  object
 4   emp_length                 85734 non-null  object
 5   home_ownership             93853 non-null  object
 6   verification_status        93853 non-null  object
 7   loan_status                93853 non-null  object
 8   pymnt_plan                 93853 non-null  object
 9   title                      93853 non-null  object
 10  addr_state                 93853 non-null  object
 11  revol_util                 93723 non-null  object
 12  initial_list_status        93853 non-null  object
 13  application_type           93853 non-null  object
 14  verifi

### Duplicates
Check for duplicates.

In [13]:
# rows that exactly repeat an earlier row (empty when there are no duplicates)
is_repeat = df.duplicated()
duplicates = df.loc[is_repeat]
duplicates

Unnamed: 0,loan_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,title,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_pr
ojected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term


## Step 3: Change Data Type

#### int_rate and revol_util
- Change field's type from object to float
- Show ratio instead of percentage

In [14]:
df['int_rate'].head()

0      7.34%
1     11.98%
2     11.98%
3     10.90%
4     10.90%
Name: int_rate, dtype: object

In [15]:
df['revol_util'].head()

0     0.5%
1    33.9%
2      36%
3    38.5%
4    26.5%
Name: revol_util, dtype: object

In [16]:
def percent_to_ratio(value):
    """Convert a percentage string such as '7.34%' into a ratio (0.0734).

    Non-string values are returned unchanged. That covers NaN for missing
    entries and floats left by a previous run of this cell, so the cell can be
    re-run without raising.
    """
    if isinstance(value, str):
        return float(value.strip('%')) / 100
    return value

# convert int_rate and revol_util from percentage strings (object) to float ratios
df['int_rate'] = df['int_rate'].apply(percent_to_ratio)
df['revol_util'] = df['revol_util'].apply(percent_to_ratio)

## Step 4: Handle Missing Values
- [Columns with more than 25% of missing values](#Columns-with-more-than-25%-of-missing-values)
- [Columns with 25% or less of missing values](#Columns-with-25%-or-less-of-missing-values)

Get the columns that have missing values.

In [17]:
def compute_percentage(counts, flag, total=None):
    """Return the share of rows carrying `flag`, as a percentage rounded to 2 decimals.

    Parameters
    ----------
    counts : pandas.Series
        Value counts whose index holds the flags and whose values hold row counts
        (the callers pass ``df[col].isnull().value_counts()``).
    flag : bool
        Index label to look up (True = null, False = non-null).
    total : int, optional
        Number of rows to divide by. When omitted, the row count of the
        notebook-level ``df`` is used, which is what existing callers rely on.

    Returns
    -------
    0 when `flag` does not occur in `counts`, otherwise the rounded percentage.
    """
    if flag not in counts.index:
        return 0
    if total is None:
        total = df.shape[0]
    return round((counts[flag] / total) * 100, 2)
    
# map every column that has missing values to its non-null / null percentages
cols_dict = {}
for col in df.columns:
    null_flag_counts = df[col].isnull().value_counts()
    non_null_pct = compute_percentage(null_flag_counts, False)
    null_pct = compute_percentage(null_flag_counts, True)

    # a column that is 100% non-null has nothing missing and is left out
    if non_null_pct < 100:
        cols_dict[col] = {'non_null_pct': non_null_pct, 'null_pct': null_pct}

# one row per column with missing values, one column per percentage
cols_df = pd.DataFrame(cols_dict).T
cols_df.head()

Unnamed: 0,non_null_pct,null_pct
emp_length,91.35,8.65
dti,99.74,0.26
mths_since_last_delinq,45.05,54.95
mths_since_last_record,14.99,85.01
revol_util,99.86,0.14


### Columns with more than 25% of missing values

In [18]:
# columns where more than 25% of the values are missing
high_null_counts = cols_df.loc[cols_df['null_pct'] > 25]
high_null_counts

Unnamed: 0,non_null_pct,null_pct
mths_since_last_delinq,45.05,54.95
mths_since_last_record,14.99,85.01
next_pymnt_d,0.84,99.16
mths_since_last_major_derog,24.5,75.5
annual_inc_joint,12.66,87.34
dti_joint,12.66,87.34
verification_status_joint,12.19,87.81
mths_since_recent_bc_dlq,20.73,79.27
mths_since_recent_revol_delinq,29.74,70.26
revol_bal_joint,12.66,87.34


In [19]:
# number of columns that have > 25% of missing values
high_null_counts.shape

(40, 2)

In [20]:
# remove the columns that have > 25% missing values
df = df.drop(columns=high_null_counts.index.tolist())
df.shape

(93853, 96)

### Columns with 25% or less of missing values
- [Non-numerical or Date Columns: Missing Values](#Non-numerical-or-Date-Columns:-Missing-Values)
- [Numerical Columns: Missing Values](#Numerical-Columns:-Missing-Values)

In [21]:
# columns where at most 25% of the values are missing
low_null_counts = cols_df.loc[cols_df['null_pct'] <= 25]
low_null_counts.shape

(15, 2)

In [22]:
low_null_counts

Unnamed: 0,non_null_pct,null_pct
emp_length,91.35,8.65
dti,99.74,0.26
revol_util,99.86,0.14
last_pymnt_d,99.32,0.68
mths_since_rcnt_il,96.67,3.33
il_util,83.81,16.19
all_util,99.97,0.03
avg_cur_bal,99.99,0.01
bc_open_to_buy,98.37,1.63
bc_util,98.3,1.7


##### Non-numerical or Date Columns: Missing Values
Leave the missing values as they are, because the values may be missing for a reason.

In [23]:
# non-numerical members of the low-missing group
low_null_frame = df[low_null_counts.index.tolist()]
non_num_low_null_cols = low_null_frame.select_dtypes(exclude=np.number).columns.tolist()
low_null_counts.loc[non_num_low_null_cols]

Unnamed: 0,non_null_pct,null_pct
emp_length,91.35,8.65
last_pymnt_d,99.32,0.68


Loan status for observations with missing values:
- **last_pymnt_d**: when the loan is **charged off**, the last payment date is null.
- **next_pymnt_d**: when the loan is **fully paid** or **charged off**, the next payment date is null.

In [24]:
# for each column, break down the rows with a missing value by loan status
for col in non_num_low_null_cols:
    missing_mask = df[col].isnull()
    print(col)
    print(df.loc[missing_mask, 'loan_status'].value_counts())
    print('\n')

emp_length
Fully Paid     5912
Charged Off    2130
Default          77
Name: loan_status, dtype: int64


last_pymnt_d
Charged Off    636
Name: loan_status, dtype: int64




##### Numerical Columns: Missing Values

Numerical columns that have <= 25% missing values.

In [25]:
# numerical members of the low-missing group (<= 25% missing values)
low_null_frame = df[low_null_counts.index.tolist()]
num_low_null_cols = low_null_frame.select_dtypes(include=np.number).columns.tolist()
low_null_counts.loc[num_low_null_cols]

Unnamed: 0,non_null_pct,null_pct
dti,99.74,0.26
revol_util,99.86,0.14
mths_since_rcnt_il,96.67,3.33
il_util,83.81,16.19
all_util,99.97,0.03
avg_cur_bal,99.99,0.01
bc_open_to_buy,98.37,1.63
bc_util,98.3,1.7
mo_sin_old_il_acct,96.67,3.33
mths_since_recent_bc,98.48,1.52


Fill in missing values with the median.

In [26]:
# fill in missing values with the column median
# The result is assigned back to the frame: calling fillna(inplace=True) on the
# df[col] selection is chained assignment, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is enabled.
for col in num_low_null_cols:
    df[col] = df[col].fillna(df[col].median())

In [27]:
df[num_low_null_cols].describe()

Unnamed: 0,dti,revol_util,mths_since_rcnt_il,il_util,all_util,avg_cur_bal,bc_open_to_buy,bc_util,mo_sin_old_il_acct,mths_since_recent_bc,mths_since_recent_inq,num_tl_120dpd_2m,percent_bc_gt_75
count,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0,93853.0
mean,18.774935,0.400162,17.912544,71.438974,52.884234,14762.288217,15784.653463,45.423859,121.46787,23.228464,6.524746,0.0,29.327407
std,20.532254,0.254007,22.962199,20.937627,21.973379,18257.625297,20274.116998,29.570081,53.752839,31.726146,5.480251,0.0,33.98451
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,10.59,0.194,6.0,62.0,38.0,3006.0,3189.0,20.3,86.0,6.0,2.0,0.0,0.0
50%,16.88,0.372,11.0,74.0,54.0,7976.0,8891.5,42.8,128.0,13.0,5.0,0.0,16.7
75%,24.13,0.585,20.0,84.0,68.0,20743.0,20399.0,69.7,152.0,27.0,9.0,0.0,50.0
max,999.0,1.322,383.0,309.0,175.0,513930.0,371701.0,158.6,822.0,551.0,25.0,0.0,100.0


## Step 5: Handle Outliers

Compute the z-score for each numerical column.
If a column contains outliers (|z-score| > 3), store the column's name and the outlier records. The outliers are only identified here; no rows are removed from `df`.

In [28]:
# a value counts as an outlier when it lies more than this many
# standard deviations away from its column mean
Z_THRESHOLD = 3

# get a list of numerical columns
num_cols = list(df.select_dtypes(include=['number']).columns)

outlier_cols = []     # columns that have outliers
outliers_list = []    # one data frame of outlier rows per entry in outlier_cols

# collect the outlier rows of each numerical column
for col in num_cols:
    # A constant column has zero standard deviation, so its z-scores are 0/0.
    # The NumPy warnings are silenced for this computation only, instead of
    # changing the global error settings for the rest of the notebook.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_score = np.abs(zscore(df[col]))

    # the scores are absolute values, so one comparison covers both tails;
    # NaN scores compare as False and are therefore never flagged
    outliers = z_score > Z_THRESHOLD
    if outliers.any():                         # if there are outliers
        outlier_cols.append(col)               # store column's name
        outliers_list.append(df[outliers])     # store data frame that contains outliers

# combine outlier data frames and drop duplicates
# (pd.concat raises on an empty list, so fall back to an empty frame with df's columns)
if outliers_list:
    outliers_df = pd.concat(outliers_list).drop_duplicates()
else:
    outliers_df = df.iloc[0:0]

In [29]:
# number of rows and columns that contain extreme values
outliers_df[outlier_cols].shape

(39559, 71)

In [30]:
df.shape

(93853, 96)

In [31]:
# list of columns that contain outliers
print(outlier_cols)

['int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_r

## Step 6: Add New Columns

### Loan Status Flag
Categorize loan status into 2 categories:
- Fully Paid
- Default (Default, Charged Off)

In [32]:
df['loan_status'].value_counts()

Fully Paid     76264
Charged Off    16803
Default          786
Name: loan_status, dtype: int64

In [33]:
def set_loan_flag(status):
    """Return the flag paired with the first group in gv.LOAN_STATUS containing `status`.

    gv.LOAN_STATUS and gv.LOAN_STATUS_FLAG are matched by position: the flag at
    index i belongs to the group at index i. Returns None when no group
    contains the status.
    """
    matching_flags = (gv.LOAN_STATUS_FLAG[position]
                      for position, group in enumerate(gv.LOAN_STATUS)
                      if status in group)
    return next(matching_flags, None)

# add new column based on the value of loan_status
df['loan_status_flag'] = df['loan_status'].map(set_loan_flag)

In [34]:
df['loan_status_flag'].value_counts()

Fully Paid    76264
Default       17589
Name: loan_status_flag, dtype: int64

### earliest_cr_line_year
Extract the year from column *earliest_cr_line*

In [35]:
# earliest_cr_line is a 'Mon-YYYY' string (e.g. 'Aug-2006'), so the year starts at position 4.
# The vectorized .str slice replaces a per-row lambda and leaves a missing value as NaN
# instead of raising.
df['earliest_cr_line_year'] = df['earliest_cr_line'].str[4:].astype('category')

In [36]:
df['earliest_cr_line_year'].head()

0    1998
1    2001
2    2007
3    2007
4    2002
Name: earliest_cr_line_year, dtype: category
Categories (63, object): [1950, 1951, 1953, 1954, ..., 2012, 2013, 2014, 2015]

In [37]:
df['earliest_cr_line_year'].describe()

count     93853
unique       63
top        2005
freq       7561
Name: earliest_cr_line_year, dtype: object

### earliest_cr_line_month
Extract the month from column *earliest_cr_line*

In [38]:
# earliest_cr_line is a 'Mon-YYYY' string, so the first three characters are the month.
# The vectorized .str slice replaces a per-row lambda and leaves a missing value as NaN
# instead of raising.
df['earliest_cr_line_month'] = df['earliest_cr_line'].str[:3]

In [39]:
df['earliest_cr_line_month'].head()

0    Jan
1    Aug
2    Mar
3    Oct
4    Mar
Name: earliest_cr_line_month, dtype: object

In [40]:
df['earliest_cr_line_month'].describe()

count     93853
unique       12
top         Aug
freq       9593
Name: earliest_cr_line_month, dtype: object

### yrs_since_earliest_cr_line
*Year(s) since earliest credit line* is the time difference between the loan issue date and the date the borrower opened his/her first credit line.

In [41]:
# years since earliest credit line
# Both columns hold 'Mon-YYYY' strings (e.g. 'Jan-2018'); an explicit format avoids
# per-value format guessing.
issue_date = pd.to_datetime(df['issue_d'], format='%b-%Y')
first_credit_date = pd.to_datetime(df['earliest_cr_line'], format='%b-%Y')

# Divide by an explicit year length: recent pandas versions reject the ambiguous
# np.timedelta64(1, 'Y') unit. 365.2425 days is the mean Gregorian year, which is
# the length NumPy assigns to that unit, so the resulting values are unchanged.
df['yrs_since_earliest_cr_line'] = (issue_date - first_credit_date) / pd.Timedelta(days=365.2425)
df['yrs_since_earliest_cr_line'].head()

0    20.161947
1    16.580765
2    11.000910
3    10.414998
4    16.000329
Name: yrs_since_earliest_cr_line, dtype: float64

In [42]:
df['yrs_since_earliest_cr_line'].describe()

count    93853.000000
mean        15.850765
std          7.672699
min          3.077407
25%         11.083048
50%         14.253544
75%         19.499374
max         68.581833
Name: yrs_since_earliest_cr_line, dtype: float64

### last_pymnt_d_year
Extract the year from *last_pymnt_d*

In [43]:
# last_pymnt_d is a 'Mon-YYYY' string or NaN; the .str slice keeps NaN as NaN.
# This replaces a lambda that used np.NaN (removed in NumPy 2.0) and a
# type(x) != float test to detect missing values.
df['last_pymnt_d_year'] = df['last_pymnt_d'].str[4:]

In [44]:
df['last_pymnt_d_year'].describe()

count     93217
unique        2
top        2019
freq      50475
Name: last_pymnt_d_year, dtype: object

In [45]:
df['last_pymnt_d_year'].value_counts()

2019    50475
2018    42742
Name: last_pymnt_d_year, dtype: int64

### last_pymnt_d_month
Extract the month from *last_pymnt_d*

In [46]:
# take the month abbreviation from the 'Mon-YYYY' string; rows with no last payment
# date get the placeholder 'Missing' (replaces a type(x) != float check per row)
df['last_pymnt_d_month'] = df['last_pymnt_d'].str[:3].fillna('Missing')

In [47]:
df['last_pymnt_d_month'].describe()

count     93853
unique       13
top         May
freq      10379
Name: last_pymnt_d_month, dtype: object

### last_credit_pull_d_year
Extract the year from *last_credit_pull_d*

In [48]:
# last_credit_pull_d is a 'Mon-YYYY' string or NaN; the .str slice keeps NaN as NaN.
# This replaces a lambda that used np.NaN (removed in NumPy 2.0) and a
# type(x) != float test to detect missing values.
df['last_credit_pull_d_year'] = df['last_credit_pull_d'].str[4:]

In [49]:
df['last_credit_pull_d_year'].describe()

count     93851
unique        3
top        2019
freq      80085
Name: last_credit_pull_d_year, dtype: object

In [50]:
df['last_credit_pull_d_year'].value_counts()

2019    80085
2018    13719
2017       47
Name: last_credit_pull_d_year, dtype: int64

### last_credit_pull_d_month
Extract the month from *last_credit_pull_d*

In [51]:
# take the month abbreviation from the 'Mon-YYYY' string; rows with no credit pull
# date get the placeholder 'Missing' (replaces a type(x) != float check per row)
df['last_credit_pull_d_month'] = df['last_credit_pull_d'].str[:3].fillna('Missing')

In [52]:
df['last_credit_pull_d_month'].describe()

count     93853
unique       13
top         Jul
freq      39820
Name: last_credit_pull_d_month, dtype: object

In [53]:
df['last_credit_pull_d_month'].value_counts()

Jul        39820
Jun        11583
May         9453
Apr         7482
Mar         6463
Feb         4719
Jan         4174
Dec         2832
Nov         2521
Oct         2121
Sep         1403
Aug         1280
Missing        2
Name: last_credit_pull_d_month, dtype: int64

### issue_d_month
Extract the month from loan issue date, issue_d

In [54]:
# issue_d is a 'Mon-YYYY' string, so the first three characters are the month.
# The vectorized .str slice replaces a per-row lambda and leaves a missing value as NaN
# instead of raising.
df['issue_d_month'] = df['issue_d'].str[:3]

In [55]:
df['issue_d_month'].value_counts()

Jan    11613
Mar    10615
Apr    10583
May    10469
Feb     9613
Jun     8273
Jul     7762
Aug     7047
Oct     5245
Sep     5143
Nov     4125
Dec     3365
Name: issue_d_month, dtype: int64

## Step 7: Change Object Column Type to Category

In [56]:
# Non-numerical columns to leave out of the categorical conversion below:
# the raw date strings and the extracted year columns.
# NOTE(review): next_pymnt_d is in the >25%-missing group dropped in Step 4, so it is
# presumably no longer in df at this point; listing it is harmless — confirm.
excluded_cols = ['issue_d', 'earliest_cr_line', 'earliest_cr_line_year', 'last_pymnt_d', 
                 'last_credit_pull_d',  'last_credit_pull_d_year', 'last_pymnt_d_year', 'next_pymnt_d']

In [57]:
# every non-numerical column except the excluded ones
cat_cols = set(df.select_dtypes(exclude=np.number).columns).difference(excluded_cols)

In [58]:
cat_cols

{'addr_state',
 'application_type',
 'debt_settlement_flag',
 'earliest_cr_line_month',
 'emp_length',
 'grade',
 'hardship_flag',
 'home_ownership',
 'initial_list_status',
 'issue_d_month',
 'last_credit_pull_d_month',
 'last_pymnt_d_month',
 'loan_status',
 'loan_status_flag',
 'pymnt_plan',
 'sub_grade',
 'term',
 'title',
 'verification_status'}

In [59]:
# convert to ordered categories
emp_length_levels = ['< 1 year', '1 year', '2 years', '3 years',
                     '4 years', '5 years', '6 years', '7 years',
                     '8 years', '9 years', '10+ years']
month_levels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_cols = ['issue_d_month', 'earliest_cr_line_month', 'last_pymnt_d_month', 'last_credit_pull_d_month']

for col in cat_cols:
    if col == 'emp_length':
        categories = emp_length_levels
    elif col in month_cols:
        categories = list(month_levels)
        # Step 6 writes the placeholder 'Missing' into some month columns. A value
        # that is not among the categories is turned into NaN by astype, so the
        # placeholder is added (after 'Dec') whenever the column contains it.
        if (df[col] == 'Missing').any():
            categories.append('Missing')
    elif col == 'loan_status_flag':
        categories = gv.LOAN_STATUS_FLAG[:2]
    else:
        # all observed values in sorted order
        categories = list(df[col].value_counts().sort_index().index)

    ordered_cat = pd.api.types.CategoricalDtype(categories=categories, ordered=True)
    df[col] = df[col].astype(ordered_cat)

## Step 8: Export Data

In [60]:
# Save as a pickle file, which keeps the categorical dtypes set in Step 7
# (a CSV round-trip would not).
# to_pickle does not create missing directories, so make sure 'out/' exists first.
out_path = Path('out') / '2018_LC_Loans_Cleaned.pkl'
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(out_path)