# Data Wrangling for Loan Datasets
Perform data wrangling on the 2018 loan data collected from LendingClub. The data consists of four csv files, one for each quarter of 2018. The datasets can be found [here](https://github.com/nphan20181/Loan-Default-Prediction/tree/master/data).

Step-by-step data wrangling:
- [Step 1: Import Data](#Step-1:-Import-Data)
- [Step 2: Inspect Data](#Step-2:-Inspect-Data)
- [Step 3: Change Data Type](#Step-3:-Change-Data-Type)
- [Step 4: Handle Missing Values](#Step-4:-Handle-Missing-Values)
- [Step 5: Handle Outliers](#Step-5:-Handle-Outliers)
- [Step 6: Export Data](#Step-6:-Export-Data)

## Step 1: Import Data

In [1]:
import glob

# get a list of csv files to be read; glob returns matches in arbitrary order,
# so sort them to load the quarters in a reproducible (Q1..Q4) order
csv_file = sorted(glob.glob('data/*.csv'))
csv_file

['data\\LoanStats_2018Q1.csv',
 'data\\LoanStats_2018Q2.csv',
 'data\\LoanStats_2018Q3.csv',
 'data\\LoanStats_2018Q4.csv']

In [2]:
import pandas as pd
pd.set_option('display.max_columns', 150)

# columns that contain no values in the raw files
empty_cols = ['id', 'member_id', 'url', 'desc']

list_data = []
for filename in csv_file:
    # the first row of each file holds a general note, so skip it when reading
    data = pd.read_csv(filename, skiprows=[0], low_memory=False)
    data = data.drop(columns=empty_cols)
    # the last 2 rows only contain the total amount funded in policy code 1 and 2
    data = data.iloc[:-2]
    list_data.append(data)

# stack the per-file frames into one data frame with a fresh 0..n-1 index
df = pd.concat(list_data, ignore_index=True)

## Step 2: Inspect Data
- [Numerical Columns](#Numerical-Columns)
- [Non-numerical Columns](#Non-numerical-Columns)
- [Duplicates](#Duplicates)

In [3]:
# show number of rows and columns
df.shape

(495242, 140)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495242 entries, 0 to 495241
Columns: 140 entries, loan_amnt to settlement_term
dtypes: float64(104), object(36)
memory usage: 529.0+ MB


### Numerical Columns

In [5]:
num_cols_df = df.select_dtypes(include=["number"])     # numerical columns
num_cols_df.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record',
       ...
       'deferral_term', 'hardship_amount', 'hardship_length', 'hardship_dpd',
       'orig_projected_additional_accrued_interest',
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=104)

In [6]:
num_cols_df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,annual_inc_joint,dti_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
0,10000.0,10000.0,10000.0,225.54,80000.0,7.59,0.0,1.0,27.0,,17.0,0.0,10710.0,23.0,8054.53,8054.53,3369.09,3369.09,1945.47,1423.62,0.0,0.0,0.0,225.54,0.0,27.0,1.0,,,0.0,0.0,299196.0,1.0,0.0,0.0,1.0,18.0,0.0,,6.0,12.0,1785.0,38.0,28400.0,2.0,0.0,4.0,13.0,17600.0,4193.0,66.5,0.0,0.0,133.0,137.0,4.0,4.0,2.0,4.0,27.0,4.0,27.0,2.0,10.0,14.0,10.0,11.0,3.0,16.0,18.0,14.0,17.0,0.0,0.0,0.0,6.0,91.3,40.0,0.0,0.0,388400.0,10710.0,12500.0,0.0,,,,,,,,,,,,,,,,,,,,
1,11200.0,11200.0,11200.0,367.82,44000.0,43.97,1.0,2.0,6.0,,8.0,0.0,1526.0,14.0,9752.03,9752.03,5478.98,5478.98,1447.97,4031.01,0.0,0.0,0.0,367.82,0.0,70.0,1.0,81000.0,31.94,0.0,0.0,67173.0,1.0,4.0,1.0,4.0,8.0,65647.0,89.0,1.0,1.0,1011.0,84.0,6200.0,8.0,1.0,10.0,5.0,8397.0,632.0,66.7,0.0,0.0,124.0,128.0,5.0,5.0,0.0,34.0,35.0,0.0,35.0,1.0,2.0,3.0,2.0,3.0,8.0,4.0,6.0,3.0,8.0,0.0,0.0,0.0,2.0,71.4,0.0,0.0,0.0,80367.0,67173.0,1900.0,74167.0,7101.0,3.0,1.0,14.0,80.0,11.0,8.0,0.0,2.0,37.0,,,,,,,,,,
2,6500.0,6500.0,6500.0,197.95,50000.0,8.66,0.0,0.0,58.0,,7.0,0.0,7871.0,16.0,3756.5,3756.5,3153.97,3153.97,2743.5,410.47,0.0,0.0,0.0,197.95,0.0,58.0,1.0,,,0.0,370.0,243513.0,0.0,1.0,0.0,0.0,32.0,794.0,12.0,0.0,1.0,5467.0,19.0,39000.0,0.0,0.0,0.0,2.0,34788.0,31129.0,20.2,0.0,0.0,158.0,174.0,21.0,21.0,2.0,21.0,,,,1.0,3.0,3.0,5.0,11.0,2.0,5.0,12.0,3.0,7.0,0.0,0.0,0.0,0.0,93.8,20.0,0.0,0.0,289008.0,8665.0,39000.0,6500.0,,,,,,,,,,,,,,,,,,,,
3,25000.0,25000.0,25000.0,688.35,65000.0,12.89,1.0,1.0,22.0,,7.0,0.0,8657.0,16.0,21019.97,21019.97,10264.56,10264.56,3980.03,6284.53,0.0,0.0,0.0,688.35,0.0,23.0,1.0,,,0.0,0.0,74795.0,0.0,2.0,0.0,2.0,16.0,8382.0,82.0,0.0,0.0,3237.0,90.0,8800.0,4.0,3.0,3.0,2.0,10685.0,63.0,98.1,0.0,0.0,69.0,126.0,72.0,16.0,2.0,126.0,,0.0,22.0,2.0,1.0,3.0,1.0,1.0,4.0,3.0,9.0,3.0,7.0,0.0,0.0,1.0,0.0,75.0,100.0,0.0,0.0,101234.0,17039.0,3300.0,10220.0,,,,,,,,,,,,,,,,,,,,
4,6000.0,6000.0,6000.0,194.77,46000.0,8.92,0.0,0.0,,,11.0,0.0,5566.0,12.0,3334.21,3334.21,3304.13,3304.13,2665.79,638.34,0.0,0.0,0.0,194.77,0.0,,1.0,,,0.0,0.0,17000.0,0.0,3.0,0.0,0.0,42.0,11434.0,87.0,1.0,4.0,3497.0,53.0,18800.0,0.0,1.0,1.0,4.0,1545.0,1703.0,67.3,0.0,0.0,85.0,94.0,10.0,10.0,0.0,10.0,,11.0,,0.0,1.0,3.0,2.0,2.0,4.0,8.0,8.0,3.0,11.0,0.0,0.0,0.0,1.0,100.0,50.0,0.0,0.0,31925.0,17000.0,5200.0,13125.0,,,,,,,,,,,,,,,,,,,,


In [7]:
num_cols_df.describe()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,annual_inc_joint,dti_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,495242.0,495242.0,495242.0,495242.0,495242.0,494110.0,495242.0,495242.0,218590.0,62984.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,114833.0,495242.0,68985.0,68985.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,476832.0,495242.0,414418.0,495242.0,495242.0,495242.0,495113.0,495242.0,495242.0,495242.0,495242.0,495242.0,495202.0,488654.0,488439.0,495242.0,495242.0,476832.0,495242.0,495242.0,495242.0,495242.0,489044.0,98110.0,433937.0,142690.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,482838.0,495242.0,495242.0,495242.0,495240.0,488646.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,68985.0,68985.0,68985.0,68985.0,67788.0,68985.0,68985.0,68985.0,68985.0,22377.0,807.0,807.0,807.0,807.0,644.0,807.0,807.0,1323.0,1323.0,1323.0
mean,16025.020394,16025.020394,16021.669277,466.608863,80093.99,19.668887,0.229252,0.442192,36.889771,83.338689,11.491554,0.134514,16270.83,22.624151,10052.778169,10050.925371,7092.26512,7090.557323,5443.572847,1621.717003,0.838209,26.137063,4.639406,2169.116664,0.01768,46.319281,1.0,128341.9,19.357812,5.2e-05,214.3619,143959.8,0.898078,2.700815,0.672936,1.526753,21.10045,35323.69,68.003902,1.228062,2.623687,5862.635,54.087778,38345.38,1.086802,1.484854,1.937368,4.426622,13709.348688,15057.282179,49.867127,0.006823,1.792829,122.954602,174.638102,15.131693,8.718534,1.336308,25.552897,40.46717,7.488555,37.753115,0.467858,3.614485,5.361825,4.840203,7.092944,8.169146,8.163348,12.9185,5.324668,11.470647,0.0,4.6e-05,0.059932,2.030478,94.5809,32.891971,0.123489,0.010918,184941.7,51921.92,26679.46,45688.1,34757.05,0.607538,1.538189,11.486497,57.108528,2.998043,12.517707,0.041748,0.070943,37.326183,3.0,202.309579,3.0,13.895911,617.815901,15617.576109,221.043247,7164.330166,51.777793,18.367347
std,10138.075023,10138.075023,10137.900298,286.908951,88871.61,20.458244,0.743665,0.724613,21.777614,23.067742,5.947129,0.384829,22832.36,12.104004,8954.559746,8954.317018,6639.270539,6638.22645,6207.480925,1464.377079,8.088197,314.083166,56.090784,5406.25954,0.14657,21.603329,0.0,84323.31,8.089185,0.007245,11663.0,167489.9,1.121055,2.930219,0.927266,1.55093,25.774841,45478.43,23.798918,1.492159,2.568786,5913.583,21.027364,37341.24,1.502175,2.666036,2.360009,3.234548,17229.5515,19548.389919,29.005,0.093093,211.140366,55.854363,100.403761,18.750686,9.623258,1.708444,33.788958,22.244537,6.062303,21.971781,1.367011,2.423669,3.434093,3.181282,4.520551,7.353481,4.936628,7.896373,3.366916,5.941238,0.0,0.006815,0.420412,1.89003,9.092862,35.015405,0.335296,0.18274,188694.0,53396.67,25824.89,47972.06,29599.11,0.969398,1.766506,6.670732,25.83645,3.236424,8.211074,0.39047,0.393264,23.80332,0.0,145.02573,0.0,8.339066,428.247119,8958.017151,205.956305,4894.04444,8.47118,6.650417
min,1000.0,1000.0,725.0,29.76,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5693.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.95,3.0,0.0,31.47,424.11,0.07,307.0,29.92,1.0
25%,8000.0,8000.0,8000.0,254.545,46000.0,11.43,0.0,0.0,19.0,69.0,7.0,0.0,5304.0,14.0,2823.91,2822.74,2938.62,2937.6325,1890.59,582.75,0.0,0.0,0.0,279.25,0.0,29.0,1.0,85000.0,13.38,0.0,0.0,26885.0,0.0,1.0,0.0,0.0,7.0,7851.0,53.0,0.0,1.0,2183.0,40.0,16200.0,0.0,0.0,0.0,2.0,2904.0,2867.0,25.95,0.0,0.0,85.0,102.0,4.0,3.0,0.0,6.0,23.0,2.0,20.0,0.0,2.0,3.0,3.0,4.0,3.0,5.0,7.0,3.0,7.0,0.0,0.0,0.0,1.0,92.3,0.0,0.0,0.0,52059.0,19530.0,9800.0,14926.0,15477.0,0.0,0.0,7.0,38.3,1.0,7.0,0.0,0.0,17.0,3.0,89.81,3.0,7.0,281.82,8182.54,60.74,3468.5,45.0,16.0
50%,14000.0,14000.0,14000.0,386.82,66000.0,17.71,0.0,0.0,34.0,87.0,10.0,0.0,10832.0,21.0,8113.44,8113.44,5070.74,5069.14,3350.18,1170.3,0.0,0.0,0.0,465.68,0.0,46.0,1.0,114000.0,18.9,0.0,0.0,74550.5,1.0,2.0,0.0,1.0,13.0,22452.0,71.0,1.0,2.0,4441.0,55.0,28700.0,1.0,0.0,1.0,4.0,7033.0,8301.0,48.8,0.0,0.0,129.0,156.0,9.0,6.0,1.0,15.0,38.0,6.0,35.0,0.0,3.0,5.0,4.0,6.0,6.0,7.0,11.0,5.0,10.0,0.0,0.0,0.0,2.0,100.0,25.0,0.0,0.0,116292.5,37556.0,19200.0,33754.0,27350.0,0.0,1.0,10.0,59.1,2.0,11.0,0.0,0.0,36.0,3.0,170.47,3.0,14.0,532.02,13853.18,163.64,5811.0,50.0,18.0
75%,22000.0,22000.0,22000.0,629.04,96000.0,25.03,0.0,1.0,53.0,102.0,14.0,0.0,19867.0,29.0,15463.66,15463.66,8838.35,8837.23,6262.32,2199.0025,0.0,0.0,0.0,837.67,0.0,64.0,1.0,152000.0,24.96,0.0,0.0,218044.5,1.0,3.0,1.0,2.0,25.0,45845.0,85.0,2.0,4.0,7772.0,69.0,48700.0,2.0,2.0,3.0,6.0,19031.0,19694.0,74.4,0.0,0.0,155.0,226.0,19.0,11.0,2.0,30.0,57.0,11.0,53.0,0.0,5.0,7.0,6.0,9.0,11.0,10.0,17.0,7.0,14.0,0.0,0.0,0.0,3.0,100.0,57.1,0.0,0.0,269291.5,66119.0,35200.0,61914.0,45215.0,1.0,2.0,15.0,77.8,4.0,17.0,0.0,0.0,57.0,3.0,279.065,3.0,21.0,843.285,22288.105,337.56,9817.5,60.0,24.0
max,40000.0,40000.0,40000.0,1670.15,9930475.0,999.0,58.0,5.0,226.0,127.0,101.0,52.0,2358150.0,160.0,39354.99,39354.99,53094.482179,53094.48,40000.0,16384.35,655.1,33122.07,5961.9726,41353.67,9.0,226.0,1.0,7874821.0,39.99,1.0,6214661.0,9971659.0,15.0,56.0,8.0,20.0,507.0,1837038.0,1000.0,26.0,50.0,1170668.0,239.0,2087500.0,38.0,68.0,67.0,54.0,623229.0,605996.0,201.6,9.0,65000.0,848.0,826.0,502.0,382.0,87.0,661.0,194.0,25.0,190.0,58.0,50.0,72.0,69.0,86.0,130.0,91.0,151.0,65.0,101.0,0.0,1.0,58.0,26.0,100.0,100.0,7.0,52.0,9999999.0,2622906.0,1569000.0,2118996.0,1110019.0,6.0,27.0,67.0,434.3,43.0,106.0,21.0,23.0,185.0,3.0,876.46,3.0,30.0,2535.66,40149.35,1159.62,28503.0,98.24,36.0


### Non-numerical Columns

In [8]:
non_num_cols = df.select_dtypes(exclude=["number"])    # non-numerical columns
non_num_cols.columns 

Index(['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state',
       'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d',
       'next_pymnt_d', 'last_credit_pull_d', 'application_type',
       'verification_status_joint', 'sec_app_earliest_cr_line',
       'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status',
       'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',
       'hardship_loan_status', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date'],
      dtype='object')

In [9]:
non_num_cols.head()

Unnamed: 0,term,int_rate,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,last_pymnt_d,next_pymnt_d,last_credit_pull_d,application_type,verification_status_joint,sec_app_earliest_cr_line,hardship_flag,hardship_type,hardship_reason,hardship_status,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_loan_status,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date
0,60 months,12.61%,C,C1,Supervisor,4 years,MORTGAGE,Source Verified,Mar-2018,Current,n,home_improvement,Home improvement,306xx,GA,Oct-2006,37.7%,w,Jul-2019,Jul-2019,Jul-2019,Individual,,,N,,,,,,,,N,,,
1,60 months,30.79%,G,G1,Client services,< 1 year,RENT,Not Verified,Mar-2018,Current,n,medical,Medical expenses,030xx,NH,Jul-2007,24.6%,w,Jul-2019,Jul-2019,Jul-2019,Joint App,Not Verified,Feb-2005,N,,,,,,,,N,,,
2,36 months,6.07%,A,A2,dental assistant,10+ years,MORTGAGE,Not Verified,Mar-2018,Current,n,debt_consolidation,Debt consolidation,970xx,OR,Sep-2003,20.2%,w,Jul-2019,Aug-2019,Jul-2019,Individual,,,N,,,,,,,,N,,,
3,60 months,21.85%,D,D5,Asphalt Supervisor,10+ years,MORTGAGE,Source Verified,Mar-2018,Current,n,debt_consolidation,Debt consolidation,361xx,AL,Mar-1995,98.4%,w,Jul-2019,Jul-2019,Jul-2019,Individual,,,N,,,,,,,,N,,,
4,36 months,10.41%,B,B3,Dental Hygienist,1 year,RENT,Not Verified,Mar-2018,Current,n,credit_card,Credit card refinancing,156xx,PA,May-2010,29.6%,w,Jul-2019,Jul-2019,Jul-2019,Individual,,,N,,,,,,,,N,,,


In [10]:
non_num_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495242 entries, 0 to 495241
Data columns (total 36 columns):
term                         495242 non-null object
int_rate                     495242 non-null object
grade                        495242 non-null object
sub_grade                    495242 non-null object
emp_title                    440583 non-null object
emp_length                   453255 non-null object
home_ownership               495242 non-null object
verification_status          495242 non-null object
issue_d                      495242 non-null object
loan_status                  495242 non-null object
pymnt_plan                   495242 non-null object
purpose                      495242 non-null object
title                        495242 non-null object
zip_code                     495242 non-null object
addr_state                   495242 non-null object
earliest_cr_line             495242 non-null object
revol_util                   494650 non-null object
initi

### Duplicates
Check for duplicates.

In [11]:
duplicates = df[df.duplicated()]    # check for duplicates
duplicates

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,ha
rdship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term


## Step 3: Change Data Type

#### int_rate
- Change field's type from object to float
- Show ratio instead of percentage

In [12]:
df['int_rate'].head()

0     12.61%
1     30.79%
2      6.07%
3     21.85%
4     10.41%
Name: int_rate, dtype: object

In [13]:
# convert int_rate from a percentage string (e.g. '12.61%') to a float ratio (0.1261);
# the vectorized string accessor avoids a row-wise apply and keeps missing values as NaN
# instead of failing on them
df['int_rate'] = df['int_rate'].str.strip('%').astype(float) / 100

In [14]:
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,ha
rdship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,10000.0,10000.0,10000.0,60 months,0.1261,225.54,C,C1,Supervisor,4 years,MORTGAGE,80000.0,Source Verified,Mar-2018,Current,n,home_improvement,Home improvement,306xx,GA,7.59,0.0,Oct-2006,1.0,27.0,,17.0,0.0,10710.0,37.7%,23.0,w,8054.53,8054.53,3369.09,3369.09,1945.47,1423.62,0.0,0.0,0.0,Jul-2019,225.54,Jul-2019,Jul-2019,0.0,27.0,1.0,Individual,,,,0.0,0.0,299196.0,1.0,0.0,0.0,1.0,18.0,0.0,,6.0,12.0,1785.0,38.0,28400.0,2.0,0.0,4.0,13.0,17600.0,4193.0,66.5,0.0,0.0,133.0,137.0,4.0,4.0,2.0,4.0,27.0,4.0,27.0,2.0,10.0,14.0,10.0,11.0,3.0,16.0,18.0,14.0,17.0,0.0,0.0,0.0,6.0,91.3,40.0,0.0,0.0,388400.0,10710.0,12500.0,0.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
1,11200.0,11200.0,11200.0,60 months,0.3079,367.82,G,G1,Client services,< 1 year,RENT,44000.0,Not Verified,Mar-2018,Current,n,medical,Medical expenses,030xx,NH,43.97,1.0,Jul-2007,2.0,6.0,,8.0,0.0,1526.0,24.6%,14.0,w,9752.03,9752.03,5478.98,5478.98,1447.97,4031.01,0.0,0.0,0.0,Jul-2019,367.82,Jul-2019,Jul-2019,0.0,70.0,1.0,Joint App,81000.0,31.94,Not Verified,0.0,0.0,67173.0,1.0,4.0,1.0,4.0,8.0,65647.0,89.0,1.0,1.0,1011.0,84.0,6200.0,8.0,1.0,10.0,5.0,8397.0,632.0,66.7,0.0,0.0,124.0,128.0,5.0,5.0,0.0,34.0,35.0,0.0,35.0,1.0,2.0,3.0,2.0,3.0,8.0,4.0,6.0,3.0,8.0,0.0,0.0,0.0,2.0,71.4,0.0,0.0,0.0,80367.0,67173.0,1900.0,74167.0,7101.0,Feb-2005,3.0,1.0,14.0,80.0,11.0,8.0,0.0,2.0,37.0,N,,,,,,,,,,,,,,,N,,,,,,
2,6500.0,6500.0,6500.0,36 months,0.0607,197.95,A,A2,dental assistant,10+ years,MORTGAGE,50000.0,Not Verified,Mar-2018,Current,n,debt_consolidation,Debt consolidation,970xx,OR,8.66,0.0,Sep-2003,0.0,58.0,,7.0,0.0,7871.0,20.2%,16.0,w,3756.5,3756.5,3153.97,3153.97,2743.5,410.47,0.0,0.0,0.0,Jul-2019,197.95,Aug-2019,Jul-2019,0.0,58.0,1.0,Individual,,,,0.0,370.0,243513.0,0.0,1.0,0.0,0.0,32.0,794.0,12.0,0.0,1.0,5467.0,19.0,39000.0,0.0,0.0,0.0,2.0,34788.0,31129.0,20.2,0.0,0.0,158.0,174.0,21.0,21.0,2.0,21.0,,,,1.0,3.0,3.0,5.0,11.0,2.0,5.0,12.0,3.0,7.0,0.0,0.0,0.0,0.0,93.8,20.0,0.0,0.0,289008.0,8665.0,39000.0,6500.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
3,25000.0,25000.0,25000.0,60 months,0.2185,688.35,D,D5,Asphalt Supervisor,10+ years,MORTGAGE,65000.0,Source Verified,Mar-2018,Current,n,debt_consolidation,Debt consolidation,361xx,AL,12.89,1.0,Mar-1995,1.0,22.0,,7.0,0.0,8657.0,98.4%,16.0,w,21019.97,21019.97,10264.56,10264.56,3980.03,6284.53,0.0,0.0,0.0,Jul-2019,688.35,Jul-2019,Jul-2019,0.0,23.0,1.0,Individual,,,,0.0,0.0,74795.0,0.0,2.0,0.0,2.0,16.0,8382.0,82.0,0.0,0.0,3237.0,90.0,8800.0,4.0,3.0,3.0,2.0,10685.0,63.0,98.1,0.0,0.0,69.0,126.0,72.0,16.0,2.0,126.0,,0.0,22.0,2.0,1.0,3.0,1.0,1.0,4.0,3.0,9.0,3.0,7.0,0.0,0.0,1.0,0.0,75.0,100.0,0.0,0.0,101234.0,17039.0,3300.0,10220.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
4,6000.0,6000.0,6000.0,36 months,0.1041,194.77,B,B3,Dental Hygienist,1 year,RENT,46000.0,Not Verified,Mar-2018,Current,n,credit_card,Credit card refinancing,156xx,PA,8.92,0.0,May-2010,0.0,,,11.0,0.0,5566.0,29.6%,12.0,w,3334.21,3334.21,3304.13,3304.13,2665.79,638.34,0.0,0.0,0.0,Jul-2019,194.77,Jul-2019,Jul-2019,0.0,,1.0,Individual,,,,0.0,0.0,17000.0,0.0,3.0,0.0,0.0,42.0,11434.0,87.0,1.0,4.0,3497.0,53.0,18800.0,0.0,1.0,1.0,4.0,1545.0,1703.0,67.3,0.0,0.0,85.0,94.0,10.0,10.0,0.0,10.0,,11.0,,0.0,1.0,3.0,2.0,2.0,4.0,8.0,8.0,3.0,11.0,0.0,0.0,0.0,1.0,100.0,50.0,0.0,0.0,31925.0,17000.0,5200.0,13125.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,


## Step 4: Handle Missing Values
- [Columns with more than 25% of missing values](#Columns-with-more-than-25%-of-missing-values)
- [Columns with 25% or less of missing values](#Columns-with-25%-or-less-of-missing-values)

Get the columns that have missing values.

In [15]:
# compute null/non-null percentage
def compute_percentage(counts, flag, total=None):
    """Return the percentage of rows labelled ``flag``, rounded to 2 decimals.

    Parameters
    ----------
    counts : pandas.Series
        Value counts of a boolean null-mask, e.g. ``df[col].isnull().value_counts()``.
    flag : bool
        Label to look up in ``counts`` (True for null, False for non-null).
    total : int, optional
        Number of rows used as the denominator. Defaults to the number of
        rows in the notebook-level ``df``.

    Returns
    -------
    0 if ``flag`` does not occur in ``counts``; otherwise the rounded percentage.
    """
    if flag not in counts.index:
        return 0
    if total is None:
        total = df.shape[0]    # fall back to the notebook-level data frame
    return round((counts[flag] / total) * 100, 2)
    
# map every column that has missing values to its non-null vs. null percentage
cols_dict = {}
for col in df.columns:
    counts = df[col].isnull().value_counts()         # tally of null (True) vs. non-null (False) rows
    false_pct = compute_percentage(counts, False)    # share of rows that hold a value
    if false_pct >= 100:
        continue                                     # column is complete, nothing to record
    true_pct = compute_percentage(counts, True)      # share of rows that are missing
    cols_dict[col] = {'non_null_pct': false_pct, 'null_pct': true_pct}

# one row per column with missing values, holding its non-null vs. null percentage
cols_df = pd.DataFrame(cols_dict).T
cols_df.head()

Unnamed: 0,non_null_pct,null_pct
emp_title,88.96,11.04
emp_length,91.52,8.48
dti,99.77,0.23
mths_since_last_delinq,44.14,55.86
mths_since_last_record,12.72,87.28


### Columns with more than 25% of missing values

In [16]:
# get a list of columns that have > 25% missing values
high_null_counts = cols_df[cols_df.null_pct > 25]
high_null_counts

Unnamed: 0,non_null_pct,null_pct
mths_since_last_delinq,44.14,55.86
mths_since_last_record,12.72,87.28
mths_since_last_major_derog,23.19,76.81
annual_inc_joint,13.93,86.07
dti_joint,13.93,86.07
verification_status_joint,12.93,87.07
mths_since_recent_bc_dlq,19.81,80.19
mths_since_recent_revol_delinq,28.81,71.19
revol_bal_joint,13.93,86.07
sec_app_earliest_cr_line,13.93,86.07


In [17]:
# number of columns that have > 25% of missing values
high_null_counts.shape

(39, 2)

In [18]:
# drop columns that have > 25% missing values
df.drop(axis=1, columns=list(high_null_counts.index), inplace=True)
df.shape

(495242, 101)

### Columns with 25% or less of missing values
- [Non-numerical Columns: Missing Values](#Non-numerical-Columns:-Missing-Values)
- [Numerical Columns: Missing Values](#Numerical-Columns:-Missing-Values)

In [19]:
# get columns that have <= 25% missing values
low_null_counts = cols_df[cols_df.null_pct <= 25]
low_null_counts.shape

(17, 2)

In [20]:
low_null_counts

Unnamed: 0,non_null_pct,null_pct
emp_title,88.96,11.04
emp_length,91.52,8.48
dti,99.77,0.23
revol_util,99.88,0.12
last_pymnt_d,99.87,0.13
next_pymnt_d,81.21,18.79
mths_since_rcnt_il,96.28,3.72
il_util,83.68,16.32
all_util,99.97,0.03
avg_cur_bal,99.99,0.01


##### Non-numerical Columns: Missing Values
Leave the missing values in the non-numerical columns as they are, because a value may be missing for a legitimate reason.

In [21]:
non_num_low_null_cols = list(df[list(low_null_counts.index)].select_dtypes(exclude=["number"]).columns)
low_null_counts.loc[non_num_low_null_cols]

Unnamed: 0,non_null_pct,null_pct
emp_title,88.96,11.04
emp_length,91.52,8.48
revol_util,99.88,0.12
last_pymnt_d,99.87,0.13
next_pymnt_d,81.21,18.79


##### Numerical Columns: Missing Values

Numerical columns that have <= 25% missing values.

In [22]:
# get a list of numerical columns that have <= 25% missing values
num_low_null_cols = list(df[list(low_null_counts.index)].select_dtypes(include=["number"]).columns)
low_null_counts.loc[num_low_null_cols]

Unnamed: 0,non_null_pct,null_pct
dti,99.77,0.23
mths_since_rcnt_il,96.28,3.72
il_util,83.68,16.32
all_util,99.97,0.03
avg_cur_bal,99.99,0.01
bc_open_to_buy,98.67,1.33
bc_util,98.63,1.37
mo_sin_old_il_acct,96.28,3.72
mths_since_recent_bc,98.75,1.25
mths_since_recent_inq,87.62,12.38


Fill in missing values with the median.

In [23]:
# Fill in missing values with the column median. The result is assigned back to
# the data frame: calling fillna(..., inplace=True) on df[col] modifies a temporary
# Series, which triggers a chained-assignment warning in recent pandas and leaves
# df unchanged when Copy-on-Write is enabled.
for col in num_low_null_cols:
    df[col] = df[col].fillna(df[col].median())

In [24]:
df[num_low_null_cols].describe()

Unnamed: 0,dti,mths_since_rcnt_il,il_util,all_util,avg_cur_bal,bc_open_to_buy,bc_util,mo_sin_old_il_acct,mths_since_recent_bc,mths_since_recent_inq,num_tl_120dpd_2m,percent_bc_gt_75
count,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0
mean,19.664409,20.799326,68.492868,54.088016,13708.809449,14967.406145,49.852468,123.179333,25.420827,7.30429,0.0,32.78686
std,20.435064,25.337617,21.798602,21.02463,17228.960152,19433.353925,28.805361,54.818304,33.597345,5.695827,0.0,34.793207
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.44,7.0,58.0,40.0,2904.0,2924.0,26.3,87.0,6.0,3.0,0.0,0.0
50%,17.71,13.0,71.0,55.0,7033.0,8301.0,48.8,129.0,15.0,6.0,0.0,25.0
75%,25.01,24.0,82.0,69.0,19029.75,19460.0,74.0,153.0,30.0,10.0,0.0,53.8
max,999.0,507.0,1000.0,239.0,623229.0,605996.0,201.6,848.0,661.0,25.0,0.0,100.0


## Step 5: Handle Outliers
- [Box Plots](#Box-Plots)
- [z-score](#z-score)

### Box Plots

In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets

# alphabetically sorted list of the numerical column names
num_cols = sorted(df.select_dtypes(include=['number']).columns)

In [26]:
sns.set(rc={'figure.figsize': (8, 9)})    # figure size used for the box plots


@ipywidgets.interact
def plot(Variable=num_cols):
    # the list of column names is passed as the default value so that
    # interact offers the columns as a dropdown; draw a box plot of the chosen one
    sns.boxplot(data=df, y=Variable)

interactive(children=(Dropdown(description='Variable', options=('acc_now_delinq', 'acc_open_past_24mths', 'all…

### z-score

In [27]:
from scipy.stats import zscore
import numpy as np
x = np.seterr(divide='ignore', invalid='ignore')

Compute the z-score for each numerical column.
If a column contains outliers (absolute z-score greater than 3), store the column's name and the outlier records.

In [28]:
outlier_cols = []     # names of the columns that have outliers
outliers_list = []    # one data frame of outlier records per such column

# flag the records whose value lies more than 3 standard deviations from the column mean
for col in num_cols:
    # NOTE: a column with zero variance produces NaN z-scores; NaN > 3 is False,
    # so such a column is never reported as having outliers
    z_score = np.abs(zscore(df[col]))

    outliers = z_score > 3    # |z| is never negative, so one comparison is enough
    if outliers.any():        # test the mask instead of materializing df[outliers] twice
        outlier_cols.append(col)               # store column's name
        outliers_list.append(df[outliers])     # store data frame that contains outliers

if outliers_list:
    combined = pd.concat(outliers_list)
    # a record flagged in several columns appears once per column; keep a single copy
    # per index label (df has a unique index) rather than comparing all row values,
    # which could also merge distinct records that happen to hold identical values
    outliers_df = combined[~combined.index.duplicated()]
else:
    # pd.concat raises on an empty list; fall back to an empty frame with df's columns
    outliers_df = df.iloc[0:0]

In [29]:
# number of rows and columns that contain extreme values
outliers_df[outlier_cols].shape

(195492, 71)

In [30]:
df.shape

(495242, 101)

In [31]:
# list of columns that contain outliers
print(outlier_cols)

['acc_now_delinq', 'acc_open_past_24mths', 'all_util', 'annual_inc', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'delinq_2yrs', 'delinq_amnt', 'dti', 'il_util', 'inq_fi', 'inq_last_12m', 'inq_last_6mths', 'installment', 'int_rate', 'last_pymnt_amnt', 'max_bal_bc', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_rcnt_il', 'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'open_acc', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'open_rv_12m', 'open_rv_24m', 'out_prncp', 'out_prncp_inv', 'pub_rec', 'pub_rec_bankruptcies', 'recoveries', 'revol_bal', 'tax_liens', 'tot_coll_amt', 'tot_cur_bal', 'tot_hi_

## Step 6: Export Data

In [32]:
import os

# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs('out', exist_ok=True)
df.to_csv('out/loan_stats_2018_cleaned.csv', index=False)