# Preprocessing Discrete Variables

In [36]:
# Standard Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Read the data

In [2]:
# Load the loan dataset from its pickle file into a DataFrame.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
loan_data = pd.read_pickle('./dataset/loan_data_2014.pkl')

In [3]:
# Show six randomly drawn rows to get a feel for the columns and values
loan_data.sample(n=6)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,emp_length_int,term_int,earliest_cr_line_date,mths_since_earliest_cr_line,issue_date_mths,mths_since_issue_date
361932,12468150,14480264,29450,29450,29450.0,36 months,10.15,952.35,B,B2,...,54800.0,,,,3.0,36,1993-10-01,290.0,41.0,41.0
275329,31517759,34110964,16000,16000,16000.0,36 months,10.15,517.41,B,B2,...,13700.0,,,,2.0,36,2001-09-01,195.0,38.0,38.0
349239,20752529,23024935,16000,16000,16000.0,60 months,14.49,376.37,C,C4,...,16800.0,,,,6.0,60,1999-04-01,224.0,41.0,41.0
365944,19165874,21368611,16000,16000,16000.0,36 months,6.49,490.32,A,A2,...,33800.0,,,,7.0,36,1994-09-01,279.0,42.0,42.0
396890,15539924,17632365,10000,10000,10000.0,36 months,14.16,342.56,C,C2,...,5600.0,,,,7.0,36,2005-08-01,148.0,43.0,43.0
219343,1253562,1495730,13800,13800,13775.0,36 months,7.62,430.03,A,A3,...,,,,,5.0,36,2006-08-01,136.0,67.0,67.0


### Preprocessing a few discrete variables

In [4]:
# Summarise every column: name, non-null count and dtype
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466285 entries, 0 to 466284
Data columns (total 80 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   id                           466285 non-null  int64         
 1   member_id                    466285 non-null  int64         
 2   loan_amnt                    466285 non-null  int64         
 3   funded_amnt                  466285 non-null  int64         
 4   funded_amnt_inv              466285 non-null  float64       
 5   term                         466285 non-null  object        
 6   int_rate                     466285 non-null  float64       
 7   installment                  466285 non-null  float64       
 8   grade                        466285 non-null  object        
 9   sub_grade                    466285 non-null  object        
 10  emp_title                    438697 non-null  object        
 11  emp_length                

In [5]:
# Inspect the raw grade column before encoding it as dummy variables
loan_data['grade']

0         B
1         C
2         C
3         C
4         B
         ..
466280    C
466281    D
466282    D
466283    A
466284    D
Name: grade, Length: 466285, dtype: object

In [6]:
# Preview the one-hot encoding of grade; the new columns are named 'grade:<level>'
pd.get_dummies(
    loan_data['grade'],
    prefix_sep=':',
    prefix='grade',
)

Unnamed: 0,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,grade:G
0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
466280,0,0,1,0,0,0,0
466281,0,0,0,1,0,0,0
466282,0,0,0,1,0,0,0
466283,1,0,0,0,0,0,0


In [7]:
# Build one dummy-variable DataFrame per discrete column.
# Each frame's columns are named '<column>:<category>'; the list keeps the
# same order as the column names below.
dummy_source_columns = [
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'loan_status',
    'purpose',
    'addr_state',
    'initial_list_status',
]
loan_data_dummies = [
    pd.get_dummies(loan_data[column_name], prefix=column_name, prefix_sep=':')
    for column_name in dummy_source_columns
]

In [8]:
# Join the per-column dummy frames side by side into a single DataFrame
loan_data_dummies = pd.concat(loan_data_dummies, axis='columns')

In [9]:
# Confirm that the concatenation produced a DataFrame (it was a list before)
type(loan_data_dummies)

pandas.core.frame.DataFrame

In [10]:
# Append the dummy columns to the original DataFrame.
# Any dummy columns that are already present are dropped first, so re-running
# this cell does not append a second, duplicate copy of every dummy column.
# On the first run nothing is dropped and the result is unchanged.
loan_data = pd.concat(
    [
        loan_data.drop(columns=loan_data_dummies.columns, errors='ignore'),
        loan_data_dummies,
    ],
    axis=1,
)

In [11]:
# Verify that the dummy columns were appended to the main DataFrame
loan_data.columns.values

array(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'pymnt_plan',
       'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti',
       'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
       'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint

### Check for missing values and clean

In [14]:
# Lift the row display limit so the count for every column is shown.
# This is a global pandas option and stays in effect for all later cells.
pd.options.display.max_rows = None
# Number of missing values per column
loan_data.isna().sum()

id                                                                      0
member_id                                                               0
loan_amnt                                                               0
funded_amnt                                                             0
funded_amnt_inv                                                         0
term                                                                    0
int_rate                                                                0
installment                                                             0
grade                                                                   0
sub_grade                                                               0
emp_title                                                           27588
emp_length                                                          21008
home_ownership                                                          0
annual_inc                            

The following variables, which are needed for the analysis, have missing values: annual_inc, delinq_2yrs, inq_last_6mths, open_acc, pub_rec, total_acc, acc_now_delinq, total_rev_hi_lim, emp_length_int, mths_since_earliest_cr_line. The missing values will not all be treated in the same way; the imputation used depends on the variable.

In [16]:
# Total revolving limit
# Modelling assumption: where the limit is missing, use the loan's funded amount.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the selected column: that chained in-place form is deprecated in recent pandas
# and leaves loan_data unchanged once Copy-on-Write is enabled.
loan_data['total_rev_hi_lim'] = loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'])

In [17]:
# Confirm that no missing values remain in the total revolving limit
loan_data['total_rev_hi_lim'].isna().sum()

0

In [20]:
# Annual Income
# Missing incomes are replaced with the mean of the observed incomes.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the selected column: that chained in-place form is deprecated in recent pandas
# and leaves loan_data unchanged once Copy-on-Write is enabled.
# NOTE(review): the mean is taken over every row of loan_data, including rows
# that later end up in the test split.
loan_data['annual_inc'] = loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean())

In [26]:
# For the remaining variables a missing value is replaced with 0.
zero_fill_columns = [
    'mths_since_earliest_cr_line',
    'acc_now_delinq',
    'total_acc',
    'pub_rec',
    'open_acc',
    'inq_last_6mths',
    'delinq_2yrs',
    'emp_length_int',
]
# Fill on a copy of just these columns, then write the values back in place
zero_filled = loan_data[zero_fill_columns].fillna(0)
loan_data.update(zero_filled)

In [27]:
# Re-count missing values per column to confirm the imputations above took effect
loan_data.isna().sum()

id                                                                      0
member_id                                                               0
loan_amnt                                                               0
funded_amnt                                                             0
funded_amnt_inv                                                         0
term                                                                    0
int_rate                                                                0
installment                                                             0
grade                                                                   0
sub_grade                                                               0
emp_title                                                           27588
emp_length                                                          21008
home_ownership                                                          0
annual_inc                            

## PD model

### Dependent Variables. Good/Bad (Default) Definition. Default and Non Default Accounts

In [30]:
# List the distinct loan_status values, to decide which ones count as default
loan_data['loan_status'].unique()

array(['Fully Paid', 'Charged Off', 'Current', 'Default',
       'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

In [31]:
# Number of accounts in each loan status, most frequent first
loan_data['loan_status'].value_counts()

Current                                                224226
Fully Paid                                             184739
Charged Off                                             42475
Late (31-120 days)                                       6900
In Grace Period                                          3146
Does not meet the credit policy. Status:Fully Paid       1988
Late (16-30 days)                                        1218
Default                                                   832
Does not meet the credit policy. Status:Charged Off       761
Name: loan_status, dtype: int64

In [32]:
# Share of accounts in each loan status (counts divided by the non-missing total)
loan_data['loan_status'].value_counts(normalize=True)

Current                                                0.480878
Fully Paid                                             0.396193
Charged Off                                            0.091092
Late (31-120 days)                                     0.014798
In Grace Period                                        0.006747
Does not meet the credit policy. Status:Fully Paid     0.004263
Late (16-30 days)                                      0.002612
Default                                                0.001784
Does not meet the credit policy. Status:Charged Off    0.001632
Name: loan_status, dtype: float64

In [33]:
# Build the binary target: Good = 1, Bad = 0.
# A loan is bad when it was charged off, is in default, or is 31-120 days late.
# The previous list flagged 'Late (16-30 days)' as bad while leaving the more
# severe 'Late (31-120 days)' status labelled good; the bad bucket now uses the
# more severe status, and loans 16-30 days late are treated as good.
bad_loan_statuses = [
    'Charged Off',
    'Default',
    'Does not meet the credit policy. Status:Charged Off',
    'Late (31-120 days)',
]
loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(bad_loan_statuses), 0, 1)

In [35]:
# Look at the first six values of the new target column
loan_data['good_bad'].head(n=6)

0    1
1    0
2    1
3    1
4    1
5    1
Name: good_bad, dtype: int64

## Splitting the Data

In [37]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# Inputs are every column except the target; the target is good_bad.
# NOTE(review): unless removed elsewhere, the inputs still contain loan_status
# and its dummy columns, from which good_bad is derived; exclude them before
# fitting a model.
(
    loan_data_inputs_train,
    loan_data_inputs_test,
    loan_data_targets_train,
    loan_data_targets_test,
) = train_test_split(
    loan_data.drop(columns='good_bad'),
    loan_data['good_bad'],
    test_size=0.2,
    random_state=42,
)

In [38]:
# (rows, columns) of the training inputs
loan_data_inputs_train.shape

(349713, 206)

In [39]:
# Length of the training targets; should match the row count of the training inputs
loan_data_targets_train.shape

(349713,)

In [40]:
# (rows, columns) of the test inputs
loan_data_inputs_test.shape

(116572, 206)

In [41]:
# Length of the test targets; should match the row count of the test inputs
loan_data_targets_test.shape

(116572,)