# Preprocessing Discrete Variables

In [1]:
# Standard Imports
import pandas as pd
import numpy as np

## Read the data

In [2]:
# Read in the data in pickle format
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
loan_data = pd.read_pickle('./dataset/loan_data_2014.pkl')

In [3]:
# Preview a random sample of six rows.
# A fixed random_state makes the preview reproducible: the same rows are shown
# every time the notebook is re-run from a fresh kernel.
loan_data.sample(6, random_state=42)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,emp_length_int,term_int,earliest_cr_line_date,mths_since_earliest_cr_line,issue_date_mths,mths_since_issue_date
125705,5770372,7202397,3000,3000,3000.0,36 months,20.49,112.25,E,E1,...,7500.0,,,,2.0,36,2001-11-01,193.0,54.0,54.0
367380,19196662,21399361,14700,14700,14700.0,36 months,12.99,495.24,C,C1,...,11700.0,,,,10.0,36,2004-07-01,161.0,42.0,42.0
91354,7046514,8708644,6600,6600,6600.0,36 months,12.35,220.32,B,B4,...,26100.0,,,,0.0,36,2004-10-01,158.0,51.0,51.0
166383,3344651,4146819,17625,17625,17625.0,60 months,16.29,431.33,C,C4,...,12200.0,,,,10.0,60,1994-08-01,280.0,58.0,58.0
228948,1101158,1337897,10000,10000,10000.0,36 months,16.29,353.01,D,D1,...,,,,,10.0,36,1991-12-01,312.0,71.0,71.0
378951,16492475,18584923,15000,15000,15000.0,60 months,14.99,356.78,C,C5,...,51400.0,,,,10.0,60,2000-12-01,204.0,42.0,42.0


### Preprocessing a few discrete variables

In [4]:
# Summarise the frame: column names, non-null counts and dtypes
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466285 entries, 0 to 466284
Data columns (total 80 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   id                           466285 non-null  int64         
 1   member_id                    466285 non-null  int64         
 2   loan_amnt                    466285 non-null  int64         
 3   funded_amnt                  466285 non-null  int64         
 4   funded_amnt_inv              466285 non-null  float64       
 5   term                         466285 non-null  object        
 6   int_rate                     466285 non-null  float64       
 7   installment                  466285 non-null  float64       
 8   grade                        466285 non-null  object        
 9   sub_grade                    466285 non-null  object        
 10  emp_title                    438697 non-null  object        
 11  emp_length                

In [5]:
# Inspect the raw grade labels before encoding them
loan_data.loc[:, 'grade']

0         B
1         C
2         C
3         C
4         B
         ..
466280    C
466281    D
466282    D
466283    A
466284    D
Name: grade, Length: 466285, dtype: object

In [6]:
# One-hot encode the grade column; the resulting indicator columns are
# named 'grade:<value>'
pd.get_dummies(
    data=loan_data['grade'],
    prefix='grade',
    prefix_sep=':',
)

Unnamed: 0,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,grade:G
0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
466280,0,0,1,0,0,0,0
466281,0,0,0,1,0,0,0
466282,0,0,0,1,0,0,0
466283,1,0,0,0,0,0,0


In [7]:
# Create dummy variables for the discrete columns.
# Every column is encoded the same way (prefix = column name, ':' separator),
# so the column names are listed once and the encoding is applied in a
# comprehension instead of one hand-written call per column.
discrete_columns = [
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'loan_status',
    'purpose',
    'addr_state',
    'initial_list_status',
]

# Build the per-column dummy frames and join them side by side in one step.
# loan_data_dummies is only ever a DataFrame (never an intermediate list), so
# this cell can be re-run safely.
loan_data_dummies = pd.concat(
    [pd.get_dummies(loan_data[column], prefix=column, prefix_sep=':')
     for column in discrete_columns],
    axis=1,
)

In [9]:
# Check the type: the concatenated dummies should be a single DataFrame
type(loan_data_dummies)

pandas.core.frame.DataFrame

In [11]:
# Concat to original DataFrame.
# Any dummy columns already present (e.g. from a previous run of this cell) are
# dropped first, so re-executing the cell does not append duplicate columns.
# errors='ignore' makes the drop a no-op on the first run.
loan_data = pd.concat(
    [
        loan_data.drop(columns=loan_data_dummies.columns, errors='ignore'),
        loan_data_dummies,
    ],
    axis=1,
)

In [13]:
# List every column name to check that the dummy columns were appended to loan_data
loan_data.columns.values

array(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'pymnt_plan',
       'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti',
       'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
       'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint