## Data Pre-Processing

### a. Load Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

In [2]:
ordinal_encoder = OrdinalEncoder()
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(drop='first', sparse=False)
# define one hot encoding
#encoder = OneHotEncoder(drop='first', sparse=False)

# define imputer
#strategies = ['mean', 'median', 'most_frequent', 'constant']
imputer = SimpleImputer(strategy='mean')
# define imputer
imputer = KNNImputer()
# fit on the dataset
#imputer.fit(X)
# transform the dataset
#Xtrans = imputer.transform(X)

### b. Load Dataset

In [4]:
missing_values = ["n/a", "na", "--", "?"]
df = pd.read_csv('Data/lending_club_loan_two.csv',na_values = missing_values)

In [4]:
df.shape

(396030, 27)

In [5]:
print(df.dtypes)

loan_amnt               float64
term                     object
int_rate                float64
installment             float64
grade                    object
sub_grade                object
emp_title                object
emp_length               object
home_ownership           object
annual_inc              float64
verification_status      object
issue_d                  object
loan_status              object
purpose                  object
title                    object
dti                     float64
earliest_cr_line         object
open_acc                float64
pub_rec                 float64
revol_bal               float64
revol_util              float64
total_acc               float64
initial_list_status      object
application_type         object
mort_acc                float64
pub_rec_bankruptcies    float64
address                  object
dtype: object


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396030 entries, 0 to 396029
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan_amnt             396030 non-null  float64
 1   term                  396030 non-null  object 
 2   int_rate              396030 non-null  float64
 3   installment           396030 non-null  float64
 4   grade                 396030 non-null  object 
 5   sub_grade             396030 non-null  object 
 6   emp_title             373100 non-null  object 
 7   emp_length            377729 non-null  object 
 8   home_ownership        396030 non-null  object 
 9   annual_inc            396030 non-null  float64
 10  verification_status   396030 non-null  object 
 11  issue_d               396030 non-null  object 
 12  loan_status           396030 non-null  object 
 13  purpose               396030 non-null  object 
 14  title                 394275 non-null  object 
 15  

### c. Remove Duplicate Data

In [7]:
# calculate duplicates
dups = df.duplicated()
#print(dups)
# report if there are any duplicates
print(dups.any())
# list all duplicate rows
print(df[dups])

False
Empty DataFrame
Columns: [loan_amnt, term, int_rate, installment, grade, sub_grade, emp_title, emp_length, home_ownership, annual_inc, verification_status, issue_d, loan_status, purpose, title, dti, earliest_cr_line, open_acc, pub_rec, revol_bal, revol_util, total_acc, initial_list_status, application_type, mort_acc, pub_rec_bankruptcies, address]
Index: []

[0 rows x 27 columns]


In [8]:
# delete duplicate rows
df.drop_duplicates(inplace=True)

In [9]:
df.shape

(396030, 27)

### d. Identify and Remove column variables that only have a single value.

In [10]:
# summarize the number of unique values in each column
df.nunique().sort_values(ascending=True)

initial_list_status          2
term                         2
loan_status                  2
verification_status          3
application_type             3
home_ownership               6
grade                        7
pub_rec_bankruptcies         9
emp_length                  11
purpose                     14
pub_rec                     20
mort_acc                    33
sub_grade                   35
open_acc                    61
issue_d                    115
total_acc                  118
int_rate                   566
earliest_cr_line           684
revol_util                1226
loan_amnt                 1397
dti                       4262
annual_inc               27197
title                    48817
revol_bal                55622
installment              55706
emp_title               173104
address                 393700
dtype: int64

In [11]:
def RemoveSingleValueColumns(df):
    # get number of unique values for each column
    counts = df.nunique()
    # record columns to delete
    to_del = [i for i,v in enumerate(counts) if v == 1]
    # drop useless columns
    df.drop(to_del, axis=1, inplace=True)
    
    return df

In [12]:
print (df.shape)
df = RemoveSingleValueColumns(df)
print (df.shape)

(396030, 27)
(396030, 27)


### e. Missing Data Imputation

#### Handling Null Values 

In [13]:
null_columns = {}

all_columns = df.isnull().sum().sort_values(ascending=False)
for item in all_columns.index:
    if all_columns[item] > 0:
        null_columns[item] = 100* all_columns[item]/len(df)
        
null_columns

{'mort_acc': 9.54346892912153,
 'emp_title': 5.7899654066611115,
 'emp_length': 4.621114562028129,
 'title': 0.44314824634497385,
 'pub_rec_bankruptcies': 0.13509077595131683,
 'revol_util': 0.06969169002348306}

##### Handling Null Values of mort_acc column

In [14]:
print(df['mort_acc'].dtypes)

float64


In [15]:
print("Correlation with the mort_acc column")
df.corr()['mort_acc'].sort_values(ascending=False)

Correlation with the mort_acc column


mort_acc                1.000000
total_acc               0.381072
annual_inc              0.236320
loan_amnt               0.222315
revol_bal               0.194925
installment             0.193694
open_acc                0.109205
pub_rec_bankruptcies    0.027239
pub_rec                 0.011552
revol_util              0.007514
dti                    -0.025439
int_rate               -0.082583
Name: mort_acc, dtype: float64

In [16]:
total_acc_avg = df.groupby('total_acc').mean()['mort_acc']

In [17]:
total_acc_avg

total_acc
2.0      0.000000
3.0      0.052023
4.0      0.066743
5.0      0.103289
6.0      0.151293
           ...   
124.0    1.000000
129.0    1.000000
135.0    3.000000
150.0    2.000000
151.0    0.000000
Name: mort_acc, Length: 118, dtype: float64

In [18]:
def fill_mort_acc(total_acc,mort_acc):
    '''
    Accepts the total_acc and mort_acc values for the row.
    Checks if the mort_acc is NaN , if so, it returns the avg mort_acc value
    for the corresponding total_acc value for that row.
    
    total_acc_avg here should be a Series or dictionary containing the mapping of the
    groupby averages of mort_acc per total_acc values.
    '''
    if np.isnan(mort_acc):
        return total_acc_avg[total_acc]
    else:
        return mort_acc

In [19]:
df['mort_acc'] = df.apply(lambda x: fill_mort_acc(x['total_acc'], x['mort_acc']), axis=1)

##### Handling Null Values of emp_title column

In [20]:
print(df['emp_title'].dtypes)

object


In [21]:
df['emp_title']

0                        Marketing
1                  Credit analyst 
2                     Statistician
3                  Client Advocate
4          Destiny Management Inc.
                    ...           
396025            licensed bankere
396026                       Agent
396027                City Carrier
396028        Gracon Services, Inc
396029    Internal Revenue Service
Name: emp_title, Length: 396030, dtype: object

In [22]:
df['emp_title'].value_counts()

Teacher                         4389
Manager                         4250
Registered Nurse                1856
RN                              1846
Supervisor                      1830
                                ... 
iPlay America/Encore LLC           1
ADMINISTRATIVE SPECIALIST IV       1
sunchemical                        1
CITY CARRIERS                      1
smilemakers                        1
Name: emp_title, Length: 173104, dtype: int64

In [23]:
df = df.drop('emp_title',axis=1)

##### Handling Null Values of emp_length column

In [24]:
print(df['emp_length'].dtypes)

object


In [25]:
sorted(df['emp_length'].dropna().unique())

['1 year',
 '10+ years',
 '2 years',
 '3 years',
 '4 years',
 '5 years',
 '6 years',
 '7 years',
 '8 years',
 '9 years',
 '< 1 year']

In [26]:
df['emp_length'].value_counts()

10+ years    126041
2 years       35827
< 1 year      31725
3 years       31665
5 years       26495
1 year        25882
4 years       23952
6 years       20841
7 years       20819
8 years       19168
9 years       15314
Name: emp_length, dtype: int64

In [27]:
df['emp_length'] = df['emp_length'].fillna(value="1 year")

In [28]:
df['emp_length'] = ordinal_encoder.fit_transform(df[['emp_length']])

In [29]:
print("Correlation with the emp_length column")
df.corr()['emp_length'].sort_values(ascending=False)

Correlation with the emp_length column


emp_length              1.000000
int_rate               -0.000882
revol_util             -0.002059
open_acc               -0.009622
dti                    -0.011728
annual_inc             -0.012779
installment            -0.018400
loan_amnt              -0.020265
pub_rec                -0.023045
pub_rec_bankruptcies   -0.025572
revol_bal              -0.035898
total_acc              -0.054731
mort_acc               -0.101729
Name: emp_length, dtype: float64

In [30]:
df = df.drop('emp_length',axis=1)

##### Handling Null Values of title column

In [31]:
print(df['title'].dtypes)

object


In [32]:
df['title'].value_counts()

Debt consolidation                   152472
Credit card refinancing               51487
Home improvement                      15264
Other                                 12930
Debt Consolidation                    11608
                                      ...  
Fixing my Life                            1
Debt Consolidation CC & Dime Bank         1
Credit card and Home Repair               1
Paying off credit cards.                  1
Kickstarter                               1
Name: title, Length: 48817, dtype: int64

In [33]:
df = df.drop('title',axis=1)

##### Check Null Values 

In [34]:
df.isnull().sum().sort_values(ascending=False)

pub_rec_bankruptcies    535
revol_util              276
address                   0
loan_status               0
term                      0
int_rate                  0
installment               0
grade                     0
sub_grade                 0
home_ownership            0
annual_inc                0
verification_status       0
issue_d                   0
purpose                   0
dti                       0
earliest_cr_line          0
open_acc                  0
pub_rec                   0
revol_bal                 0
total_acc                 0
initial_list_status       0
application_type          0
mort_acc                  0
loan_amnt                 0
dtype: int64

In [35]:
# fit on the dataset
#imputer.fit(df)
# transform the dataset
#df = imputer.transform(df)

df = df.dropna()


#### Handling Missing Values (min = 0)  in numeric Column 

In [36]:
statistics = df.describe()
min_value_zero_columns = [item for item in statistics if statistics[item]['min'] == 0]
min_value_zero_columns

['annual_inc',
 'dti',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'mort_acc',
 'pub_rec_bankruptcies']

In [37]:
print("Correlation with the annual_inc column")
df.corr()['dti'].sort_values(ascending=False)

Correlation with the annual_inc column


dti                     1.000000
open_acc                0.135907
total_acc               0.101772
revol_util              0.088023
int_rate                0.078758
revol_bal               0.063461
loan_amnt               0.016338
installment             0.015511
pub_rec_bankruptcies   -0.014600
pub_rec                -0.017820
mort_acc               -0.018719
annual_inc             -0.081730
Name: dti, dtype: float64

In [38]:
df['dti'].value_counts()

14.40     307
19.20     302
16.80     301
18.00     297
20.40     295
         ... 
48.04       1
43.57       1
40.23       1
380.53      1
41.77       1
Name: dti, Length: 4262, dtype: int64

In [39]:
df.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc,pub_rec_bankruptcies
count,395219.0,395219.0,395219.0,395219.0,395219.0,395219.0,395219.0,395219.0,395219.0,395219.0,395219.0,395219.0
mean,14122.06125,13.642094,432.06902,74199.37,17.390758,11.318494,0.178481,15851.65,53.808764,25.426505,1.787387,0.121722
std,8357.054944,4.472944,250.700153,61557.25,18.032696,5.134901,0.531098,20584.27,24.440088,11.885322,2.063743,0.356273
min,500.0,5.32,16.08,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0
25%,8000.0,10.49,250.33,45000.0,11.3,8.0,0.0,6038.0,35.9,17.0,0.0,0.0
50%,12000.0,13.33,375.49,64000.0,16.92,10.0,0.0,11190.0,54.8,24.0,1.0,0.0
75%,20000.0,16.55,567.79,90000.0,22.99,14.0,0.0,19626.0,72.9,32.0,3.0,0.0
max,40000.0,30.99,1533.81,8706582.0,9999.0,90.0,86.0,1743266.0,892.3,151.0,34.0,8.0


### f. Consider Numeric Columns that have few Values (Less than 1%)
1. For Ordinal variables apply Ordinal Encoding
2. For Categorical variables where no such ordinal relationship exists, apply One Hot Encoding

In [40]:
numeric_col = df.select_dtypes(include=['int64', 'float64']).columns

In [41]:
numeric_col

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'mort_acc',
       'pub_rec_bankruptcies'],
      dtype='object')

In [42]:
# summarize the number of unique values in each column
for col in numeric_col:
    num = len(np.unique(df[col]))
    percentage = float(num) / df.shape[0] * 100
    if percentage < 1:
        print (f"Column: {col} Unique Values: {num} Percentage: {percentage:.2f}")

Column: loan_amnt Unique Values: 1392 Percentage: 0.35
Column: int_rate Unique Values: 532 Percentage: 0.13
Column: open_acc Unique Values: 60 Percentage: 0.02
Column: pub_rec Unique Values: 20 Percentage: 0.01
Column: revol_util Unique Values: 1226 Percentage: 0.31
Column: total_acc Unique Values: 118 Percentage: 0.03
Column: mort_acc Unique Values: 110 Percentage: 0.03
Column: pub_rec_bankruptcies Unique Values: 9 Percentage: 0.00


In [43]:
df['pub_rec_bankruptcies'].value_counts()

0.0    350108
1.0     42786
2.0      1847
3.0       351
4.0        82
5.0        32
6.0         7
7.0         4
8.0         2
Name: pub_rec_bankruptcies, dtype: int64

In [44]:
df['pub_rec_bankruptcies'] = label_encoder.fit_transform(df['pub_rec_bankruptcies'])

In [45]:
df['revol_util'].value_counts()

0.00      2183
53.00      752
60.00      738
61.00      734
55.00      730
          ... 
10.08        1
46.74        1
121.00       1
117.70       1
118.20       1
Name: revol_util, Length: 1226, dtype: int64

In [46]:
df['revol_util']

0         41.8
1         53.3
2         92.2
3         21.5
4         69.8
          ... 
396025    34.3
396026    95.7
396027    66.9
396028    53.8
396029    91.3
Name: revol_util, Length: 395219, dtype: float64

### g. Handling Categorical Columns

##### Handling bool type Columns
1. Apply One Hot Encoding on bool type columns

In [47]:
bool_col = df.select_dtypes(['bool']).columns

In [48]:
bool_col

Index([], dtype='object')

In [49]:
#subgrade_dummies = pd.get_dummies(df['sub_grade'],drop_first=True)

In [50]:
#df = pd.concat([df.drop('sub_grade',axis=1),subgrade_dummies],axis=1)

#### Handling object type Columns
1.	For Ordinal variables apply Ordinal Encoding
2.	For Categorical variables where no such ordinal relationship exists, apply One Hot Encoding

In [51]:
object_col = df.select_dtypes(['object']).columns

In [52]:
object_col

Index(['term', 'grade', 'sub_grade', 'home_ownership', 'verification_status',
       'issue_d', 'loan_status', 'purpose', 'earliest_cr_line',
       'initial_list_status', 'application_type', 'address'],
      dtype='object')

In [53]:
# summarize the number of unique values in each column
for col in object_col:
    num = len(np.unique(df[col]))
    percentage = float(num) / df.shape[0] * 100
    if percentage < 1:
        print (f"Column: {col} Unique Values: {num} Percentage: {percentage:.2f}")

Column: term Unique Values: 2 Percentage: 0.00
Column: grade Unique Values: 7 Percentage: 0.00
Column: sub_grade Unique Values: 35 Percentage: 0.01
Column: home_ownership Unique Values: 6 Percentage: 0.00
Column: verification_status Unique Values: 3 Percentage: 0.00
Column: issue_d Unique Values: 112 Percentage: 0.03
Column: loan_status Unique Values: 2 Percentage: 0.00
Column: purpose Unique Values: 14 Percentage: 0.00
Column: earliest_cr_line Unique Values: 684 Percentage: 0.17
Column: initial_list_status Unique Values: 2 Percentage: 0.00
Column: application_type Unique Values: 3 Percentage: 0.00


In [54]:
df['term'].value_counts()

 36 months    301247
 60 months     93972
Name: term, dtype: int64

In [58]:
df['term'] = label_encoder.fit_transform(df['term'])

In [59]:
#term_dummies = pd.get_dummies(df['term'],drop_first=True)

In [60]:
#term_dummies

In [61]:
#df = pd.concat([df.drop('term',axis=1),term_dummies],axis=1)

In [62]:
df = df.drop('grade',axis=1)

In [63]:
subgrade_dummies = pd.get_dummies(df['sub_grade'],drop_first=True)

In [64]:
df = pd.concat([df.drop('sub_grade',axis=1),subgrade_dummies],axis=1)

In [65]:
df['home_ownership'].value_counts()

MORTGAGE    198022
RENT        159395
OWN          37660
OTHER          110
NONE            29
ANY              3
Name: home_ownership, dtype: int64

In [66]:
df['home_ownership']=df['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')

In [67]:
dummies = pd.get_dummies(df['home_ownership'],drop_first=True)
df = df.drop('home_ownership',axis=1)
df = pd.concat([df,dummies],axis=1)

In [68]:
dummies = pd.get_dummies(df[['verification_status', 'application_type','initial_list_status','purpose' ]],drop_first=True)
df = df.drop(['verification_status', 'application_type','initial_list_status','purpose'],axis=1)
df = pd.concat([df,dummies],axis=1)

In [69]:
df['issue_d'].value_counts()

Oct-2014    14838
Jul-2014    12597
Jan-2015    11701
Dec-2013    10609
Nov-2013    10492
            ...  
Jan-2008        3
Dec-2007        2
Aug-2007        1
Oct-2007        1
Nov-2007        1
Name: issue_d, Length: 112, dtype: int64

In [70]:
df['issue_d'] = pd.to_datetime(df['issue_d'])

In [71]:
df['issue_month'] = df['issue_d'].apply(lambda date:date.month)
df['issue_year'] = df['issue_d'].apply(lambda date:date.year)

In [72]:
df = df.drop('issue_d',axis=1)

In [73]:
df['loan_status'].value_counts()

Fully Paid     317696
Charged Off     77523
Name: loan_status, dtype: int64

In [74]:
#0    317696 (Loan has paid) [Majority/Negative Class]
#1     77523 (defaulter) [Minority/Positive Class]

In [75]:
df['loan_status'] = df['loan_status'].map({'Fully Paid':0,'Charged Off':1})

In [76]:
df['earliest_cr_line'].value_counts()

Oct-2000    3013
Aug-2000    2930
Oct-2001    2890
Aug-2001    2879
Nov-2000    2729
            ... 
Jul-1961       1
May-1955       1
Aug-1964       1
Jun-1959       1
Aug-1958       1
Name: earliest_cr_line, Length: 684, dtype: int64

In [77]:
df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'])

In [78]:
df['cr_month'] = df['earliest_cr_line'].apply(lambda date:date.month)
df['cr_year'] = df['earliest_cr_line'].apply(lambda date:date.year)

In [79]:
df = df.drop('earliest_cr_line',axis=1)

In [80]:
df['address']

0            0174 Michelle Gateway\r\nMendozaberg, OK 22690
1         1076 Carney Fort Apt. 347\r\nLoganmouth, SD 05113
2         87025 Mark Dale Apt. 269\r\nNew Sabrina, WV 05113
3                   823 Reid Ford\r\nDelacruzside, MA 00813
4                    679 Luna Roads\r\nGreggshire, VA 11650
                                ...                        
396025     12951 Williams Crossing\r\nJohnnyville, DC 30723
396026    0114 Fowler Field Suite 028\r\nRachelborough, ...
396027    953 Matthew Points Suite 414\r\nReedfort, NY 7...
396028    7843 Blake Freeway Apt. 229\r\nNew Michael, FL...
396029        787 Michelle Causeway\r\nBriannaton, AR 48052
Name: address, Length: 395219, dtype: object

In [81]:
df['zip_code'] = df['address'].apply(lambda address:address[-5:])

In [82]:
dummies = pd.get_dummies(df['zip_code'],drop_first=True)
df = df.drop(['zip_code','address'],axis=1)
df = pd.concat([df,dummies],axis=1)

In [83]:
df.select_dtypes(['object']).columns

Index([], dtype='object')

### h. Save Data

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 395219 entries, 0 to 396029
Data columns (total 82 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   loan_amnt                            395219 non-null  float64
 1   term                                 395219 non-null  int32  
 2   int_rate                             395219 non-null  float64
 3   installment                          395219 non-null  float64
 4   annual_inc                           395219 non-null  float64
 5   loan_status                          395219 non-null  int64  
 6   dti                                  395219 non-null  float64
 7   open_acc                             395219 non-null  float64
 8   pub_rec                              395219 non-null  float64
 9   revol_bal                            395219 non-null  float64
 10  revol_util                           395219 non-null  float64
 11  total_acc    

In [88]:
df.shape

(395219, 82)

In [86]:
#saving the dataframe 
df.to_csv('../Data/classification_testData_shape(395219, 82).csv') 