# Import Libraries

In [129]:
import numpy as np
import pandas as pd

# Import Data

In [130]:
# Read the raw training file once and keep it untouched as a backup;
# all preprocessing below operates on an independent deep copy.
loan_data_backup = pd.read_csv('./data/credit_train.csv')
loan_data = loan_data_backup.copy(deep=True)

In [131]:
# Remove the cap on displayed columns so wide frames render in full
pd.set_option('display.max_columns', None)

In [132]:
# Preview the first rows to confirm the CSV parsed into the expected columns
loan_data.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0


In [133]:
# Inspect the end of the frame as well; trailing rows may not look like the head
loan_data.tail()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
100509,,,,,,,,,,,,,,,,,,,
100510,,,,,,,,,,,,,,,,,,,
100511,,,,,,,,,,,,,,,,,,,
100512,,,,,,,,,,,,,,,,,,,
100513,,,,,,,,,,,,,,,,,,,


In [134]:
# Per-column dtypes and non-null counts, to spot missing values and
# text columns that still need converting
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100514 entries, 0 to 100513
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       100000 non-null  object 
 1   Customer ID                   100000 non-null  object 
 2   Loan Status                   100000 non-null  object 
 3   Current Loan Amount           100000 non-null  float64
 4   Term                          100000 non-null  object 
 5   Credit Score                  80846 non-null   float64
 6   Annual Income                 80846 non-null   float64
 7   Years in current job          95778 non-null   object 
 8   Home Ownership                100000 non-null  object 
 9   Purpose                       100000 non-null  object 
 10  Monthly Debt                  100000 non-null  float64
 11  Years of Credit History       100000 non-null  float64
 12  Months since last delinquent  46859 non-null

# General Preprocessing

In [135]:
# The file ends with padding rows in which every column is NaN (in this
# extract: row index 100,000 onward). Locate them from the data itself rather
# than a hard-coded row number so the cell keeps working if the file changes,
# and rebind instead of mutating in place so re-running the cell is harmless.
null_indx = loan_data.index[loan_data.isna().all(axis=1)]
loan_data = loan_data.drop(index=null_indx)

## Preprocessing a Few Continuous Variables

In [136]:
# List the distinct raw values to see which text formats need cleaning
loan_data['Years in current job'].unique()

array(['8 years', '10+ years', '3 years', '5 years', '< 1 year',
       '2 years', '4 years', '9 years', '7 years', '1 year', nan,
       '6 years'], dtype=object)

In [137]:
# Derive employment length (in years) from the free-text
# 'Years in current job' column. The result is still a string column here;
# it is converted to a numeric dtype in a later cell.
#   '< 1 year'  -> '0'
#   '1 year'    -> '1'
#   'N years'   -> 'N'
#   '10+ years' -> '10'
#   NaN / 'n/a' -> '0'   (missing tenure is imputed as no employment length)
#
# Replacements are applied in insertion order, and the order matters:
# '< 1 year' must run before ' year', '+ years' before ' years', and
# ' years' before ' year', otherwise stray characters are left behind.
emp_length_dict = {
    '< 1 year': str(0),
    'n/a': str(0),
    '+ years': '',
    ' years': '',
    ' year': ''
}

# regex=False makes every key a literal match, so the result does not depend
# on the pandas version's default for `regex` (True before 2.0, False after).
emp_length = loan_data['Years in current job']
for key, value in emp_length_dict.items():
    emp_length = emp_length.str.replace(key, value, regex=False)

# Assign the filled Series back explicitly; calling fillna(inplace=True) on a
# column selection is chained assignment and is not guaranteed to update the frame.
loan_data['emp_length_int'] = emp_length.fillna(str(0))

In [138]:
# Parse the cleaned digit strings into a numeric dtype
loan_data['emp_length_int'] = loan_data['emp_length_int'].pipe(pd.to_numeric)

In [139]:
# Sanity check: count missing values left in the derived column
loan_data['emp_length_int'].isna().sum()

0

In [140]:
# Distinct loan-term labels present in the data
loan_data['Term'].unique()

array(['Short Term', 'Long Term'], dtype=object)

In [141]:
# Number of loans per term label
loan_data['Term'].value_counts()

Short Term    72208
Long Term     27792
Name: Term, dtype: int64

In [142]:
# Binary flag for the loan term: 1 for 'Short Term', 0 for anything else
loan_data['is_short_term'] = loan_data['Term'].eq('Short Term').astype(int)

In [143]:
# List the current columns, including the derived ones created so far
loan_data.columns

Index(['Loan ID', 'Customer ID', 'Loan Status', 'Current Loan Amount', 'Term',
       'Credit Score', 'Annual Income', 'Years in current job',
       'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History',
       'Months since last delinquent', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens', 'emp_length_int',
       'is_short_term'],
      dtype='object')

In [145]:
# Summary statistics for the credit-history column before deriving from it
loan_data['Years of Credit History'].describe()

count    100000.000000
mean         18.199141
std           7.015324
min           3.600000
25%          13.500000
50%          16.900000
75%          21.700000
max          70.500000
Name: Years of Credit History, dtype: float64

In [150]:
# Express credit-history length in months (12 per year), then summarise
# the new column to confirm the scaling
loan_data['mths_since_earliest_cr_line'] = loan_data['Years of Credit History'].mul(12)
loan_data['mths_since_earliest_cr_line'].describe()

count    100000.000000
mean        218.389692
std          84.183884
min          43.200000
25%         162.000000
50%         202.800000
75%         260.400000
max         846.000000
Name: mths_since_earliest_cr_line, dtype: float64

In [151]:
# Sanity check: count missing values in the derived months column
loan_data['mths_since_earliest_cr_line'].isna().sum()

0