In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
# Load the loan dataset
X_train = pd.read_csv("raw_data\X_train.csv")

In [3]:
print(X_train.columns)

Index(['index', 'acc_now_delinq', 'addr_state', 'annual_inc',
       'chargeoff_within_12_mths', 'collections_12_mths_ex_med', 'delinq_2yrs',
       'dti', 'earliest_cr_line', 'emp_length', 'fico_range_high',
       'fico_range_low', 'funded_amnt', 'home_ownership', 'inq_last_12m',
       'installment', 'int_rate', 'issue_d', 'loan_amnt', 'mort_acc',
       'mths_since_last_delinq', 'mths_since_recent_bc_dlq',
       'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl',
       'num_rev_accts', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd',
       'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'open_acc', 'open_il_24m',
       'open_rv_24m', 'percent_bc_gt_75', 'pub_rec', 'pub_rec_bankruptcies',
       'purpose', 'revol_util', 'tax_liens', 'term', 'title', 'total_acc',
       'verification_status', 'zip_code'],
      dtype='object')


In [4]:
X_train.head()

Unnamed: 0,index,acc_now_delinq,addr_state,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,emp_length,...,pub_rec,pub_rec_bankruptcies,purpose,revol_util,tax_liens,term,title,total_acc,verification_status,zip_code
0,0,0.0,UT,80000.0,0.0,0.0,0.0,30.49,Oct-1996,4 years,...,1.0,1.0,credit_card,56.1%,0.0,36 months,Credit card refinancing,26.0,Source Verified,847xx
1,1,0.0,CA,82000.0,0.0,0.0,0.0,7.0,Mar-1993,10+ years,...,1.0,1.0,credit_card,88.3%,0.0,36 months,Credit card refinancing,11.0,Not Verified,900xx
2,2,0.0,NV,46080.0,0.0,0.0,1.0,17.32,Nov-2000,3 years,...,0.0,0.0,credit_card,18.1%,0.0,36 months,Credit card refinancing,31.0,Source Verified,895xx
3,3,0.0,AZ,30000.0,0.0,0.0,0.0,7.8,Jan-2010,< 1 year,...,0.0,0.0,car,33.4%,0.0,36 months,Car financing,24.0,Source Verified,853xx
4,4,0.0,OH,70000.0,0.0,0.0,0.0,13.36,Jan-2002,< 1 year,...,0.0,0.0,other,88.6%,0.0,60 months,Other,16.0,Not Verified,453xx


In [5]:
# find out which column in a pandas dataframe has two distinct values
def get_column_with_two_distinct_values(df: pd.DataFrame) -> str:
    """
    Returns the name of the column that has only two distinct values.
    """
    return df.columns[df.nunique() == 2][0]

In [6]:
print(get_column_with_two_distinct_values(X_train))

term


In [7]:
X_train['term'].nunique

<bound method IndexOpsMixin.nunique of 0           36 months
1           36 months
2           36 months
3           36 months
4           60 months
              ...    
1199856     36 months
1199857     36 months
1199858     60 months
1199859     60 months
1199860     36 months
Name: term, Length: 1199861, dtype: object>

In [5]:
num_cols = ['annual_inc', 'dti', 'fico_range_high', 'fico_range_low', 'loan_amnt',  'revol_util']

In [6]:
X_train[num_cols]

Unnamed: 0,annual_inc,dti,fico_range_high,fico_range_low,loan_amnt,revol_util
0,80000.0,30.49,689.0,685.0,16000.0,56.1%
1,82000.0,7.00,689.0,685.0,6600.0,88.3%
2,46080.0,17.32,674.0,670.0,10000.0,18.1%
3,30000.0,7.80,694.0,690.0,3000.0,33.4%
4,70000.0,13.36,674.0,670.0,20000.0,88.6%
...,...,...,...,...,...,...
1199856,41000.0,22.97,684.0,680.0,10000.0,82.1%
1199857,95000.0,25.63,709.0,705.0,17000.0,82.3%
1199858,61000.0,15.58,774.0,770.0,30000.0,6%
1199859,35000.0,24.07,719.0,715.0,17325.0,76.2%


In [7]:
if X_train.dtypes.all() == bool:
    print('The DataFrame is of boolean data type')

In [8]:
print('The data types of X_train are:\n', X_train.dtypes)

The data types of X_train are:
 index                           int64
acc_now_delinq                float64
addr_state                     object
annual_inc                    float64
chargeoff_within_12_mths      float64
collections_12_mths_ex_med    float64
delinq_2yrs                   float64
dti                           float64
earliest_cr_line               object
emp_length                     object
fico_range_high               float64
fico_range_low                float64
funded_amnt                   float64
home_ownership                 object
inq_last_12m                  float64
installment                   float64
int_rate                       object
issue_d                        object
loan_amnt                     float64
mort_acc                      float64
mths_since_last_delinq        float64
mths_since_recent_bc_dlq      float64
mths_since_recent_inq         float64
num_accts_ever_120_pd         float64
num_actv_bc_tl                float64
num_rev_accts     

In [10]:
# select only columns with numeric data types
numeric_cols = X_train.select_dtypes(include=['int64', 'float64'])
numeric_cols
# or 
# df.select_dtypes(include=[np.number])

Unnamed: 0,index,acc_now_delinq,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,fico_range_high,fico_range_low,funded_amnt,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,tax_liens,total_acc
0,0,0.0,80000.0,0.0,0.0,0.0,30.49,689.0,685.0,16000.0,...,0.0,3.0,14.0,2.0,3.0,100.0,1.0,1.0,0.0,26.0
1,1,0.0,82000.0,0.0,0.0,0.0,7.00,689.0,685.0,6600.0,...,0.0,0.0,5.0,0.0,1.0,100.0,1.0,1.0,0.0,11.0
2,2,0.0,46080.0,0.0,0.0,1.0,17.32,674.0,670.0,10000.0,...,0.0,0.0,7.0,0.0,3.0,0.0,0.0,0.0,0.0,31.0
3,3,0.0,30000.0,0.0,0.0,0.0,7.80,694.0,690.0,3000.0,...,0.0,4.0,19.0,,,0.0,0.0,0.0,0.0,24.0
4,4,0.0,70000.0,0.0,0.0,0.0,13.36,674.0,670.0,20000.0,...,0.0,0.0,4.0,3.0,1.0,100.0,0.0,0.0,0.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199856,1199856,0.0,41000.0,0.0,0.0,0.0,22.97,684.0,680.0,10000.0,...,,,5.0,,,100.0,0.0,0.0,0.0,8.0
1199857,1199857,0.0,95000.0,0.0,0.0,0.0,25.63,709.0,705.0,17000.0,...,0.0,0.0,18.0,,,83.3,0.0,0.0,0.0,29.0
1199858,1199858,0.0,61000.0,0.0,0.0,0.0,15.58,774.0,770.0,30000.0,...,0.0,4.0,23.0,,,0.0,0.0,0.0,0.0,26.0
1199859,1199859,0.0,35000.0,0.0,0.0,0.0,24.07,719.0,715.0,17325.0,...,,,26.0,,,33.3,0.0,0.0,0.0,52.0


In [13]:
non_numeric_cols = X_train.select_dtypes(exclude='number').columns
non_numeric_cols

Index(['addr_state', 'earliest_cr_line', 'emp_length', 'home_ownership',
       'int_rate', 'issue_d', 'purpose', 'revol_util', 'term', 'title',
       'verification_status', 'zip_code'],
      dtype='object')

In [14]:
non_numeric_cols.value_counts()

addr_state             1
earliest_cr_line       1
emp_length             1
home_ownership         1
int_rate               1
issue_d                1
purpose                1
revol_util             1
term                   1
title                  1
verification_status    1
zip_code               1
dtype: int64

In [15]:
X_train[non_numeric_cols]

Unnamed: 0,addr_state,earliest_cr_line,emp_length,home_ownership,int_rate,issue_d,purpose,revol_util,term,title,verification_status,zip_code
0,UT,Oct-1996,4 years,RENT,12.99%,Apr-2016,credit_card,56.1%,36 months,Credit card refinancing,Source Verified,847xx
1,CA,Mar-1993,10+ years,MORTGAGE,10.42%,Sep-2017,credit_card,88.3%,36 months,Credit card refinancing,Not Verified,900xx
2,NV,Nov-2000,3 years,MORTGAGE,8.99%,Aug-2016,credit_card,18.1%,36 months,Credit card refinancing,Source Verified,895xx
3,AZ,Jan-2010,< 1 year,OWN,11.53%,Aug-2015,car,33.4%,36 months,Car financing,Source Verified,853xx
4,OH,Jan-2002,< 1 year,RENT,26.30%,Sep-2017,other,88.6%,60 months,Other,Not Verified,453xx
...,...,...,...,...,...,...,...,...,...,...,...,...
1199856,IL,Feb-2002,4 years,RENT,15.31%,Jun-2012,debt_consolidation,82.1%,36 months,debt consolidation,Source Verified,604xx
1199857,NY,Jan-2002,3 years,RENT,6.03%,Nov-2013,credit_card,82.3%,36 months,Credit cards financing,Verified,106xx
1199858,MA,Nov-2002,8 years,OWN,11.99%,Dec-2015,home_improvement,6%,60 months,Home improvement,Source Verified,021xx
1199859,OH,Nov-1991,5 years,MORTGAGE,22.78%,Jun-2012,debt_consolidation,76.2%,60 months,Debt,Verified,454xx


In [16]:
#count unique values in non_numeric_cols each column
print(X_train[non_numeric_cols].nunique())

addr_state                51
earliest_cr_line         748
emp_length                11
home_ownership             6
int_rate                 669
issue_d                  160
purpose                   14
revol_util              1316
term                       2
title                  43858
verification_status        3
zip_code                 936
dtype: int64


In [17]:
#count unique values in non_numeric_cols each column
print(X_train.nunique())

index                         1199861
acc_now_delinq                      7
addr_state                         51
annual_inc                      59051
chargeoff_within_12_mths           11
collections_12_mths_ex_med         13
delinq_2yrs                        32
dti                              4747
earliest_cr_line                  748
emp_length                         11
fico_range_high                    38
fico_range_low                     38
funded_amnt                      1562
home_ownership                      6
inq_last_12m                       45
installment                     81217
int_rate                          669
issue_d                           160
loan_amnt                        1562
mort_acc                           39
mths_since_last_delinq            167
mths_since_recent_bc_dlq          168
mths_since_recent_inq              26
num_accts_ever_120_pd              43
num_actv_bc_tl                     36
num_rev_accts                     111
num_sats    

In [18]:
max_interest_rate = X_train['int_rate'].max()
max_interest_rate

' 30.99%'

In [19]:
max_interest_rate = max_interest_rate.replace(" ", "").replace("%", "")
max_interest_rate

'30.99'

In [40]:
X_train['mths_since_recent_bc_dlq'].max() #.nunique() #.unique() #

198.0