In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw credit-risk records; the path is relative to the notebook's working directory.
DATA_PATH = 'credit_risk_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Peek at the first rows to confirm the CSV parsed into the expected columns.
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [5]:
# (rows, columns) of the data as loaded, before any cleaning.
df.shape

(32581, 12)

In [6]:
# Per-column dtypes: separates the numeric columns from the object (string) categoricals.
df.dtypes

person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
dtype: object

In [7]:
# Summary statistics for the numeric columns. Scan min/max for implausible values:
# in this run the maxima include person_age = 144 and person_emp_length = 123.
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [8]:
# Summary of the object (categorical) columns: number of distinct levels and the most frequent one.
df.describe(include=['object'])

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
count,32581,32581,32581,32581
unique,4,6,7,2
top,RENT,EDUCATION,A,N
freq,16446,6453,10777,26836


In [9]:
# Count missing values per column to see which columns need imputation.
df.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [10]:
# Impute missing employment lengths with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df[...] selection: that form is chained assignment,
# which emits a FutureWarning in pandas 2.x and leaves df unchanged under
# copy-on-write.
df['person_emp_length'] = df['person_emp_length'].fillna(df['person_emp_length'].median())


In [11]:
def fill_with_group_median(rates):
    """Replace the missing entries of one group's rates with that group's median."""
    return rates.fillna(rates.median())


# Impute missing interest rates grade by grade, so each loan receives the median
# rate of its own loan_grade rather than a single global value.
rates_by_grade = df.groupby('loan_grade')['loan_int_rate']
df['loan_int_rate'] = rates_by_grade.transform(fill_with_group_median)


In [12]:
# Re-count missing values after imputation; every column should now report 0.
df.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [13]:
def within_iqr_fences(values, k=1.5):
    """Return a boolean mask marking the entries of `values` inside its IQR fences.

    The fences are [Q1 - k * IQR, Q3 + k * IQR], where Q1 and Q3 are the 25th and
    75th percentiles of `values` and IQR = Q3 - Q1. Both bounds are inclusive.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to screen.
    k : float, default 1.5
        Fence multiplier.

    Returns
    -------
    pandas.Series of bool
        Same index as `values`; True where the entry is kept.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return values.between(q1 - k * iqr, q3 + k * iqr)


# Drop outliers one column at a time. The order matters: the loan_amnt fences are
# computed from the rows that survived the person_income filter.
# The mask is passed directly instead of being stored in a variable named `filter`,
# which would shadow the Python builtin for the rest of the session.
# Note: df is reassigned, so re-running this cell recomputes the fences on the
# already-filtered data and can remove additional rows.
for column in ('person_income', 'loan_amnt'):
    df = df.loc[within_iqr_fences(df[column])]


In [14]:
# Shape after outlier removal; compare with the shape at load time to see how many rows were dropped.
df.shape

(29736, 12)

In [15]:
# One-hot encode the categorical columns. drop_first=True omits the first level of
# each column, which then acts as the implicit baseline category.
categorical_columns = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [16]:
# Inspect the frame after one-hot encoding to confirm the dummy columns were created.
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
1,21,9600,5.0,1000,11.14,0,0.1,2,0,1,...,0,0,0,1,0,0,0,0,0,0
2,25,9600,1.0,5500,12.87,1,0.57,3,0,0,...,1,0,0,0,1,0,0,0,0,0
5,21,9900,2.0,2500,7.14,1,0.25,2,0,1,...,0,0,1,0,0,0,0,0,0,0
9,21,10000,6.0,1600,14.74,1,0.16,3,0,1,...,0,0,1,0,0,1,0,0,0,0
11,21,10000,2.0,4500,8.63,1,0.45,2,0,1,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Pairwise correlations between all columns (df.corr() with its default method).
correlation_matrix = df.corr()

# Keep only each column's correlation with the target, strongest positive first.
target_correlations = correlation_matrix.loc[:, 'loan_status']
correlation_with_target = target_correlations.sort_values(ascending=False)
print(correlation_with_target)

loan_status                    1.000000
loan_percent_income            0.370445
loan_int_rate                  0.326378
loan_grade_D                   0.320772
person_home_ownership_RENT     0.231451
loan_grade_E                   0.177946
cb_person_default_on_file_Y    0.176957
loan_grade_F                   0.094407
loan_amnt                      0.083743
loan_grade_G                   0.066041
loan_intent_MEDICAL            0.052835
loan_intent_HOMEIMPROVEMENT    0.042082
person_home_ownership_OTHER    0.013492
loan_grade_C                  -0.007360
cb_person_cred_hist_length    -0.015853
loan_intent_PERSONAL          -0.018311
person_age                    -0.022271
loan_intent_EDUCATION         -0.055692
loan_intent_VENTURE           -0.075804
loan_grade_B                  -0.083839
person_emp_length             -0.091602
person_home_ownership_OWN     -0.098238
person_income                 -0.280196
Name: loan_status, dtype: float64


In [22]:
# Build every engineered feature from the existing columns first, then attach them
# to df in the order listed below.
engineered_features = {
    # Debt-to-income ratio: loan amount relative to the borrower's income.
    # NOTE(review): this looks like an unrounded duplicate of loan_percent_income
    # (e.g. 0.104167 vs 0.10 in the preview) - check for collinearity.
    'DTI': df['loan_amnt'] / df['person_income'],
    # Borrower age relative to the length of their credit history.
    'age_to_cred_hist_ratio': df['person_age'] / df['cb_person_cred_hist_length'],
    # Loan amount relative to borrower age.
    'loan_to_age_ratio': df['loan_amnt'] / df['person_age'],
    # Interaction term capturing the combined effect of income and loan amount.
    'income_loan_interaction': df['person_income'] * df['loan_amnt'],
}

for feature_name, feature_values in engineered_features.items():
    df[feature_name] = feature_values


In [23]:
# Confirm the engineered feature columns were added to the frame.
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y,income_loan_interaction,DTI,age_to_cred_hist_ratio,loan_to_age_ratio
1,21,9600,5.0,1000,11.14,0,0.1,2,0,1,...,0,0,0,0,0,0,9600000,0.104167,10.5,47.619048
2,25,9600,1.0,5500,12.87,1,0.57,3,0,0,...,1,0,0,0,0,0,52800000,0.572917,8.333333,220.0
5,21,9900,2.0,2500,7.14,1,0.25,2,0,1,...,0,0,0,0,0,0,24750000,0.252525,10.5,119.047619
9,21,10000,6.0,1600,14.74,1,0.16,3,0,1,...,0,1,0,0,0,0,16000000,0.16,7.0,76.190476
11,21,10000,2.0,4500,8.63,1,0.45,2,0,1,...,0,0,0,0,0,0,45000000,0.45,10.5,214.285714
