In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
cr = pd.read_csv("CreditRisk.csv")

In [3]:
cr.isnull().sum()   #gives all columns null

Loan_ID               0
Gender               24
Married               3
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
cr.Gender= cr.Gender.fillna('Male')
cr.Married= cr.Married.fillna('Yes')
cr.LoanAmount= cr.LoanAmount.fillna(cr.LoanAmount.median())
cr.Dependents= cr.Dependents.fillna(1)
cr.Self_Employed= cr.Self_Employed.fillna('No')
cr.Loan_Amount_Term= cr.Loan_Amount_Term.fillna(cr.Loan_Amount_Term.median())
cr.Credit_History= cr.Credit_History.fillna(0)

In [5]:
cr.isnull().sum()   #gives all columns null

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [6]:
cr = cr.iloc[:, 1:]
cr.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0.0,Graduate,No,5849,0.0,126.0,360.0,1.0,Urban,Y
1,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()   # I am creating object of class first time for running function
cr.Gender = le.fit_transform(cr.Gender)
cr.Married = le.fit_transform(cr.Married)
cr.Self_Employed = le.fit_transform(cr.Self_Employed)
cr.Education = le.fit_transform(cr.Education)
cr.Property_Area = le.fit_transform(cr.Property_Area)
cr.Loan_Status = le.fit_transform(cr.Loan_Status)

In [9]:
cr.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0.0,0,0,5849,0.0,126.0,360.0,1.0,2,1
1,1,1,1.0,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0.0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0.0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0.0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [10]:
from sklearn.model_selection import train_test_split
train , test = train_test_split(cr, test_size = 0.2)
train_x = train.iloc[:, 0:-1]
train_y = train.iloc[:, -1]
test_x = test.iloc[:, 0:-1]
test_y = test.iloc[:, -1]

In [11]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [12]:
dt.fit(train_x, train_y)

DecisionTreeClassifier()

In [13]:
pred = dt.predict(test_x)

In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix(pred, test_y)

array([[ 20,  19],
       [ 30, 128]], dtype=int64)

In [15]:
tab = confusion_matrix(pred, test_y)
tab

array([[ 20,  19],
       [ 30, 128]], dtype=int64)

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, pred)

0.751269035532995

In [17]:
from sklearn.metrics import precision_score
precision_score(test_y, pred)

0.810126582278481

In [18]:
dt.feature_importances_  # higher the score more important is the column

array([0.01013379, 0.02213756, 0.06155659, 0.01278425, 0.02199637,
       0.24167271, 0.12933441, 0.20303667, 0.03421962, 0.24554775,
       0.01758028])

In [19]:
len(dt.feature_importances_)

11

In [20]:
dt.feature_importances_.sum()

1.0000000000000002

In [21]:
feature_importace = pd.DataFrame({'feature': train_x.columns, 'importance': dt.feature_importances_})
feature_importace

Unnamed: 0,feature,importance
0,Gender,0.010134
1,Married,0.022138
2,Dependents,0.061557
3,Education,0.012784
4,Self_Employed,0.021996
5,ApplicantIncome,0.241673
6,CoapplicantIncome,0.129334
7,LoanAmount,0.203037
8,Loan_Amount_Term,0.03422
9,Credit_History,0.245548


In [22]:
feature_importace.sort_values('importance', ascending= False)

Unnamed: 0,feature,importance
9,Credit_History,0.245548
5,ApplicantIncome,0.241673
7,LoanAmount,0.203037
6,CoapplicantIncome,0.129334
2,Dependents,0.061557
8,Loan_Amount_Term,0.03422
1,Married,0.022138
4,Self_Employed,0.021996
10,Property_Area,0.01758
3,Education,0.012784


In [23]:
dt = DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_split=40, class_weight='balance')

In [24]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [25]:
rf.fit(train_x, train_y)

pred = rf.predict(test_x)

from sklearn.metrics import confusion_matrix
confusion_matrix(pred, test_y)

array([[ 22,  12],
       [ 28, 135]], dtype=int64)

In [28]:
 rf.feature_importances_

array([0.02313143, 0.02650014, 0.05206345, 0.02368208, 0.0171181 ,
       0.2199455 , 0.12094577, 0.19767612, 0.04654231, 0.22507345,
       0.04732165])

In [29]:
rf_feature_importace = pd.DataFrame({'feature': train_x.columns, 'importance': rf.feature_importances_})
rf_feature_importace

Unnamed: 0,feature,importance
0,Gender,0.023131
1,Married,0.0265
2,Dependents,0.052063
3,Education,0.023682
4,Self_Employed,0.017118
5,ApplicantIncome,0.219946
6,CoapplicantIncome,0.120946
7,LoanAmount,0.197676
8,Loan_Amount_Term,0.046542
9,Credit_History,0.225073


In [30]:
rf_feature_importace.sort_values('importance', ascending= False)

Unnamed: 0,feature,importance
9,Credit_History,0.225073
5,ApplicantIncome,0.219946
7,LoanAmount,0.197676
6,CoapplicantIncome,0.120946
2,Dependents,0.052063
10,Property_Area,0.047322
8,Loan_Amount_Term,0.046542
1,Married,0.0265
3,Education,0.023682
0,Gender,0.023131


In [None]:
rf = RandomForestClassifier(n_estimators=110, criterion='entropy', max_depth=7, class_weight='balance' )