In [1]:
%matplotlib inline

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from sklearn.tree import DecisionTreeClassifier

In [5]:
from sklearn.feature_extraction.text import CountVectorizer  #DT does not take strings as input for the model fit step....

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [8]:
Bank_df=pd.read_excel("Bank_Personal_Loan_Modelling.xlsx",sheetname='Data')

In [9]:
Bank_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,5000.0,2500.5,1443.520003,1.0,1250.75,2500.5,3750.25,5000.0
Age,5000.0,45.3384,11.463166,23.0,35.0,45.0,55.0,67.0
Experience,5000.0,20.1046,11.467954,-3.0,10.0,20.0,30.0,43.0
Income,5000.0,73.7742,46.033729,8.0,39.0,64.0,98.0,224.0
ZIP Code,5000.0,93152.503,2121.852197,9307.0,91911.0,93437.0,94608.0,96651.0
Family,5000.0,2.3964,1.147663,1.0,1.0,2.0,3.0,4.0
CCAvg,5000.0,1.937913,1.747666,0.0,0.7,1.5,2.5,10.0
Education,5000.0,1.881,0.839869,1.0,1.0,2.0,3.0,3.0
Mortgage,5000.0,56.4988,101.713802,0.0,0.0,0.0,101.0,635.0
Personal Loan,5000.0,0.096,0.294621,0.0,0.0,0.0,0.0,1.0


In [10]:
Bank_df.head(10)  

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
5,6,37,13,29,92121,4,0.4,2,155,0,0,0,1,0
6,7,53,27,72,91711,2,1.5,2,0,0,0,0,1,0
7,8,50,24,22,93943,1,0.3,3,0,0,0,0,0,1
8,9,35,10,81,90089,3,0.6,2,104,0,0,0,1,0
9,10,34,9,180,93023,1,8.9,3,0,1,0,0,0,0


In [11]:
Bank_df.shape

(5000, 14)

In [12]:
temp_df= Bank_df
Bank_df['Mortgage_Binary'] = Bank_df['Mortgage'].map(lambda x: 1 if x > 0 else 0)
Bank_df['Mortgage_Binary'].value_counts()
Bank_df=Bank_df.drop(['ID', 'ZIP Code','Mortgage'], axis=1)
Bank_df['Personal Loan']=Bank_df['Personal Loan'].astype('category')
Bank_df['Education']=Bank_df['Education'].astype('category')
Bank_df['Securities Account']=Bank_df['Securities Account'].astype('category')
Bank_df['CD Account']=Bank_df['CD Account'].astype('category')
Bank_df['Online']=Bank_df['Online'].astype('category')
Bank_df['CreditCard']=Bank_df['CreditCard'].astype('category')
Bank_df['Mortgage_Binary']=Bank_df['Mortgage_Binary'].astype('category')
Bank_df.dtypes

Age                      int64
Experience               int64
Income                   int64
Family                   int64
CCAvg                  float64
Education             category
Personal Loan         category
Securities Account    category
CD Account            category
Online                category
CreditCard            category
Mortgage_Binary       category
dtype: object

In [13]:
# splitting data into training and test set for independent attributes

#X = Bank_df[['Income','Family','CCAvg','Education','Securities Account','CD Account','Online','Mortgage_Binary']]
#y = Bank_df['Personal Loan']
train_set, test_set = train_test_split( Bank_df,test_size=0.30, random_state=42)

train_labels = train_set.pop("Personal Loan")
test_labels = test_set.pop("Personal Loan")
#train_labels

In [14]:

                                  
dt_model = DecisionTreeClassifier(criterion = 'entropy' )

In [15]:
dt_model.fit(train_set, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [16]:
dt_model.score(test_set , test_labels)

0.98733333333333329

In [17]:
test_pred = dt_model.predict(test_set)

In [18]:
from IPython.display import Image  
#import pydotplus as pydot
from sklearn import tree
from os import system
train_char_label = ['No', 'Yes']
PersonalLoanFile = open('PersonalLoan.dot','w')
dot_data = tree.export_graphviz(dt_model, out_file=PersonalLoanFile, feature_names = list(train_set), class_names = list(train_char_label))

PersonalLoanFile.close()


# importance of features in the tree building ( The importance of a feature is computed as the 
#(normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )

print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = train_set.columns))


                         Imp
Age                 0.020385
Experience          0.028055
Income              0.509143
Family              0.126492
CCAvg               0.115045
Education           0.187495
Securities Account  0.004358
CD Account          0.007951
Online              0.001076
CreditCard          0.000000
Mortgage_Binary     0.000000


In [19]:
print(classification_report(test_labels,test_pred))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1343
          1       0.94      0.94      0.94       157

avg / total       0.99      0.99      0.99      1500



In [20]:
print(confusion_matrix(test_labels,test_pred))

[[1334    9]
 [  10  147]]


Regularising the Decision Tree

In [21]:
dt_model = DecisionTreeClassifier(criterion = 'entropy', class_weight={0:.5,1:.5}, max_depth = 5, min_samples_leaf=5 )
dt_model.fit(train_set, train_labels)

DecisionTreeClassifier(class_weight={0: 0.5, 1: 0.5}, criterion='entropy',
            max_depth=5, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [22]:
personalloanlreg = open('personalloanregularized.dot','w')
dot_data = tree.export_graphviz(dt_model, out_file= personalloanlreg , feature_names = list(train_set), class_names = list(train_char_label))

personalloanlreg.close()

print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = train_set.columns))


                         Imp
Age                 0.000000
Experience          0.000000
Income              0.558399
Family              0.123010
CCAvg               0.105190
Education           0.204568
Securities Account  0.000000
CD Account          0.008834
Online              0.000000
CreditCard          0.000000
Mortgage_Binary     0.000000


In [23]:
test_pred = dt_model.predict(test_set)

In [24]:
dt_model.score(test_set , test_labels)

0.98799999999999999

In [25]:
print(classification_report(test_labels,test_pred))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1343
          1       0.95      0.93      0.94       157

avg / total       0.99      0.99      0.99      1500



In [26]:

print(confusion_matrix(test_labels,test_pred))

[[1336    7]
 [  11  146]]
