In [2]:
### Comparison of classification algorithms. Apply following algorithms on bank loan data set
### 1. Decision Tree
### 2. Random Forest
### 3. Support Vector machine
### 4. Logistic Regrssion
### 5. Naive Bayes

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import graphviz

In [7]:
#Read csv
bankdata=pd.read_csv('bank-additional-full.csv',delimiter=';')


In [8]:
#Explore columns
bankdata.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [9]:
#Explore data
bankdata.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [14]:
#Explore column values
array=bankdata['job'].unique()
array=bankdata['month'].unique()
print(array)


['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep']


In [10]:
#Create feature set. 
featureSet=bankdata
#drop these column for now
featureSet=featureSet.drop(['education','month','day_of_week','duration','pdays',],axis=1)

#convert categorical values into numbers

contact_dummy=pd.get_dummies(bankdata['contact'],prefix='contact')
marital_dummy=pd.get_dummies(bankdata['marital'],prefix='marital')
job_dummy=pd.get_dummies(bankdata['job'],prefix='job')
default_dummy=pd.get_dummies(bankdata['default'],prefix='default')
housing_dummy=pd.get_dummies(bankdata['housing'],prefix='housing')
loan_dummy=pd.get_dummies(bankdata['loan'],prefix='loan')
poutcome_dummy=pd.get_dummies(bankdata['poutcome'],prefix='poutcome')
featureSet=pd.concat([featureSet,contact_dummy,marital_dummy,job_dummy,default_dummy,
                      housing_dummy,loan_dummy,poutcome_dummy],axis=1)

featureSet=featureSet.drop(['contact','marital','job','default','housing','loan','poutcome'],axis=1)
featureSet.columns

Index(['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'contact_cellular',
       'contact_telephone', 'marital_divorced', 'marital_married',
       'marital_single', 'marital_unknown', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'default_no', 'default_unknown',
       'default_yes', 'housing_no', 'housing_unknown', 'housing_yes',
       'loan_no', 'loan_unknown', 'loan_yes', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success'],
      dtype='object')

In [12]:
#create train , test data

X=featureSet
X=X.drop('y',axis=1)
Y=pd.DataFrame(featureSet['y'])
Y=pd.get_dummies(Y['y'],prefix='target')
Y=Y.drop('target_no',axis=1)

X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.3, random_state=42)
print('X_train.shape=',X_train.shape,'X_test.shape=',X_test.shape,'y_train.shape=',y_train.shape,'y_test.shape=',y_test.shape)



X_train.shape= (28831, 38) X_test.shape= (12357, 38) y_train.shape= (28831, 1) y_test.shape= (12357, 1)


In [15]:
#Decision Tree
dtc=DecisionTreeClassifier(max_depth=4)
dtc.fit(X_train,y_train)
predictedOutput=dtc.predict(X_test)
train_pred_output=dtc.predict(X_train)
print('score=',dtc.score(X_test,y_test))
print('train acc_score=',accuracy_score(y_train,train_pred_output))
print('test acc_score=',accuracy_score(y_test,predictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,predictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,predictedOutput))


dict={}
for col,val in zip (X_train.columns,dtc.feature_importances_):
    dict[col]=val
print('feature_importence=',dict)


counts = []
categories = list(y_train.columns.values)
print(categories)
for i in categories:
    counts.append((i, y_train[i].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number_of_comments'])
print(df_stats)
y_train.shape

score= 0.898761835397
train acc_score= 0.899691304499
test acc_score= 0.898761835397
cnt= 11106
total score= 0.898761835396941
confusion matrix= [[10865   103]
 [ 1148   241]]
feature_importence= {'age': 0.0, 'campaign': 0.0062858202190535705, 'previous': 0.0, 'emp.var.rate': 0.0017201446514099666, 'cons.price.idx': 0.0083219493898413818, 'cons.conf.idx': 0.067379307610442918, 'euribor3m': 0.057118004872808097, 'nr.employed': 0.70737599349467695, 'contact_cellular': 0.0, 'contact_telephone': 0.020641225206789751, 'marital_divorced': 0.0, 'marital_married': 0.0, 'marital_single': 0.0, 'marital_unknown': 0.0, 'job_admin.': 0.0, 'job_blue-collar': 0.0, 'job_entrepreneur': 0.0, 'job_housemaid': 0.0, 'job_management': 0.0, 'job_retired': 0.0, 'job_self-employed': 0.0, 'job_services': 0.0, 'job_student': 0.0, 'job_technician': 0.0, 'job_unemployed': 0.0, 'job_unknown': 0.0, 'default_no': 0.0, 'default_unknown': 0.0, 'default_yes': 0.0, 'housing_no': 0.0, 'housing_unknown': 0.0, 'housing_yes'

(28831, 1)

In [16]:
#Random Forest
rfc=RandomForestClassifier(max_depth=100)
rfc.fit(X_train,y_train)
rfcPredictedOutput=rfc.predict(X_test)
print('score=',rfc.score(X_test,y_test))
print('acc_score=',accuracy_score(y_test,rfcPredictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,rfcPredictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,rfcPredictedOutput))

  This is separate from the ipykernel package so we can avoid doing imports until


score= 0.888888888889
acc_score= 0.888888888889
cnt= 10984
total score= 0.8888888888888888
confusion matrix= [[10606   362]
 [ 1011   378]]


In [17]:
#Support Vector machine.(Support Vector Classifier)
svc = SVC(gamma='auto')
svc.fit(X_train,y_train)
svcPredictedOutput=svc.predict(X_test)
print('score=',svc.score(X_test,y_test))
print('acc_score=',accuracy_score(y_test,svcPredictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,svcPredictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,svcPredictedOutput))

  y = column_or_1d(y, warn=True)


score= 0.89592943271
acc_score= 0.89592943271
cnt= 11071
total score= 0.8959294327102048
confusion matrix= [[10793   175]
 [ 1111   278]]


In [18]:
#Logistic Regression
lr=LogisticRegression()
lr.fit(X_train,y_train)
lrPredictedOutput=lr.predict(X_test)
print('score=',lr.score(X_test,y_test))
print('acc_score=',accuracy_score(y_test,lrPredictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,lrPredictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,lrPredictedOutput))

  y = column_or_1d(y, warn=True)


score= 0.898276280651
acc_score= 0.898276280651
cnt= 11100
total score= 0.8982762806506434
confusion matrix= [[10838   130]
 [ 1127   262]]


In [20]:
#naive bayes gaussian
gnb = GaussianNB()
gnb.fit(X_train,y_train)
gnbPredictedOutput=gnb.predict(X_test)
print('score=',gnb.score(X_test,y_test))
print('acc_score=',accuracy_score(y_test,gnbPredictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,gnbPredictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,gnbPredictedOutput))

score= 0.776078336166
acc_score= 0.776078336166
cnt= 9590
total score= 0.7760783361657361
confusion matrix= [[8723 2245]
 [ 522  867]]


  y = column_or_1d(y, warn=True)
