In [175]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import graphviz

In [142]:
#Read csv
bankdata=pd.read_csv('bankdata/bank-additional/bank-additional/bank-additional-full.csv',delimiter=';')


In [143]:
#Explore columns
bankdata.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [144]:
#Explore data
bankdata.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [145]:
#Explore column values
array=bankdata['job'].unique()
array=bankdata['month'].unique()
print(array)

['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep']


In [146]:
#Create feature set. convert categorical values into numbers
featureSet=bankdata
#drop these column for now
featureSet=featureSet.drop(['education','month','day_of_week','duration','pdays',],axis=1)

contact_dummy=pd.get_dummies(bankdata['contact'],prefix='contact')
marital_dummy=pd.get_dummies(bankdata['marital'],prefix='marital')
job_dummy=pd.get_dummies(bankdata['job'],prefix='job')
default_dummy=pd.get_dummies(bankdata['default'],prefix='default')
housing_dummy=pd.get_dummies(bankdata['housing'],prefix='housing')
loan_dummy=pd.get_dummies(bankdata['loan'],prefix='loan')
poutcome_dummy=pd.get_dummies(bankdata['poutcome'],prefix='poutcome')
featureSet=pd.concat([featureSet,contact_dummy,marital_dummy,job_dummy,default_dummy,
                      housing_dummy,loan_dummy,poutcome_dummy],axis=1)

featureSet=featureSet.drop(['contact','marital','job','default','housing','loan','poutcome'],axis=1)
featureSet.columns

Index(['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'contact_cellular',
       'contact_telephone', 'marital_divorced', 'marital_married',
       'marital_single', 'marital_unknown', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'default_no', 'default_unknown',
       'default_yes', 'housing_no', 'housing_unknown', 'housing_yes',
       'loan_no', 'loan_unknown', 'loan_yes', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success'],
      dtype='object')

In [147]:
#create train , test data
print('size=',featureSet.shape)
X=featureSet
X=X.drop('y',axis=1)
Y=pd.DataFrame(featureSet['y'])
Y=pd.get_dummies(Y['y'],prefix='target')
Y=Y.drop('target_no',axis=1)
print(X.shape,Y.shape)
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.3, random_state=42)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)



size= (41188, 39)
(41188, 38) (41188, 1)
(28831, 38) (12357, 38) (28831, 1) (12357, 1)


In [190]:
#Decision Tree
dtc=DecisionTreeClassifier(max_depth=2)
dtc.fit(X_train,y_train)
predictedOutput=dtc.predict(X_test)
train_pred_output=dtc.predict(X_train)
print('score=',dtc.score(X_test,y_test))
print('train acc_score=',accuracy_score(y_train,train_pred_output))
print('test acc_score=',accuracy_score(y_test,predictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,predictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,predictedOutput))

# dot_data = export_graphviz(dtc,feature_names=X_train.columns)  
# graph = graphviz.Source(dot_data)  
# graph.render

score= 0.899490167516
train acc_score= 0.899171031182
test acc_score= 0.899490167516
cnt= 11115
total score= 0.8994901675163874
confusion matrix= [[10872    96]
 [ 1146   243]]




<bound method File.render of <graphviz.files.Source object at 0x0000020F2734BAC8>>

In [149]:
#Random Forest
rfc=RandomForestClassifier(max_depth=100)
rfc.fit(X_train,y_train)
rfcPredictedOutput=rfc.predict(X_test)
print('score=',rfc.score(X_test,y_test))
print('acc_score=',accuracy_score(y_test,rfcPredictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,rfcPredictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,rfcPredictedOutput))

  This is separate from the ipykernel package so we can avoid doing imports until


score= 0.888160556769
acc_score= 0.888160556769
cnt= 10975
total score= 0.8881605567694424
confusion matrix= [[10596   372]
 [ 1010   379]]


In [150]:
#Support Vector machine.(Support Vector Classifier)
svc = SVC(gamma='auto')
svc.fit(X_train,y_train)
svcPredictedOutput=svc.predict(X_test)
print('score=',svc.score(X_test,y_test))
print('acc_score=',accuracy_score(y_test,svcPredictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,svcPredictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,svcPredictedOutput))

  y = column_or_1d(y, warn=True)


score= 0.89592943271
acc_score= 0.89592943271
cnt= 11071
total score= 0.8959294327102048
confusion matrix= [[10793   175]
 [ 1111   278]]


In [151]:
#Logistic Regression
lr=LogisticRegression()
lr.fit(X_train,y_train)
lrPredictedOutput=lr.predict(X_test)
print('score=',lr.score(X_test,y_test))
print('acc_score=',accuracy_score(y_test,lrPredictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,lrPredictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,lrPredictedOutput))

  y = column_or_1d(y, warn=True)


score= 0.898276280651
acc_score= 0.898276280651
cnt= 11100
total score= 0.8982762806506434
confusion matrix= [[10838   130]
 [ 1127   262]]


In [152]:
#naive bayes gaussian
gnb = GaussianNB()
gnb.fit(X_train,y_train)
gnbPredictedOutput=gnb.predict(X_test)
print('score=',lr.score(X_test,y_test))
print('acc_score=',accuracy_score(y_test,gnbPredictedOutput))
cnt=0
for a,b in zip (y_test.target_yes,gnbPredictedOutput):
#     print(a,b)
    if a==b:
        cnt=cnt+1
print('cnt=',cnt)
print('total score=',cnt/12357)

print('confusion matrix=',confusion_matrix(y_test,gnbPredictedOutput))

score= 0.898276280651
acc_score= 0.776078336166
cnt= 9590
total score= 0.7760783361657361
confusion matrix= [[8723 2245]
 [ 522  867]]


  y = column_or_1d(y, warn=True)
