In [1]:
# To enable plotting graphs in Jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# calculate accuracy measures and confusion matrix
from sklearn import metrics




In [2]:
#Load the file from local directory using pd.read_csv which is a special form of read_table
#while reading the data, supply the "colnames" list

bank_df = pd.read_csv("bank-full.csv")


In [3]:
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
bank_df.groupby(["Target"]).count()  # count the instances of each class in the data to check if data is skewed towards a class

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
no,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922
yes,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289


In [5]:
bank_df.shape

(45211, 17)

In [6]:
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
Target       45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [7]:
bank_df.describe(include='all')

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,,,unknown,no
freq,,9732,27214,23202,44396,,25130,37967,29285,,13766,,,,,36959,39922
mean,40.93621,,,,,1362.272058,,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
std,10.618762,,,,,3044.765829,,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
min,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
25%,33.0,,,,,72.0,,,,8.0,,103.0,1.0,-1.0,0.0,,
50%,39.0,,,,,448.0,,,,16.0,,180.0,2.0,-1.0,0.0,,
75%,48.0,,,,,1428.0,,,,21.0,,319.0,3.0,-1.0,0.0,,


In [8]:
bank_df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'Target'],
      dtype='object')

In [9]:
for feature in bank_df.columns: # Loop through all columns in the dataframe
    if bank_df[feature].dtype == 'object': # Only apply for columns with categorical strings
        bank_df[feature] = pd.Categorical(bank_df[feature]).codes # Replace strings with an integer

In [10]:
bank_df.groupby("Target").count()
bank_df.head(10)

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922,39922
1,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289


In [11]:
X = bank_df.drop("Target" , axis=1)
y = bank_df["Target"]   # select all rows and the 17 th column which is the classification "Yes", "No"
test_size = 0.30 # taking 70:30 training and test set
seed = 7  # Random numbmer seeding for reapeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [12]:
#instantiating decision tree as the default model

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [13]:
# Is the model an overfit model? 
y_pred = dt_model.predict(X_test)
print(dt_model.score(X_train, y_train))
print(dt_model.score(X_test , y_test))

1.0
0.8763639044529637


In [14]:
# Note: - Decision Tree is a non-parametric algorithm and hence prone to overfitting easily. This is evident from the difference
# in scores in training and testing

# In ensemble techniques, we want multiple instances (each different from the other) and each instance to be overfit!!!  
# hopefully, the different instances will do different mistakes in classification and when we club them, their
# errors will get cancelled out giving us the benefit of lower bias and lower overall variance errors.



In [15]:
# Let us look at the class level scores for the overfit model - 

from sklearn.metrics import confusion_matrix

print("Confusion Matrix:\n")


print(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]), index=['true:no', 'true:yes'], columns=['pred:no', 'pred:yes']))

Confusion Matrix:

          pred:no  pred:yes
true:no     11156       874
true:yes      803       731


In [16]:
print(metrics.classification_report(y_test, y_pred))  # precision and recall for class =1  is poor which is expected given the 
                                                      # small sample size for that class

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     12030
           1       0.46      0.48      0.47      1534

    accuracy                           0.88     13564
   macro avg       0.69      0.70      0.70     13564
weighted avg       0.88      0.88      0.88     13564



In [17]:
# The misclassifications happen only in the overlap region i.e. the part of the features space where data from the two classes
# overlap and in this case that region is likely to be dominated by data points from the calss 0 hence class 1 is misclassified

# Let us use random forest which gives us ensemble instances which are very dissimilar. Hope the instances will do different
# mistakes in the classifications which can be cancelled overall using voting. Hope the majority do not do the same missclassification
# which is likely when the region is dominated by one class


In [18]:
rfcl = RandomForestClassifier(random_state=1)

enclf = VotingClassifier(estimators = [('rf', rfcl)], voting = 'hard')

In [19]:
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_estimators = 50)
rfcl = rfcl.fit(X_train, y_train)

y_pred = rfcl.predict(X_test)

print(rfcl.score(X_train, y_train))
print(rfcl.score(X_test , y_test))


0.9997472114260435
0.9072544971984665


In [20]:
from sklearn.metrics import confusion_matrix

print("Confusion Matrix:\n")


print(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]), index=['true:no', 'true:yes'], columns=['pred:no', 'pred:yes']))

Confusion Matrix:

          pred:no  pred:yes
true:no     11649       381
true:yes      877       657


In [21]:
print(metrics.classification_report(y_test, y_pred))  # precision and recall for class =1  is poor which is expected given the 
                                                      # small sample size for that class

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     12030
           1       0.63      0.43      0.51      1534

    accuracy                           0.91     13564
   macro avg       0.78      0.70      0.73     13564
weighted avg       0.90      0.91      0.90     13564



In [22]:
# As we feared, the random forest has helped the larger class and worsened the case for underrepresented class!

In [23]:
# Let us try the gradient boosting method  


from sklearn.ensemble import AdaBoostClassifier
abcl = AdaBoostClassifier(n_estimators=50)

abcl = abcl.fit(X_train, y_train)

y_pred = abcl.predict(X_test)

print(abcl.score(X_train, y_train))
print(abcl.score(X_test , y_test))



0.8965146775365753
0.9009879091713359


In [24]:

print("Confusion Matrix:\n")


print(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]), index=['true:no', 'true:yes'], columns=['pred:no', 'pred:yes']))

Confusion Matrix:

          pred:no  pred:yes
true:no     11673       357
true:yes      986       548


In [25]:
print(metrics.classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.92      0.97      0.95     12030
           1       0.61      0.36      0.45      1534

    accuracy                           0.90     13564
   macro avg       0.76      0.66      0.70     13564
weighted avg       0.89      0.90      0.89     13564



In [26]:
# Precision and recall both have gone down for the class label 1

In [27]:
from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.12, max_depth=5)
gbcl = gbcl.fit(X_train, y_train)

In [28]:
y_pred = gbcl.predict(X_test)

print(gbcl.score(X_train, y_train))
print(gbcl.score(X_test , y_test))


0.9200240149145259
0.9080654674137423


In [29]:
print("Confusion Matrix:\n")


print(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]), index=['true:no', 'true:yes'], columns=['pred:no', 'pred:yes']))

Confusion Matrix:

          pred:no  pred:yes
true:no     11653       377
true:yes      870       664


In [30]:
print(metrics.classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     12030
           1       0.64      0.43      0.52      1534

    accuracy                           0.91     13564
   macro avg       0.78      0.70      0.73     13564
weighted avg       0.90      0.91      0.90     13564



In [31]:
# Graident boost has given the best results on test data.... 

# Let us try SVC

In [32]:
from sklearn.svm import SVC

# Building a Support Vector Machine on train data
svc_model = SVC(C= 1, kernel='rbf', gamma= 1)
svc_model.fit(X_train, y_train)

y_pred = svc_model.predict(X_test)


In [33]:
print(metrics.classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     12030
           1       0.00      0.00      0.00      1534

    accuracy                           0.89     13564
   macro avg       0.44      0.50      0.47     13564
weighted avg       0.79      0.89      0.83     13564



  'precision', 'predicted', average, warn_for)


In [34]:
print("Confusion Matrix:\n")


print(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]), index=['true:no', 'true:yes'], columns=['pred:no', 'pred:yes']))

Confusion Matrix:

          pred:no  pred:yes
true:no     12030         0
true:yes     1534         0
