In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [2]:
#Different models initialised
log_clf_1 = LogisticRegression(random_state=0)
log_clf_2 = LogisticRegression(random_state=42)
decision_clf1 = DecisionTreeClassifier(criterion = 'entropy',random_state=0)
decision_clf2 = DecisionTreeClassifier(criterion = 'entropy', random_state=42)

In [3]:
#Creation of list of models
Model_List=[('Logistic Regression 1', log_clf_1),
            ('Logistic Regression 2', log_clf_2),
            ('Decision Tree 1', decision_clf1),
            ('Decision Tree 2', decision_clf2)]

In [4]:
df = pd.read_csv('BankdatanewEnsemble.csv')

In [5]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3,1
1,56,0,1,1,0,45,0,0,2,5,8,1467,1,-1,0,3,1
2,41,9,1,1,0,1270,1,0,2,5,8,1389,1,-1,0,3,1
3,55,7,1,1,0,2476,1,0,2,5,8,579,1,-1,0,3,1
4,54,0,1,2,0,184,0,0,2,5,8,673,2,-1,0,3,1


In [6]:
X = df.drop('deposit',1)
y = df['deposit']

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [8]:
#Naive aggregation classifier - hard voting - PREDICTED VALUES
voting_clf_hard = VotingClassifier(estimators=Model_List, voting='hard')
voting_clf_hard.fit(X_train,y_train)
hard_voting_score = voting_clf_hard.score(X_test,y_test)
print(hard_voting_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7432069274410271


In [9]:
#Naive aggregation classifier - soft voting - PROBABILITY VALUES
voting_clf_soft = VotingClassifier(estimators=Model_List, voting='soft')
voting_clf_soft.fit(X_train,y_train)
soft_voting_score = voting_clf_soft.score(X_test,y_test)
print(soft_voting_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.78501045088086


In [10]:
X = df.drop('deposit',1).copy()
y = df['deposit'].copy()

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [12]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [13]:
#Bagging classifier(BOOTSTRAP) - Repeatation of rows in samples - Resampling with REPLACEMENT
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=100,max_samples=100,random_state=0)
bagging_clf.fit(X_train,y_train)

score_bagging = bagging_clf.score(X_test,y_test)
print(score_bagging)

0.8139743206927441


In [14]:
#Pasting classifier - No repeatation of rows in samples - Resampling without NO REPLACEMENT
pasting_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, max_samples=100, bootstrap=False, random_state=0)
pasting_clf.fit(X_train,y_train)

score_pasting = pasting_clf.score(X_test,y_test)
print(score_pasting)

0.8112869513287548


In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
#Random Forest classifier
rcf = RandomForestClassifier(n_estimators=100, n_jobs=100, min_samples_leaf=100, random_state=0)
rcf.fit(X_train,y_train)

score_rcf = rcf.score(X_test,y_test)
print(score_rcf)

0.8220364287847118


In [17]:
from mlxtend.classifier import StackingClassifier

In [18]:
#Stacking classifier

classifier1 = DecisionTreeClassifier(random_state=0)
classifier2= DecisionTreeClassifier(random_state=1)
classifier3 = DecisionTreeClassifier(random_state=2)
classifier4= DecisionTreeClassifier(random_state=3)

classifier_list=[classifier1,classifier2,classifier3,classifier4]

In [19]:
meta_classifier = LogisticRegression(random_state=0)

In [20]:
sclf = StackingClassifier(classifiers=classifier_list, meta_classifier=meta_classifier)
sclf.fit(X_train,y_train)

sclf_score = sclf.score(X_test,y_test)
print(sclf_score)

0.766497461928934


In [21]:
#Hyperparameter tuning - Grid and Random search

from sklearn.model_selection import GridSearchCV

parameter_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

clf = RandomForestClassifier(random_state=0)

grid_search = GridSearchCV(estimator=clf, param_grid=parameter_grid)
grid_search.fit(X_train, y_train)

score_gs = grid_search.score(X_test, y_test)
print(score_gs)

0.8450283666766198


In [22]:
from sklearn.model_selection import RandomizedSearchCV

parameter_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

clf = RandomForestClassifier(random_state=0)

random_search = RandomizedSearchCV(estimator=clf, param_distributions=parameter_grid, n_iter=20, random_state=0)

random_search.fit(X_train, y_train)
score_rs = random_search.score(X_test, y_test)
print(score_rs)

0.8438339802926247
