Mojority voting on ensemble of multiple classifiers:
    we can combine different classifiers such as Logistic regression, decision trees Random forest, etc, to classify, and finally a Voting Classifier can be used to see the majority voting of all the classifiers and make the decision.
    Here I have also applied GridSearch algo to find the best parameters for the classifiers.
    Pipeline module from sklearn learn is used to form the ensemble.
    we can also perform Normalization of the data if needed.
    

# Mojority voting on ensemble of multiple classifiers: 

In [1]:
#create an artificial data:
from sklearn.datasets import make_classification

x,y=make_classification(n_samples=2000, n_features=25, n_informative=20, n_redundant=5, n_classes=2, random_state=1)


In [7]:
import pandas as pd

In [8]:
df=pd.DataFrame(x)

In [12]:
y1=pd.Series(y)

In [13]:
y1.value_counts()

1    1004
0     996
dtype: int64

In [14]:
#the dataset is balanced and there does not need any handling

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
#Hyperparameter tuning of Logistic Regression 

log_class=LogisticRegression()
grid={'C':10.0**np.arange(-2,3),
     'penalty': ['l1', 'l2']}
cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

clf=GridSearchCV(log_class, grid, cv=cv, n_jobs=-1, scoring='f1_macro')
clf.fit(x,y)
clf.best_params_

In [49]:
#Hyperparameter tuning of SVC

svm=SVC()
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

clf=GridSearchCV(svm, param_grid, cv=cv, n_jobs=-1, scoring='f1_macro')
clf.fit(x,y)
clf.best_params_

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

In [56]:
# Hyperparameter tuning of Random Forest C

rf=RandomForestClassifier()
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               
               'min_samples_split': min_samples_split,
               
               
              'oob_score': [True, False]}
cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

clf=GridSearchCV(rf, random_grid, cv=cv, n_jobs=-1, scoring='f1_macro')
clf.fit(x,y)
clf.best_params_

{'min_samples_split': 5, 'n_estimators': 800, 'oob_score': True}

In [59]:
models=list()
dt=Pipeline([('m', DecisionTreeClassifier())])
models.append(('Decision', dt))
rf=Pipeline([('m', RandomForestClassifier(min_samples_split= 5, n_estimators= 800, oob_score= True))])
models.append(('RandomForest', rf))
svm=Pipeline([('m', SVC(C= 100, gamma= 0.001, kernel= 'rbf'))])
models.append(('SVC', svm))
lr=Pipeline([('m', LogisticRegression(C=0.01, penalty='l2'))])
models.append(('LR', lr))
ensemble=VotingClassifier(estimators=models, voting='hard')

In [60]:
models

[('Decision', Pipeline(steps=[('m', DecisionTreeClassifier())])),
 ('RandomForest',
  Pipeline(steps=[('m',
                   RandomForestClassifier(min_samples_split=5, n_estimators=800,
                                          oob_score=True))])),
 ('SVC', Pipeline(steps=[('m', SVC(C=100, gamma=0.001))])),
 ('LR', Pipeline(steps=[('m', LogisticRegression(C=0.01))]))]

In [61]:
ensemble

In [57]:
#performance without using grid search:

import time

start=time.time()
cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
n_scores=cross_val_score(ensemble, x, y ,scoring='accuracy', cv=cv, n_jobs=-1)
end=time.time()
time1=end-start

print(time1)

print(n_scores)
print("average accuracy")
print(np.mean(n_scores))
#average accuracy
print("Minimum accuracy:" + str(np.min(n_scores)))

print("Maximum accuracy:" + str(np.max(n_scores)))


2.1043152809143066
[0.9025 0.885  0.9075 0.9    0.93   0.8875 0.9125 0.9175 0.9125 0.9125
 0.92   0.875  0.9125 0.9125 0.905 ]
average accuracy
0.9061666666666665
Minimum accuracy:0.875
Maximum accuracy:0.93


In [62]:
#performance with using grid search:

import time

start=time.time()
cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
n_scores=cross_val_score(ensemble, x, y ,scoring='accuracy', cv=cv, n_jobs=-1)
end=time.time()
time1=end-start

print(time1)

print(n_scores)
print("average accuracy")
print(np.mean(n_scores))
#average accuracy
print("Minimum accuracy:" + str(np.min(n_scores)))

print("Maximum accuracy:" + str(np.max(n_scores)))

13.637564897537231
[0.9125 0.8925 0.9075 0.8975 0.94   0.89   0.93   0.9275 0.9075 0.915
 0.93   0.89   0.9175 0.93   0.91  ]
average accuracy
0.9131666666666668
Minimum accuracy:0.89
Maximum accuracy:0.94
