In [1]:
import pandas as pd

# Read the dataset, treating '?' as a missing value and using the
# 'Sample Code Number' column as the row index; rows that contain any
# missing value are discarded right away.
df = pd.read_csv(
    'data/data.csv',
    index_col='Sample Code Number',
    na_values='?',
).dropna()

# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Define Classifiers

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Base estimators that are later combined into a voting ensemble.
clf_LR = LogisticRegression()
# Tree construction is randomized (features are permuted at each split),
# so pin the seed to make the fitted tree -- and every score derived from
# it -- identical across kernel restarts.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_SVM_RBF = SVC(kernel='rbf')

In [3]:
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score

# 10-fold cross-validation of a hard-voting ensemble (LR + DT + SVM).
# random_state pins the shuffle so the fold assignment -- and therefore the
# per-fold accuracies -- are reproducible on a fresh kernel.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
acc_per_fold = []  # one accuracy value per fold, in fold order

for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, hence .iloc rather than .loc.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Build a fresh ensemble for every fold. fit() trains clones of the
    # base estimators, so nothing learned on one fold leaks into the next.
    clf_ensemble = VotingClassifier(
        estimators=[('LR', clf_LR), ('DT', clf_DT), ('SVM', clf_SVM_RBF)],
        voting='hard',  # majority vote over predicted class labels
    )

    clf_ensemble.fit(X_train, y_train)
    y_pred = clf_ensemble.predict(X_test)
    acc_per_fold.append(accuracy_score(y_test, y_pred))


In [4]:
# Print a header line, then leave the list of per-fold accuracies as the
# cell's last expression so the notebook displays it as the cell output.
print('Accuracy of ensemble classifier per each fold')
acc_per_fold

Accuracy of ensemble classifier per each fold


[0.927536231884058,
 0.9565217391304348,
 0.9565217391304348,
 0.9705882352941176,
 0.9558823529411765,
 0.9852941176470589,
 1.0,
 0.9852941176470589,
 0.9852941176470589,
 0.9705882352941176]

In [5]:
import numpy as np

# Report the mean of the per-fold accuracies, formatted to four decimals.
print('Average accuracy of ensemble classifier : %.4f' % np.asarray(acc_per_fold).mean())

Average accuracy of ensemble classifier : 0.9694
