[Implementation of Voting Classifiers in Scikit-learn and Python - Ensemble Machine Learning Tutorial](https://www.youtube.com/watch?v=ngST76LTQf0)

In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# models classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# scaling
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn import metrics

## Load the datasets

In [2]:
# Relative locations of the training CSV and the test CSV.
train_knight_url = "../../subject/data04/Train_knight.csv"
test_knight_url = "../../subject/data04/Test_knight.csv"

# Read both files, keeping the (train, test) order of the paths.
train_knight_df, test_knight_df = map(
    pd.read_csv, (train_knight_url, test_knight_url)
)

# Show (rows, columns) for each frame as the cell output.
(train_knight_df.shape, test_knight_df.shape)

((398, 31), (171, 30))

## Split into training and test sets

In [3]:
# The "knight" column is the target; every other column is a feature.
y = train_knight_df["knight"]
X = train_knight_df.drop(columns=["knight"])

# Hold out a fraction of the rows for evaluation. Stratifying on the target
# keeps the class proportions similar in both splits, and the fixed seed
# makes the split repeatable.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    random_state=42,
)

# One line per split, identical to four separate print() calls.
print(
    f'X_train shape: {X_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_train shape: {y_train.shape}',
    f'y_test shape: {y_test.shape}',
    sep="\n",
)

X_train shape: (278, 30)
X_test shape: (120, 30)
y_train shape: (278,)
y_test shape: (120,)


## Standardize the features

In [4]:
# Fit the scaler on the training split only, so the scaling statistics
# never see the held-out rows.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the shapes of the scaled arrays as the cell output.
(X_train_scaled.shape, X_test_scaled.shape)

((278, 30), (120, 30))

In [5]:
# Inspect the scaled training features (NumPy abbreviates the repr of large arrays).
X_train_scaled

array([[ 2.24739539e-02,  1.97890769e+00,  2.12920113e-02, ...,
        -1.34505617e-01, -1.06911587e+00,  2.12273207e-03],
       [-1.96971529e-01,  6.35285243e-01, -2.40155079e-01, ...,
        -8.85939520e-01, -4.16187050e-01, -9.68568420e-01],
       [ 6.95440100e-01,  1.04944843e+00,  7.71568514e-01, ...,
         1.71859895e+00,  3.27362002e+00,  7.56845317e-01],
       ...,
       [ 1.95066826e+00, -3.80859328e-01,  1.87015726e+00, ...,
         1.50858043e+00, -3.28455012e-01, -7.36839985e-01],
       [ 1.62919063e-01, -1.19233023e+00,  1.26892790e-01, ...,
        -1.02076288e-01, -4.68488842e-01, -2.96147369e-01],
       [ 9.64626559e-01,  1.56233657e+00,  9.16343776e-01, ...,
         9.32573763e-01, -6.47327228e-01, -2.17931726e-01]],
      shape=(278, 30))

In [6]:
# Inspect the scaled held-out features (NumPy abbreviates the repr of large arrays).
X_test_scaled

array([[-0.48663957, -0.58071715, -0.49776988, ..., -0.41555981,
        -0.44824299, -0.41347083],
       [ 2.08233555,  1.83924801,  2.20654684, ...,  2.73008519,
         2.0487458 ,  2.63985774],
       [-0.72656663, -1.04544677, -0.71919087, ..., -0.6053486 ,
         0.09670794,  0.50352003],
       ...,
       [ 1.20162768,  0.70029923,  1.12924857, ..., -0.33510419,
        -0.86834448, -1.0269383 ],
       [-0.0038595 , -0.45309709,  0.02342106, ...,  0.36892112,
         0.31097658, -0.21034364],
       [ 1.47081414, -0.00522295,  1.38473433, ...,  0.12801753,
        -0.07200751, -1.03686118]], shape=(120, 30))

In [7]:
# Base classifiers; a fixed random_state is passed where the estimator takes one.
clf1 = DecisionTreeClassifier(random_state=42)
clf2 = LogisticRegression(random_state=42, max_iter=1000)
clf3 = KNeighborsClassifier(n_neighbors=5)

# (display name, estimator) pairs, in the format VotingClassifier expects.
classifier_type_name_initial = [
    ('Decision Tree', clf1),
    ('Logistic Regression', clf2),
    ('K-Nearest Neighbors', clf3)
]

# A hard-voting ensemble predicts, for each sample, the class label chosen by
# the majority of its base classifiers (it does not pick a single "best" model).
voting_clf = VotingClassifier(
    estimators=classifier_type_name_initial,
    voting='hard'
)

# Evaluate every base classifier and the ensemble side by side.
classifier_type_name_total = classifier_type_name_initial + [('Voting Classifier', voting_clf)]

# Accuracy on the held-out split, keyed by display name.
classifier_scores = {}

for name, clf in classifier_type_name_total:
    clf.fit(X_train_scaled, y_train)
    # Keep the predictions in a local variable rather than attaching an
    # ad-hoc `predictions` attribute to the estimator object.
    test_predictions = clf.predict(X_test_scaled)
    classifier_scores[name] = metrics.accuracy_score(y_test, test_predictions)

classifier_scores

{'Decision Tree': 0.9,
 'Logistic Regression': 0.9833333333333333,
 'K-Nearest Neighbors': 0.9666666666666667,
 'Voting Classifier': 0.9833333333333333}

In [8]:
# Fit the voting ensemble on the scaled training split once more.
# fit() returns the estimator itself, so rebinding keeps the same object.
voting_clf = voting_clf.fit(X_train_scaled, y_train)

# Scale the test file with the already-fitted scaler, then predict its labels.
test_knight_df_scaled = scaler.transform(test_knight_df)
y_pred = voting_clf.predict(test_knight_df_scaled)

In [9]:
# --- Export the voting-classifier predictions to Voting.txt ---
# The file is created in the current working directory and holds one
# predicted label per line, in the order of y_pred.
filename = os.path.join(os.getcwd(), 'Voting.txt')

with open(filename, "w") as f:
    # f-string formatting handles any label type; `label + "\n"` would raise
    # TypeError if the labels were not strings.
    f.writelines(f"{label}\n" for label in y_pred)