In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib notebook
import joblib
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV

# Function definitions

In [4]:
def display_scores(scores):
    """Print a cross-validation score array with its mean and std deviation.

    ``scores`` must expose ``mean()`` and ``std()`` (e.g. the array returned
    by ``cross_val_score``). Nothing is returned.
    """
    # Each entry is (label, getter); getters are evaluated lazily so that the
    # lines are computed and printed one after another, in this order.
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Std dev:", lambda s: s.std()),
    )
    for label, getter in report:
        print(label, getter(scores))
    
def display_cvgrid_scores(grid_search):
    """Print one ``<mean test score> <params>`` line per grid-search candidate.

    Reads ``grid_search.cv_results_`` and pairs the ``"mean_test_score"``
    entries with the ``"params"`` entries. Nothing is returned.
    """
    results = grid_search.cv_results_
    candidate_scores = results["mean_test_score"]
    candidate_params = results["params"]
    for row in zip(candidate_scores, candidate_params):
        print(*row)

def apply_grid_search(classifier, X_data, y_data, params_grid, n_cvs=3, score='accuracy', display_scores=True):
    """Tune ``classifier`` with a cross-validated grid search.

    Parameters
    ----------
    classifier
        Estimator handed to ``GridSearchCV``.
    X_data, y_data
        Features and labels passed to ``GridSearchCV.fit``.
    params_grid
        Parameter grid (dict or list of dicts) handed to ``GridSearchCV``.
    n_cvs
        Forwarded as ``cv`` (default 3).
    score
        Forwarded as ``scoring`` (default ``'accuracy'``).
    display_scores
        When truthy, print the mean test score of every candidate.
        NOTE: inside this function the flag shadows the module-level
        ``display_scores`` helper; the name is kept so existing keyword
        callers keep working.

    Returns
    -------
    tuple
        ``(best_estimator_, best_score_)`` of the fitted search.
    """
    # Train scores are never reported or returned, so they are not requested
    # (return_train_score is left at its default) to avoid the extra scoring
    # pass on every fold.
    grid_search = GridSearchCV(
        classifier,
        params_grid,
        cv=n_cvs,
        scoring=score,
    )

    grid_search.fit(X_data, y_data)
    if display_scores:
        display_cvgrid_scores(grid_search)
    # best_score_ is the mean CV score of best_estimator_; unlike a raw
    # .max() over mean_test_score it does not turn into NaN when some
    # candidate failed to fit.
    return grid_search.best_estimator_, grid_search.best_score_

# Import train data

In [6]:
# Read the training CSV; column 0 is used as the label and every remaining
# column as a feature.
train_set = pd.read_csv('datasets/mnist_train.csv')
y_train, X_train = (
    train_set.iloc[:, 0].values,
    train_set.iloc[:, 1:].values,
)

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

# Keep a stratified 30% subset of the training data for the slower models
# (test_size=0.7 sends the other 70% to the "test" side of the split).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.7, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_idx, test_idx = next(split.split(train_set, train_set.label))

# split() yields positional indices, so rows are selected with .iloc; .loc
# would only be correct while the frame keeps its default RangeIndex.
strat_train_set = train_set.iloc[train_idx]
strat_test_set = train_set.iloc[test_idx]

# Same layout as the full set: column 0 is the label, the rest are features.
strat_X_train = strat_train_set.iloc[:,1:].values
strat_y_train = strat_train_set.iloc[:,0].values

In [11]:
# Dimension reduction with PCA: load the persisted transformer and project
# both the full training set and the stratified subset through it.
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.
pca = joblib.load('models/pca_transformer.pkl')
X_train_reduced, strat_X_train_reduced = (
    pca.transform(X_train),
    pca.transform(strat_X_train),
)

# Models

In [5]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

## SGD Classifier

In [None]:
# Grid: L2 penalty over a range of alphas, plus elastic-net with several
# l1_ratio values at a single alpha.
params_grid = [
    {
        'penalty': ['l2'],
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.00],
    },
    {
        'penalty': ['elasticnet'],
        'l1_ratio': [0.15, 0.5, 0.85],
        'alpha': [0.01],
    },
]

sgd_clf, sgd_acc = apply_grid_search(
    classifier=SGDClassifier(max_iter=10000, n_jobs=-1, random_state=42),
    X_data=X_train_reduced,
    y_data=y_train,
    params_grid=params_grid,
)

In [None]:
# Fit the selected SGD model on the full reduced training set and persist it.
sgd_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=sgd_clf, filename='models/sgd_clf.pkl')

## Logit Classifier

In [None]:
# Grid over the inverse regularization strength C.
params_grid = [
    {'C': [0.1, 1.0, 10.0]}
]

# The 'sag' solver shuffles the data, so random_state is pinned to make the
# search reproducible, in line with the seeded SGD, SVC and random-forest cells.
logit_clf, logit_acc = apply_grid_search(
    LogisticRegression(solver='sag', max_iter=300, n_jobs=-1, random_state=42),
    X_train_reduced,
    y_train,
    params_grid
)

In [None]:
# Fit the selected logistic-regression model on the full reduced training
# set and persist it.
logit_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=logit_clf, filename='models/logit_clf.pkl')

## KNN Classifier

In [None]:
# Grid over neighbourhood size and the vote-weighting scheme.
params_grid = [
    {
        'n_neighbors': [1, 3, 5],
        'weights': ['uniform', 'distance'],
    },
]

knn_clf, knn_acc = apply_grid_search(
    classifier=KNeighborsClassifier(n_jobs=-1),
    X_data=X_train_reduced,
    y_data=y_train,
    params_grid=params_grid,
)

In [None]:
# Fit the selected KNN model on the full reduced training set and persist it.
knn_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=knn_clf, filename='models/knn_clf.pkl')

## Gaussian Naive-Bayes Classifier

In [None]:
# Grid over var_smoothing: ten log-spaced values from 1e-10 to 1e-1.
params_grid = [
    {
        'var_smoothing': np.logspace(-10, -1, 10),
    },
]

gnb_clf, gnb_acc = apply_grid_search(
    classifier=GaussianNB(),
    X_data=X_train_reduced,
    y_data=y_train,
    params_grid=params_grid,
)

In [None]:
# Fit the selected Gaussian naive-Bayes model on the full reduced training
# set and persist it.
gnb_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=gnb_clf, filename='models/gnb_clf.pkl')

## Decision Tree Classifier

In [None]:
# Grid over the split strategy and the maximum tree depth.
params_grid = [
    {'splitter': ['best'], 'max_depth': [10, 30, 50]},
    {'splitter': ['random'], 'max_depth': [10, 30, 50]}
]

# The tree builder is randomized (most visibly with splitter='random'), so
# random_state is pinned to make the search reproducible, in line with the
# seeded SGD, SVC and random-forest cells.
tree_clf, tree_acc = apply_grid_search(
    DecisionTreeClassifier(random_state=42),
    X_train_reduced,
    y_train,
    params_grid
)

In [None]:
# Fit the selected decision tree on the full reduced training set and
# persist it.
tree_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=tree_clf, filename='models/tree_clf.pkl')

## SVM Classifier

In [None]:
# Grid: RBF kernel and cubic polynomial kernel, each over the same C values.
params_grid = [
    {
        'kernel': ['rbf'],
        'C': [10.0, 100.0, 1000.0],
    },
    {
        'kernel': ['poly'],
        'degree': [3],
        'C': [10.0, 100.0, 1000.0],
    },
]

svc_clf, svc_acc = apply_grid_search(
    classifier=SVC(random_state=42),
    X_data=X_train_reduced,
    y_data=y_train,
    params_grid=params_grid,
)

In [None]:
# Fit the selected SVM on the full reduced training set and persist it.
svc_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=svc_clf, filename='models/svc_clf.pkl')

## Random Forest Classifier

In [None]:
# Grid over the number of features considered at each split.
params_grid = [
    {
        'max_features': ['sqrt', 'log2', 10],
    },
]

rf_clf, rf_acc = apply_grid_search(
    classifier=RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500),
    X_data=X_train_reduced,
    y_data=y_train,
    params_grid=params_grid,
)

In [None]:
# Fit the selected random forest on the full reduced training set and
# persist it.
rf_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=rf_clf, filename='models/rf_clf.pkl')

## Hard-voting Ensemble

In [None]:
# Hard-voting ensemble over six of the individual model families.
# NOTE(review): the hyperparameters are hard-coded here, presumably copied
# from the grid-search results above; confirm they still match.
hve_clf = VotingClassifier(
    estimators=[
        ('sgd', SGDClassifier(alpha=1.0, max_iter=10000, random_state=42)), 
        ('logit', LogisticRegression(C=0.1, max_iter=300, solver='sag')), 
        ('knn', KNeighborsClassifier(n_neighbors=3, weights='distance')),
        ('gnb', GaussianNB(var_smoothing=0.001)),
        ('tree', DecisionTreeClassifier(max_depth=30)),
        ('svc', SVC(C=10.0, random_state=42)),
    ],
    n_jobs=-1,
    voting='hard'
)

# Score the ensemble defined above with 3-fold CV accuracy. The estimator
# passed here must be hve_clf; the previous name `ehv_clf` was never defined
# and raised NameError.
hve_scores = cross_val_score(
    hve_clf,
    X_train_reduced,
    y_train,
    scoring = "accuracy",
    cv=3
)

display_scores(hve_scores)
hve_acc = hve_scores.mean()

In [None]:
# cross_val_score works on clones, so the ensemble itself still has to be
# fitted on the full reduced training set before it is persisted.
hve_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=hve_clf, filename='models/hve_clf.pkl')

## Soft-voting Ensemble

In [None]:
# Soft-voting ensemble: averages predicted class probabilities, so SVC is
# created with probability=True.
sve_clf = VotingClassifier(
    voting='soft',
    n_jobs=-1,
    estimators=[
        ('logit', LogisticRegression(solver='sag', max_iter=300, C=0.1)),
        ('knn', KNeighborsClassifier(weights='distance', n_neighbors=3)),
        ('svc', SVC(probability=True, random_state=42, C=10.0)),
    ],
)

# 3-fold cross-validated accuracy of the ensemble.
sve_scores = cross_val_score(
    estimator=sve_clf,
    X=X_train_reduced,
    y=y_train,
    cv=3,
    scoring="accuracy",
)

display_scores(sve_scores)
sve_acc = sve_scores.mean()

In [None]:
# cross_val_score works on clones, so the ensemble itself still has to be
# fitted on the full reduced training set before it is persisted.
sve_clf.fit(X=X_train_reduced, y=y_train)
joblib.dump(value=sve_clf, filename='models/sve_clf.pkl')

# Show accuracies

In [None]:
# Models, their mean CV accuracies and short display names, in matching order.
# The ensembles are named hve_* / sve_* in the cells that define them; the
# previous ehv_* / esv_* spellings were never defined and raised NameError.
models = [sgd_clf, logit_clf, knn_clf, gnb_clf, tree_clf, svc_clf, rf_clf, hve_clf, sve_clf]
accs = [sgd_acc, logit_acc, knn_acc, gnb_acc, tree_acc, svc_acc, rf_acc, hve_acc, sve_acc]
names = ['SGD', 'Logit', 'KNN', 'GNB', 'Tree', 'SVC', 'RF', 'HVE', 'SVE']

fig, axs = plt.subplots(1,1,figsize=(4,4),constrained_layout=True)

# One bar per model at integer positions; set_xticks takes tick positions,
# and the model names are attached separately as tick labels.
positions = np.arange(len(models))
axs.bar(positions, accs)
axs.set_xticks(positions)
axs.set_xticklabels(names, rotation=45)
axs.set_ylabel('Mean CV accuracy')
axs.set_title('Cross-validated accuracy by model')