In [None]:
# import packages

import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn import svm
from sklearn import neighbors
from imblearn.pipeline import Pipeline
import torch
from torch import nn, optim
from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring 
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import pandas as pd
import multiprocessing
import seaborn as sns

In [None]:
# set random seed
# One seed is shared by NumPy and PyTorch so that repeated runs are comparable;
# every model in this notebook is trained on the CPU.
device = 'cpu'
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
# define neural network class, code adapted from https://github.com/jlm429/pyperch/blob/master/pyperch/neural/backprop_nn.py

class BackpropModule(nn.Module):
    """
    Fully connected feed-forward network for classification.

    The layer stack is: one ``input_dim -> hidden_units`` linear layer,
    ``hidden_layers`` linear layers of ``hidden_units -> hidden_units`` and one
    ``hidden_units -> output_dim`` linear layer.  Every layer except the last
    is followed by ``activation``; dropout is applied once, right after the
    first activation.
    """

    def __init__(self, input_dim, output_dim, hidden_units=10, hidden_layers=1,
                 dropout_percent=0, activation=None, output_activation=None):
        """

        Initialize the neural network.

        PARAMETERS:

        input_dim {int}:
            Number of features/dimension of the input.  Must be greater than 0.

        output_dim {int}:
            Number of classes/output dimension of the model. Must be greater than 0.

        hidden_units {int}:
            Number of units in the input layer and in every hidden layer.

        hidden_layers {int}:
            Number of hidden_units -> hidden_units layers inserted between the
            input layer and the output layer.  0 is valid and leaves only the
            input and output layers.

        dropout_percent {float}:
            Probability of an element to be zeroed.

        activation {torch.nn.modules.activation or None}:
            Activation function applied after the input layer and after every
            hidden layer.  None (default) creates a fresh nn.ReLU().

        output_activation {torch.nn.modules.activation or None}:
            Output activation.  None (default) creates a fresh
            nn.Softmax(dim=-1), so the forward pass returns probabilities.
            NOTE(review): nn.CrossEntropyLoss is documented to expect
            unnormalized logits; when training with that criterion consider
            passing nn.Identity() here -- confirm against the intended setup.

        """
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_units = hidden_units
        self.hidden_layers = hidden_layers
        self.dropout = nn.Dropout(dropout_percent)
        # Default activations are built per instance instead of in the
        # signature, so separate networks never share one module object.
        self.activation = nn.ReLU() if activation is None else activation
        self.output_activation = (
            nn.Softmax(dim=-1) if output_activation is None else output_activation
        )
        self.layers = nn.ModuleList()
        self.device = "cpu"

        # input layer
        self.layers.append(nn.Linear(self.input_dim, self.hidden_units, device=self.device))
        # hidden layers
        for _ in range(self.hidden_layers):
            self.layers.append(nn.Linear(self.hidden_units, self.hidden_units, device=self.device))
        # output layer
        self.layers.append(nn.Linear(self.hidden_units, self.output_dim, device=self.device))

    def forward(self, X, **kwargs):
        """
        Recipe for the forward pass.

        PARAMETERS:

        X {torch.tensor}:
            NN input data. Shape (batch_size, input_dim).

        RETURNS:

        X {torch.tensor}:
            NN output data. Shape (batch_size, output_dim).
        """
        # input layer -> activation -> dropout (the only place dropout is applied)
        X = self.dropout(self.activation(self.layers[0](X)))
        # hidden layers: everything between the first and the last layer
        for hidden in self.layers[1:-1]:
            X = self.activation(hidden(X))
        # output layer followed by the output activation
        return self.output_activation(self.layers[-1](X))

In [None]:
# Load wine dataset

# fetch dataset: every column except the last is a feature, the last one is the label
wine = pd.read_csv('.//datasets//winequality//WineQT.csv')
X_wine, y_wine = wine.iloc[:, :-1], wine.iloc[:, -1]

# clamp the labels into [4, 7] (values outside are folded into the boundary
# classes), then shift them so that the smallest label is 0
y_wine = y_wine.clip(lower=4, upper=7).pipe(lambda labels: labels - labels.min())

# stratified 80/20 train/test split
(X_wine_train, X_wine_test,
 y_wine_train, y_wine_test) = model_selection.train_test_split(
    X_wine, y_wine, test_size=0.2, random_state=seed, stratify=y_wine)

# standardise the features with statistics estimated on the training split only
scaler_wine = preprocessing.StandardScaler()
X_wine_train = scaler_wine.fit_transform(X_wine_train)
X_wine_test = scaler_wine.transform(X_wine_test)

# float32 features / int64 labels for the models below
X_train, X_test = (part.astype(np.float32) for part in (X_wine_train, X_wine_test))
y_train, y_test = (part.astype(np.int64) for part in (y_wine_train, y_wine_test))

# scaled features of the full dataset plus the label, in one frame
wine_y = pd.concat([X_wine, y_wine], axis=1)
wine_y.iloc[:, :-1] = scaler_wine.transform(X_wine)

In [None]:
# label histograms of the train and test splits, one bin per class seen in y_train
plt.figure()
for labels in (y_train, y_test):
    plt.hist(labels, bins=len(np.unique(y_train)))

col_names = X_wine.columns

# one KDE panel per feature, one curve per 'quality' value
fig, axes = plt.subplots(3, 4, figsize=(16, 10.5))
axes = axes.flatten()
for i, col in enumerate(col_names):
    panel = axes[i]
    sns.kdeplot(data=wine_y, x=col, hue='quality', ax=panel, common_norm=False)
    panel.set_title(f'{col} distribution')
    panel.set_xlabel(None)
# blank out the last panel of the 3x4 grid
axes[11].axis('off')
plt.tight_layout()
plt.rcParams.update({'font.size': 12})
# plt.savefig('wine_feature.png', dpi=500)

In [None]:
# initialize neural net for wine dataset and perform gridsearch

num_features = X_train.shape[1]
num_classes = len(np.unique(y_train))

# Base estimator; the grid below overrides lr, architecture, dropout,
# weight decay and momentum for every candidate.
NN = NeuralNetClassifier(
    module=BackpropModule,
    module__input_dim=num_features,
    module__output_dim=num_classes,
    module__hidden_units=128,
    module__hidden_layers=2,
    module__dropout_percent=0,
    max_epochs=500,
    verbose=0,
    callbacks=[EpochScoring(scoring='accuracy', name='train_acc', on_train=True),],
    criterion=nn.CrossEntropyLoss,
    optimizer=optim.SGD,
    optimizer__weight_decay=0,
    optimizer__momentum=0,
    lr=0.05,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    device=device
)

# GridSearchCV already provides the validation folds, so skorch's own internal
# train/validation split is switched off here.
NN.set_params(train_split=False, verbose=0)

# values held constant across the whole grid
default_params = {
    'module__input_dim': [num_features],
    'module__output_dim': [num_classes],
    'max_epochs': [500]
}

# 3 * 5 * 3 * 4 * 4 * 2 = 1440 candidates, each evaluated on 5 folds
params = {
    'lr': [0.001, 0.01, 0.1],
    'module__hidden_units': [16, 32, 64, 128, 256],
    'module__hidden_layers': [1, 2, 3],
    'module__dropout_percent': [0, 0.1, 0.2, 0.3],
    'optimizer__weight_decay': [0, 1e-4, 1e-3, 1e-2],
    'optimizer__momentum': [0, 0.9], 
    **default_params,
}

folds = StratifiedKFold(n_splits=5)
gs_NN = GridSearchCV(NN, params, refit=False, cv=folds, scoring='accuracy', verbose=3, n_jobs=12)

# X_train / y_train are the only training arrays this notebook defines; fitting
# on any other name fails with NameError on a fresh kernel.
gs_NN.fit(X_train, y_train)

# persist the full results table so later cells can read it without re-running the search
df = pd.DataFrame(gs_NN.cv_results_)
df.to_excel('NN_wine_temp.xlsx')

In [None]:
# Extract best params for NN
# `best_params_` is the parameter combination that the grid search ranked
# first on its scoring metric.

NN_best_param = gs_NN.best_params_
print(NN_best_param)

In [None]:
# Baseline estimator for the grid search below; the grid overrides the
# architecture, optimiser and learning-rate settings given here.
# NOTE(review): unlike the first grid search, train_split is not disabled
# here -- confirm whether skorch's internal validation split is wanted.
NN_best = NeuralNetClassifier(
    module=BackpropModule,
    criterion=nn.CrossEntropyLoss,
    optimizer=optim.SGD,
    device=device,
    verbose=0,
    max_epochs=500,
    lr=0.05,
    # network shape
    module__input_dim=num_features,
    module__output_dim=num_classes,
    module__hidden_units=128,
    module__hidden_layers=2,
    module__dropout_percent=0,
    # plain SGD: no weight decay, no momentum
    optimizer__weight_decay=0,
    optimizer__momentum=0,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    callbacks=[EpochScoring(scoring='accuracy', name='train_acc', on_train=True)],
)

# Vary architecture to understand its effects:
# everything except width and depth is pinned to a single value
default_params = dict([
    ('module__input_dim', [num_features]),
    ('module__output_dim', [num_classes]),
    ('max_epochs', [500]),
    ('module__dropout_percent', [0]),
    ('optimizer__weight_decay', [0]),
    ('optimizer__momentum', [0.9]),
    ('lr', [0.001]),
])

params = {
    'module__hidden_units': [8, 16, 32, 64, 128, 256, 512],
    'module__hidden_layers': [0, 1, 2, 3],
}
params.update(default_params)

NN_best_arch = GridSearchCV(
    NN_best,
    params,
    scoring='accuracy',
    cv=folds,
    refit=False,
    n_jobs=12,
    verbose=3,
)

NN_best_arch.fit(X_train, y_train)

# keep the results table on disk for the plotting cell
df = pd.DataFrame(NN_best_arch.cv_results_)
df.to_excel('NN_wine_arch.xlsx')

In [None]:
# Vary lr to understand its effects:
# one full training run per learning rate; the per-epoch accuracy curves are
# collected under the keys '<lr>_train' and '<lr>_valid'

NN_lr_results = {}

for lr_range in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]:
    NN_lr = NeuralNetClassifier(
        module=BackpropModule,
        criterion=nn.CrossEntropyLoss,
        optimizer=optim.SGD,
        lr=lr_range,
        max_epochs=500,
        verbose=0,
        module__input_dim=num_features,
        module__output_dim=num_classes,
        module__hidden_units=128,
        module__hidden_layers=1,
        module__dropout_percent=0,
        optimizer__weight_decay=0.0001,
        optimizer__momentum=0.9,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
        callbacks=[EpochScoring(scoring='accuracy', name='train_acc', on_train=True)],
    )

    NN_lr.fit(X_train, y_train)

    history = NN_lr.history
    for split in ('train', 'valid'):
        NN_lr_results[f'{lr_range}_{split}'] = history[:, f'{split}_acc']

df = pd.DataFrame(NN_lr_results)
df.to_excel('NN_wine_lr.xlsx')

In [None]:
# Vary dropout and regularization to understand its effects:
# architecture, momentum and learning rate are pinned to a single value each
default_params = dict([
    ('module__input_dim', [num_features]),
    ('module__output_dim', [num_classes]),
    ('max_epochs', [500]),
    ('optimizer__momentum', [0.9]),
    ('lr', [0.1]),
    ('module__hidden_units', [128]),
    ('module__hidden_layers', [1]),
])

# 6 dropout rates x 9 weight-decay values
params = {
    'module__dropout_percent': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'optimizer__weight_decay': [1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1],
}
params.update(default_params)

NN_best_reg = GridSearchCV(
    NN_best,
    params,
    scoring='accuracy',
    cv=folds,
    refit=False,
    n_jobs=12,
    verbose=3,
)

NN_best_reg.fit(X_train, y_train)

# keep the results table on disk for the plotting cell
df = pd.DataFrame(NN_best_reg.cv_results_)
df.to_excel('NN_wine_reg.xlsx')

In [None]:
# Hyperparam plots

# result tables written by the architecture and regularisation grid searches
NN_arch_data = pd.read_excel('NN_wine_arch.xlsx')
NN_reg_data = pd.read_excel('NN_wine_reg.xlsx')

# x-axis values and curve families of the two grid-search panels
units = [8, 16, 32, 64, 128, 256, 512]
layers = [1, 2, 3]
decay = [1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2]
dps = [0, 0.1, 0.2, 0.3, 0.4, 0.5]

fig, ax = plt.subplots(2, 2, figsize=(12, 10.5))
ax = ax.flatten()

# panel 0: validation accuracy against layer width, one curve per depth
for layer in layers:
    depth_rows = NN_arch_data['param_module__hidden_layers'] == layer
    val_scores = NN_arch_data.loc[depth_rows, ['mean_test_score']].values
    ax[0].plot(units, val_scores, label=f'layers = {layer}')
ax[0].set_xlabel('Number of units per layer')
ax[0].set_ylabel('Validation accuracy')
ax[0].set_xscale('log', base=2)
ax[0].set_xlim([4, 1024])
ax[0].set_ylim([0.38, 0.62])
ax[0].legend()
plt.rcParams.update({'font.size': 14})
# print the tick values as plain numbers on the log-2 axis
ax[0].xaxis.set_major_formatter(ScalarFormatter())

# panel 1: validation accuracy against weight decay, one curve per dropout rate
for dp in dps:
    dropout_rows = NN_reg_data['param_module__dropout_percent'] == dp
    val_scores = NN_reg_data.loc[dropout_rows, ['mean_test_score']].values
    # `decay` is one entry shorter than the score column, so the last score is dropped
    ax[1].plot(decay, val_scores[:-1], label=f'dp = {dp}')
ax[1].set_xlabel('Weight decay')
ax[1].set_ylabel('Validation accuracy')
ax[1].set_xscale('log', base=10)
ax[1].set_ylim([0.38, 0.62])
ax[1].legend()
plt.rcParams.update({'font.size': 14})

NN_lr_data = pd.read_excel('NN_wine_lr.xlsx')

lrs = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]
# panels 2 and 3: accuracy per epoch for every learning rate.  Column 0 of the
# sheet is the saved index, so the train/valid columns of the i-th learning
# rate sit at positions 2*i+1 and 2*i+2.
for panel, offset, ylabel in ((ax[2], 1, 'Training accuracy'),
                              (ax[3], 2, 'Validation accuracy')):
    for i, lr in enumerate(lrs):
        panel.plot(NN_lr_data.iloc[:300, 2*i+offset], label=f'lr = {lr}')
    panel.set_xlabel('Iteration')
    panel.set_ylabel(ylabel)
    panel.set_ylim([0.05, 0.78])
    panel.legend()

plt.tight_layout()

# plt.savefig('NN_hyperparams.png', dpi=500)

In [None]:
# Final NN results

num_features = X_train.shape[1]
num_classes = len(np.unique(y_train))

# final configuration: 128 units, 1 hidden layer, SGD with momentum 0.9 and
# weight decay 1e-4, learning rate 0.05
NN_best = NeuralNetClassifier(
    module=BackpropModule,
    module__input_dim=num_features,
    module__output_dim=num_classes,
    module__hidden_units=128,
    module__hidden_layers=1,
    module__dropout_percent=0,
    max_epochs=500,
    verbose=0,
    callbacks=[EpochScoring(scoring='accuracy', name='train_acc', on_train=True),],
    criterion=nn.CrossEntropyLoss,
    optimizer=optim.SGD,
    optimizer__weight_decay=0.0001,
    optimizer__momentum=0.9,
    lr=0.05,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

NN_best.fit(X_train, y_train)

# Dedicated figure for the two learning curves.  Without it `ax` would refer to
# the axes of whichever figure an earlier cell created last, and the curves
# would be drawn on top of that figure.
fig, ax = plt.subplots(1, 2, figsize=(12, 5.25))

# plot the iterative learning curve (accuracy)
ax[0].plot(NN_best.history[:, 'train_acc'], label='Train Acc', color='cornflowerblue')
ax[0].plot(NN_best.history[:, 'valid_acc'], label='Validation Acc', color='chartreuse')
ax[0].set_xlabel("Iteration")
ax[0].set_ylabel("Accuracy")
ax[0].legend()

# accuracy as a function of the training-set size, cross-validated on `folds`
train_sizes, train_scores, val_scores = learning_curve(NN_best, X_train, y_train, train_sizes=[0.025, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 0.75, 1], cv=folds)

# mean and spread over the folds (axis 1) for every training size
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

# shaded bands: +/- one standard deviation around the mean curves
ax[1].fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='green')
ax[1].fill_between(train_sizes, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.1, color='darkorchid')
ax[1].plot(train_sizes, train_scores_mean, label="Training score", color='cyan')
ax[1].plot(train_sizes, val_scores_mean, label="Validation score", color='darkorchid')
ax[1].set_xlabel("Training size")
ax[1].set_ylabel("Accuracy")
ax[1].legend()

plt.tight_layout()

# plt.savefig('NN_final_trains.png', dpi=500)

In [None]:
# test accuracy
y_test_prob = NN_best.predict_proba(X_test)
y_test_pred = np.argmax(y_test_prob, axis=1)
NN_test_acc = metrics.accuracy_score(y_test, y_test_pred)
NN_test_recall = metrics.recall_score(y_test, y_test_pred, average='macro')
NN_test_f1 = metrics.f1_score(y_test, y_test_pred, average='macro')
print('Neural network on wine dataset, test accuracy is: {:.3f}'.format(NN_test_acc))
print('Neural network on wine dataset, test recall is: {:.3f}'.format(NN_test_recall))
print('Neural network on wine dataset, test f1 score is: {:.3f}'.format(NN_test_f1))

# confusion matrix
cm = metrics.confusion_matrix(y_test, y_test_pred)
cm_plot = metrics.ConfusionMatrixDisplay(cm)
cm_plot.plot()
# cm_plot.figure_.savefig('NN_conf_mat.png', dpi=500)

%timeit NN_best.fit(X_train, y_train)
%timeit NN_best.predict(X_test)

In [None]:
# Perform grid search with SVM

# explicit iteration cap on the solver
SVM = svm.SVC(max_iter=1_000_000_000)

# 5 C values x 3 kernels x 3 degrees x 5 gamma values
params = dict(
    C=[0.01, 0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf'],
    degree=[2, 3, 4],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
)

folds = StratifiedKFold(n_splits=5)
gs_SVM = GridSearchCV(
    SVM,
    params,
    scoring='accuracy',
    cv=folds,
    refit=False,
    n_jobs=12,
    verbose=3,
)

gs_SVM.fit(X_train, y_train)

# keep the results table on disk for the analysis cells
df = pd.DataFrame(gs_SVM.cv_results_)
df.to_excel('SVM_wine_GS.xlsx')

In [None]:
# Extract best params for SVM
# `best_params_` is the parameter combination that the grid search ranked
# first on its scoring metric.

SVM_best_param = gs_SVM.best_params_
print(SVM_best_param)

In [None]:
# hyperparam effects

SVM_GS_data = pd.read_excel('SVM_wine_GS.xlsx')
SVM_GS_kernel = SVM_GS_data.loc[(SVM_GS_data['param_C'] == 10) & (SVM_GS_data['param_gamma'] == 0.1), ['param_degree', 'param_kernel', 'mean_test_score']]
kernels = ['linear', 'poly_2', 'poly_3', 'poly_4', 'rbf']
scores = []
scores.append(SVM_GS_kernel.loc[SVM_GS_kernel['param_kernel'] == 'linear', ['mean_test_score']].values[0].item())
for i in [2, 3, 4]:
    scores.append(SVM_GS_kernel.loc[(SVM_GS_kernel['param_kernel'] == 'poly') & (SVM_GS_kernel['param_degree'] == i), ['mean_test_score']].values[0].item())
scores.append(SVM_GS_kernel.loc[SVM_GS_kernel['param_kernel'] == 'rbf', ['mean_test_score']].values[0].item())

SVM_time = svm.SVC(C=10, gamma=0.1, kernel='linear')
%timeit SVM_time.fit(X_train, y_train)
SVM_time = svm.SVC(C=10, gamma=0.1, kernel='poly', degree=2)
%timeit SVM_time.fit(X_train, y_train)
SVM_time = svm.SVC(C=10, gamma=0.1, kernel='poly', degree=3)
%timeit SVM_time.fit(X_train, y_train)
SVM_time = svm.SVC(C=10, gamma=0.1, kernel='poly', degree=4)
%timeit SVM_time.fit(X_train, y_train)
SVM_time = svm.SVC(C=10, gamma=0.1, kernel='rbf')
%timeit SVM_time.fit(X_train, y_train)
fit_times = [98.6, 50.5, 44.3, 37.8, 31.3]

SVM_rbf = svm.SVC(kernel='rbf', gamma=0.01, max_iter=1000000000)
params = {'C': [1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]}
SVM_C = GridSearchCV(SVM_rbf, params, refit=False, cv=folds, scoring='accuracy', verbose=3, n_jobs=12)
SVM_C.fit(X_train, y_train)
df = pd.DataFrame(SVM_C.cv_results_)
df.to_excel('SVM_wine_C.xlsx')

In [None]:
# results of the C sweep; fit times are converted from seconds to milliseconds
SVM_C_data = pd.read_excel('SVM_wine_C.xlsx')

Cs = SVM_C_data['param_C']
C_scores = SVM_C_data['mean_test_score']
C_times = SVM_C_data['mean_fit_time'] * 1000

fig, ax = plt.subplots(2, 2, figsize=(12, 10.5))
ax = ax.flatten()
# panel 0: validation accuracy per kernel
ax[0].bar(kernels, scores, label=kernels)
ax[0].set_xlabel('Kernel type')
ax[0].set_ylabel('Validation accuracy')
ax[0].set_ylim([0, 0.7])
plt.rcParams.update({'font.size': 14})

# panel 1: fit runtime per kernel
ax[1].bar(kernels, fit_times, label=kernels, color='orange')
ax[1].set_xlabel('Kernel type')
ax[1].set_ylabel('Model fit runtime (ms)')

# panel 2: accuracy (left axis) and fit runtime (right axis) against C
ln1 = ax[2].plot(Cs, C_scores, label='Accuracy')
ax[2].set_xlabel('C')
ax[2].set_ylabel('Validation accuracy')
ax[2].set_xscale('log', base=10)
ax[2].set_ylim([0.25, 1.05])
ax2 = ax[2].twinx()
ln2 = ax2.plot(Cs, C_times, color='orange', label='Runtime')
ax2.set_ylabel('Model fit runtime (ms)')
# one combined legend for the lines of both y-axes
lns = ln1 + ln2
labs = [l.get_label() for l in lns]
ax2.legend(lns, labs, loc=0)

SVM_best = svm.SVC(kernel='rbf', gamma=0.01, C=200)

# panel 3: accuracy as a function of the training-set size, cross-validated on `folds`
train_sizes, train_scores, val_scores = learning_curve(SVM_best, X_train, y_train, train_sizes=np.linspace(0.05, 1, 20), cv=folds)

# mean and spread over the folds (axis 1) for every training size
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

ax[3].fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='green')
ax[3].fill_between(train_sizes, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.1, color='darkorchid')
ax[3].plot(train_sizes, train_scores_mean, label="Training score", color='cyan')
ax[3].plot(train_sizes, val_scores_mean, label="Validation score", color='darkorchid')
ax[3].set_xlabel("Training size")
ax[3].set_ylabel("Score")
ax[3].legend()

plt.tight_layout()

# plt.savefig('SVM_results.png', dpi=500)

In [None]:
# test accuracy
SVM_best.fit(X_train, y_train)
y_test_pred = SVM_best.predict(X_test)

SVM_test_acc = metrics.accuracy_score(y_test, y_test_pred)
SVM_test_recall = metrics.recall_score(y_test, y_test_pred, average='macro')
SVM_test_f1 = metrics.f1_score(y_test, y_test_pred, average='macro')
print('SVM on wine dataset, test accuracy is: {:.3f}'.format(SVM_test_acc))
print('SVM on wine dataset, test recall is: {:.3f}'.format(SVM_test_recall))
print('SVM on wine dataset, test f1 score is: {:.3f}'.format(SVM_test_f1))


# confusion matrix
cm = metrics.confusion_matrix(y_test, y_test_pred)
cm_plot = metrics.ConfusionMatrixDisplay(cm)
cm_plot.plot()
# cm_plot.figure_.savefig('SVM_conf_mat.png', dpi=500)

%timeit SVM_best.fit(X_train, y_train)
%timeit SVM_best.predict(X_test)

In [None]:
# Perform grid search with kNN

kNN = neighbors.KNeighborsClassifier()

# k = 1..100 for each of 3 distance metrics and 2 weighting schemes
params = dict(
    n_neighbors=list(range(1, 101)),
    metric=['euclidean', 'manhattan', 'cosine'],
    weights=['uniform', 'distance'],
)

folds = StratifiedKFold(n_splits=5)
gs_kNN = GridSearchCV(
    kNN,
    params,
    scoring='accuracy',
    cv=folds,
    refit=False,
    n_jobs=12,
    verbose=3,
)

gs_kNN.fit(X_train, y_train)

# keep the results table on disk for the analysis cell
df = pd.DataFrame(gs_kNN.cv_results_)
df.to_excel('kNN_wine_GS.xlsx')

In [None]:
# hyperparam effects

kNN_GS_data = pd.read_excel('kNN_wine_GS.xlsx')
k = np.linspace(1, 100, 100)

fig, ax = plt.subplots(2, 2, figsize=(12, 10.5))
ax = ax.flatten()

# panels 0 and 1: validation accuracy against k for each distance metric,
# first with uniform and then with distance weighting
metric_col = kNN_GS_data['param_metric']
for panel, weighting in zip(ax[:2], ('uniform', 'distance')):
    weighting_rows = kNN_GS_data['param_weights'] == weighting
    euc = kNN_GS_data.loc[weighting_rows & (metric_col == 'euclidean'), ['mean_test_score']].values
    man = kNN_GS_data.loc[weighting_rows & (metric_col == 'manhattan'), ['mean_test_score']].values
    cos = kNN_GS_data.loc[weighting_rows & (metric_col == 'cosine'), ['mean_test_score']].values

    panel.plot(k, euc, label="euclidean")
    panel.plot(k, man, label="manhattan")
    panel.plot(k, cos, label="cosine", color='red')
    panel.set_xlabel("k")
    panel.set_ylabel("Validation accuracy")
    panel.set_ylim([0.5, 0.7])
    panel.set_yticks([0.55, 0.6, 0.65, 0.7])
    panel.legend()

# selected model: k=48, manhattan distance; `weights` is left at its default
kNN_best = neighbors.KNeighborsClassifier(n_neighbors=48, metric='manhattan')

# panel 2: accuracy as a function of the training-set size, cross-validated on `folds`
train_sizes, train_scores, val_scores = learning_curve(
    kNN_best, X_train, y_train,
    train_sizes=np.linspace(0.1, 1, 10), cv=folds, scoring='accuracy')

# mean and spread over the folds (axis 1) for every training size
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_scores_mean, val_scores_std = val_scores.mean(axis=1), val_scores.std(axis=1)

ax[2].fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='green')
ax[2].fill_between(train_sizes, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.1, color='darkorchid')
ax[2].plot(train_sizes, train_scores_mean, label="Training score", color='cyan')
ax[2].plot(train_sizes, val_scores_mean, label="Validation score", color='darkorchid')
ax[2].set_xlabel("Training size")
ax[2].set_ylabel("Score")
ax[2].legend()

# the fourth panel of the 2x2 grid is unused
ax[3].axis('off')

plt.tight_layout()

# plt.savefig('kNN_results.png', dpi=500)

In [None]:
# test accuracy
kNN_best.fit(X_train, y_train)
y_test_pred = kNN_best.predict(X_test)
kNN_test_acc = metrics.accuracy_score(y_test, y_test_pred)
kNN_test_recall = metrics.recall_score(y_test, y_test_pred, average='macro')
kNN_test_f1 = metrics.f1_score(y_test, y_test_pred, average='macro')
print('kNN on wine dataset, test accuracy is: {:.3f}'.format(kNN_test_acc))
print('kNN on wine dataset, test recall is: {:.3f}'.format(kNN_test_recall))
print('kNN on wine dataset, test f1 score is: {:.3f}'.format(kNN_test_f1))

# confusion matrix
cm = metrics.confusion_matrix(y_test, y_test_pred)
cm_plot = metrics.ConfusionMatrixDisplay(cm)
cm_plot.plot()
# cm_plot.figure_.savefig('kNN_conf_mat.png', dpi=500)

%timeit kNN_best.fit(X_train, y_train)
%timeit kNN_best.predict(X_test)