# Support Vector Machine 
HOML pp 151/221 <br>
Default multiclass strategy: OvO (one-vs-one) for `SVC`; note that `LinearSVC` uses OvR (one-vs-rest) instead.

In [None]:
from sklearn.svm import LinearSVC, SVC

In [None]:
# load basic libraries
import pandas as pd
import numpy as np
import mne
from pathlib import Path
import pickle
import time

from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score

%matplotlib widget
import matplotlib
import matplotlib.pyplot as plt

# set directories
# NOTE(review): the working directory is a hard-coded local drive, so the
# relative paths below only resolve on a machine with the same layout.
# %cd D:/Programy/Anaconda3/Projects/EEG ML project # working directory
%cd D:
pkls = './Pickles/' # objects & variables; relative path prefix used when opening the .pkl files

In [None]:
# Output display mode. The value set here, "last_expr", displays only the last
# expression of each cell; switch it to "all" to see the output of every expression.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

In [None]:
# load split sets, eeg only
# NOTE: pickle.load can execute arbitrary code; only open files produced by this project.
with open(pkls + 'xy_train.pkl', 'rb') as handle:
    # the file holds two consecutive pickles, read back in the order they were written
    x_train = pickle.load(handle)
    y_train = pickle.load(handle)
# Show both shapes in a single final expression: with "last_expr" interactivity
# a bare `y_train.shape` in the middle of the cell is silently discarded.
y_train.shape, x_train.shape

In [None]:
# subset: keep only the first 500 rows of the features and of the labels
x_train, y_train = x_train.head(500), y_train.head(500)

### Grid search 

#### Linear 

In [None]:
cv = 5
# LinearSVC does not support penalty='l1' together with loss='hinge', and 'l1'
# additionally requires the primal formulation (dual=False). A single flat grid
# with loss='hinge' therefore makes every 'l1' candidate fail to fit, so the
# search space is split into the two valid penalty/loss/dual combinations.
param_grid = [{'C': [0.1, 1, 10], 'penalty': ['l2'], 'loss': ['hinge'], 'dual': [True]},
              {'C': [0.1, 1, 10], 'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False]}]

# both metrics are recorded for every candidate; 'accuracy' selects the refit model
scorers = {'accuracy': make_scorer(accuracy_score),
           'f1'      : make_scorer(f1_score, average = 'weighted')}
# random_state fixes the solver's internal data shuffling so repeated runs agree
grid = GridSearchCV(LinearSVC(max_iter = 2000, random_state = 42), param_grid, refit='accuracy', verbose = 12,
                    scoring = scorers, return_train_score=True, cv = cv, n_jobs = -1)

grid.fit(x_train, y_train.values.ravel())
best_svclin = grid.best_estimator_

In [None]:
# evaluate: one-row summary of the best candidate's cross-validated scores
# (mean and two standard deviations for accuracy and weighted F1), rounded to 4 decimals
best_svclin_res = pd.DataFrame(
    index = ['svc_lin'],
    data = {column: [factor * grid.cv_results_[key][grid.best_index_]]
            for column, key, factor in (('acc mean', 'mean_test_accuracy', 1),
                                        ('acc 2sd', 'std_test_accuracy', 2),
                                        ('f1 mean', 'mean_test_f1', 1),
                                        ('f1 2sd', 'std_test_f1', 2))},
).round(4)
best_svclin_res

In [None]:
# save model
# Two consecutive pickles go into one file: the fitted estimator first, then its
# CV summary table. The path is built from `pkls`, the same prefix the load cell uses.
with open(pkls + 'best_svclin.pkl', 'wb') as handle:
    pickle.dump(best_svclin, handle)
    pickle.dump(best_svclin_res, handle)

In [None]:
# load model: restore the estimator and then its CV summary, in the order they were dumped
with (Path(pkls) / 'best_svclin.pkl').open('rb') as handle:
    best_svclin, best_svclin_res = pickle.load(handle), pickle.load(handle)

#### RBF

In [None]:
cv = 5
param_grid = {'C': [0.1, 1, 10],
              'gamma': [1, 0.1, 0.01]}

# both metrics are recorded for every candidate; 'accuracy' selects the refit model
scorers = {'accuracy': make_scorer(accuracy_score),
           'f1'      : make_scorer(f1_score, average = 'weighted')}
# With probability=True, SVC shuffles the data randomly for its probability
# estimates; fixing random_state keeps repeated runs reproducible.
grid = GridSearchCV(SVC(probability = True, kernel = 'rbf', random_state = 42), param_grid, refit='accuracy', verbose = 12,
                    scoring = scorers, return_train_score=True, cv = cv, n_jobs = -1)

grid.fit(x_train, y_train.values.ravel())
best_svc = grid.best_estimator_

In [None]:
# evaluate: one-row summary of the best candidate's cross-validated scores
# (mean and two standard deviations for accuracy and weighted F1), rounded to 4 decimals
best_svc_res = pd.DataFrame(
    index = ['svc_rbf'],
    data = {column: [factor * grid.cv_results_[key][grid.best_index_]]
            for column, key, factor in (('acc mean', 'mean_test_accuracy', 1),
                                        ('acc 2sd', 'std_test_accuracy', 2),
                                        ('f1 mean', 'mean_test_f1', 1),
                                        ('f1 2sd', 'std_test_f1', 2))},
).round(4)
best_svc_res

In [None]:
# save model
# Two consecutive pickles go into one file: the fitted estimator first, then its
# CV summary table. The path is built from `pkls`, the same prefix the load cell uses.
with open(pkls + 'best_svc.pkl', 'wb') as handle:
    pickle.dump(best_svc, handle)
    pickle.dump(best_svc_res, handle)

In [None]:
# load model: restore the estimator and then its CV summary, in the order they were dumped
with (Path(pkls) / 'best_svc.pkl').open('rb') as handle:
    best_svc, best_svc_res = pickle.load(handle), pickle.load(handle)

### Model inspection

In [None]:
# test prediction for some observation
obsno = 100
someobs = x_train.values[obsno]

# Gather all three values in one final expression: with "last_expr"
# interactivity only the last expression of a cell is displayed, so separate
# bare expressions for the actual label and the prediction would be discarded.
{'actual': y_train.values[obsno],
 'predicted': best_svc.predict([someobs]),
 'decision_function': best_svc.decision_function([someobs])}  # score, 1 for binary, more for multiclass

In [None]:
# permutation importance of each feature for the tuned RBF SVC
from sklearn.inspection import permutation_importance

n_repeats = 2
feaimpo = permutation_importance(
    best_svc, x_train, y_train,
    n_repeats = n_repeats, random_state = 42, n_jobs = -1,
)
# one row per feature, one column per repeat
feaimpo.importances

In [None]:
# save the permutation-importance result; the path is built from `pkls`,
# the same prefix the load cell uses
with open(pkls + 'best_svc_importances.pkl', 'wb') as handle:
    pickle.dump(feaimpo, handle)

In [None]:
# reload the stored permutation-importance result
with (Path(pkls) / 'best_svc_importances.pkl').open('rb') as handle:
    feaimpo = pickle.load(handle)

In [None]:
# plot feature permutation importances 
%matplotlib widget
result = feaimpo

fig, ax = plt.subplots(figsize = (8, 13), dpi = 100) # width, height
sorted_idx = result.importances_mean.argsort()
ax.boxplot(result.importances[sorted_idx].T, vert=False, 
           labels=x_train.columns[sorted_idx])
ax.set_title("Permutation Importances - SVC")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()

In [None]:
from scipy.stats import spearmanr
from scipy.cluster import hierarchy

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
# Spearman rank correlation between every pair of feature columns
corr = spearmanr(x_train).correlation
# NOTE(review): ward() is given the square correlation matrix itself, so SciPy
# treats each row as an observation vector rather than as pairwise distances -
# confirm this is intended before relying on the resulting cluster distances.
corr_linkage = hierarchy.ward(corr)
# Labels are passed as a plain list rather than a pandas Index: a list is
# accepted by every SciPy version, whereas array-like label objects can fail
# where `labels` is evaluated for truthiness.
dendro = hierarchy.dendrogram(corr_linkage, labels=x_train.columns.tolist(), ax=ax1,
                              leaf_rotation=90)
dendro_idx = np.arange(0, len(dendro['ivl']))

# correlation heatmap with rows and columns reordered to the dendrogram leaf order
ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(dendro['ivl'], rotation='vertical')
ax2.set_yticklabels(dendro['ivl'])
fig.tight_layout()
plt.show()

Next, we manually pick a threshold by visual inspection of the dendrogram to group our features into clusters, choose one feature from each cluster to keep, select those features from our dataset, and refit the SVC on the reduced feature set. We then check how much the accuracy of the refitted SVC changes compared to the SVC trained on the complete dataset.

In [None]:
from collections import defaultdict

# Cut the hierarchy at this linkage distance; features merged below it share a cluster.
threshold = 1
cluster_ids = hierarchy.fcluster(corr_linkage, t = threshold, criterion='distance')
# map each cluster id to the positional indices of the features assigned to it
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)
# keep the first feature of every cluster as its representative
selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]

x_train_np = x_train.to_numpy()
x_train_sel = x_train_np[:, selected_features]

# NOTE: this refits best_svc in place on the reduced feature set. Afterwards the
# model no longer accepts the full x_train, so cells that use best_svc with all
# features must not be re-run without reloading the pickled model first.
best_svc.fit(x_train_sel, y_train.values.ravel())

# The score is computed on the same data the model was just fitted on, so it is
# a training accuracy rather than a held-out estimate; the label says so.
print("Training accuracy with features removed: {:.2f}".format(
    best_svc.score(x_train_sel, y_train.values.ravel())))

In [None]:
# permutation importance 2: same procedure, applied to the reduced feature matrix
from sklearn.inspection import permutation_importance

n_repeats = 2
feaimpo2 = permutation_importance(
    best_svc, x_train_sel, y_train,
    n_repeats = n_repeats, random_state = 42, n_jobs = -1,
)

In [None]:
# plot feature permutation importances 
%matplotlib widget
result = feaimpo2

fig, ax = plt.subplots(figsize = (8, 13), dpi = 100) # width, height
sorted_idx = result.importances_mean.argsort()
ax.boxplot(result.importances[sorted_idx].T, vert=False) 
#            ,labels=x_train_sel.columns[sorted_idx])
ax.set_title("Permutation Importances - SVC")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()