# Other Methods

This notebook trains and evaluates classical (non-deep-learning) machine learning classifiers on fixed-length segments of the CAUEEG signals.

-----

## Configurations

In [None]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
# `%autoreload 2` re-imports edited modules before each cell execution, so changes to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Move the working directory up one level.
# NOTE(review): presumably so the `datasets` package and the relative data path
# resolve from the project root — confirm.
# Gotcha: this is not idempotent; re-running the cell moves up another directory.
%cd ..

In [None]:
# Load some packages
import os
import glob
import json
import pprint
import time

import numpy as np
import random
from torch.utils.data import DataLoader
from torchvision import transforms

# custom package
from datasets.caueeg_dataset import *
from datasets.caueeg_script import *
from datasets.pipeline import *

In [None]:
# Data file path
# Relative location of the curated dataset directory; it is passed as `dataset_path`
# to `load_caueeg_task_datasets` and is resolved against the current working
# directory (which the `%cd ..` in the configuration cell changes).
data_path = r'local/dataset/02_Curated_Data_220419/'

In [None]:
# Load the dataset configuration together with the train / validation / test
# splits for 'task2'. Event annotations are not loaded, signals are read from
# the memmap files, and no transform is applied to the samples.
config_data, train_dataset, val_dataset, test_dataset = load_caueeg_task_datasets(
    dataset_path=data_path,
    task='task2',
    load_event=False,
    file_format='memmap',
    transform=None,
)

In [None]:
# Cut every training recording into non-overlapping windows of SEGMENT_LEN
# samples and stack all windows into one array. Each window inherits the class
# label of the recording it came from; a trailing partial window is discarded.
SEGMENT_LEN = 2000  # samples per window (taken along axis 1 of each signal)
NUM_ROWS = 21       # axis-0 length of each signal — presumably the EEG channel count

# Pass 1: count the complete windows of each recording so that the output
# arrays can be allocated once instead of being grown incrementally.
num_dist = np.zeros((len(train_dataset)), dtype=np.int32)
for i, data in enumerate(train_dataset):
    num_dist[i] = data['signal'].shape[1] // SEGMENT_LEN

total_num = int(np.sum(num_dist))
# NOTE(review): int32 storage truncates fractional values — confirm the memmap
# signals are integer-valued.
X_train = np.zeros((total_num, NUM_ROWS, SEGMENT_LEN), dtype=np.int32)
y_train = np.zeros((total_num), dtype=np.int32)

# Pass 2: copy the windows of each recording into its slot of the output.
current = 0
for i, data in enumerate(train_dataset):
    signal = np.asarray(data['signal'])
    n_seg = int(num_dist[i])

    # Trim to a whole number of windows, then expose the window index as the
    # leading axis: (rows, n_seg * L) -> (rows, n_seg, L) -> (n_seg, rows, L).
    # Unlike stacking a list of pieces, this also yields a correctly shaped
    # empty array when a recording is shorter than one window (n_seg == 0),
    # so such recordings are skipped instead of raising a broadcast error.
    windows = (
        signal[:, :n_seg * SEGMENT_LEN]
        .reshape(signal.shape[0], n_seg, SEGMENT_LEN)
        .transpose(1, 0, 2)
    )

    X_train[current:current + n_seg] = windows
    y_train[current:current + n_seg] = data['class_label']
    current += n_seg

print(X_train.shape)
print(y_train.shape)

In [None]:
# Cut every validation recording into non-overlapping windows of SEGMENT_LEN
# samples and stack all windows into one array. Each window inherits the class
# label of the recording it came from; a trailing partial window is discarded.
SEGMENT_LEN = 2000  # samples per window (taken along axis 1 of each signal)
NUM_ROWS = 21       # axis-0 length of each signal — presumably the EEG channel count

# Pass 1: count the complete windows of each recording so that the output
# arrays can be allocated once instead of being grown incrementally.
num_dist = np.zeros((len(val_dataset)), dtype=np.int32)
for i, data in enumerate(val_dataset):
    num_dist[i] = data['signal'].shape[1] // SEGMENT_LEN

total_num = int(np.sum(num_dist))
# NOTE(review): int32 storage truncates fractional values — confirm the memmap
# signals are integer-valued.
X_val = np.zeros((total_num, NUM_ROWS, SEGMENT_LEN), dtype=np.int32)
y_val = np.zeros((total_num), dtype=np.int32)

# Pass 2: copy the windows of each recording into its slot of the output.
current = 0
for i, data in enumerate(val_dataset):
    signal = np.asarray(data['signal'])
    n_seg = int(num_dist[i])

    # Trim to a whole number of windows, then expose the window index as the
    # leading axis: (rows, n_seg * L) -> (rows, n_seg, L) -> (n_seg, rows, L).
    # Unlike stacking a list of pieces, this also yields a correctly shaped
    # empty array when a recording is shorter than one window (n_seg == 0),
    # so such recordings are skipped instead of raising a broadcast error.
    windows = (
        signal[:, :n_seg * SEGMENT_LEN]
        .reshape(signal.shape[0], n_seg, SEGMENT_LEN)
        .transpose(1, 0, 2)
    )

    X_val[current:current + n_seg] = windows
    y_val[current:current + n_seg] = data['class_label']
    current += n_seg

print(X_val.shape)
print(y_val.shape)

In [None]:
# Cut every test recording into non-overlapping windows of SEGMENT_LEN samples
# and stack all windows into one array. Each window inherits the class label of
# the recording it came from; a trailing partial window is discarded.
SEGMENT_LEN = 2000  # samples per window (taken along axis 1 of each signal)
NUM_ROWS = 21       # axis-0 length of each signal — presumably the EEG channel count

# Pass 1: count the complete windows of each recording so that the output
# arrays can be allocated once instead of being grown incrementally.
num_dist = np.zeros((len(test_dataset)), dtype=np.int32)
for i, data in enumerate(test_dataset):
    num_dist[i] = data['signal'].shape[1] // SEGMENT_LEN

total_num = int(np.sum(num_dist))
# NOTE(review): int32 storage truncates fractional values — confirm the memmap
# signals are integer-valued.
X_test = np.zeros((total_num, NUM_ROWS, SEGMENT_LEN), dtype=np.int32)
y_test = np.zeros((total_num), dtype=np.int32)

# Pass 2: copy the windows of each recording into its slot of the output.
current = 0
for i, data in enumerate(test_dataset):
    signal = np.asarray(data['signal'])
    n_seg = int(num_dist[i])

    # Trim to a whole number of windows, then expose the window index as the
    # leading axis: (rows, n_seg * L) -> (rows, n_seg, L) -> (n_seg, rows, L).
    # Unlike stacking a list of pieces, this also yields a correctly shaped
    # empty array when a recording is shorter than one window (n_seg == 0),
    # so such recordings are skipped instead of raising a broadcast error.
    windows = (
        signal[:, :n_seg * SEGMENT_LEN]
        .reshape(signal.shape[0], n_seg, SEGMENT_LEN)
        .transpose(1, 0, 2)
    )

    X_test[current:current + n_seg] = windows
    y_test[current:current + n_seg] = data['class_label']
    current += n_seg

print(X_test.shape)
print(y_test.shape)

In [None]:
# Collapse every sample to a single flat feature vector (one row per sample),
# which is the 2-D layout the scikit-learn estimators below are fitted on.
X_train, X_val, X_test = [
    samples.reshape(len(samples), -1) for samples in (X_train, X_val, X_test)
]

for samples in (X_train, X_val, X_test):
    print(samples.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature using statistics estimated on the training split only,
# then apply that same transformation to all three splits.
# Gotcha: the X_* arrays are overwritten in place, so this cell should be run once
# per kernel session — re-running it re-fits the scaler on already-scaled data.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
# The scaled test features must be bound back to `X_test`: binding them to a
# misspelled name leaves `X_test` unscaled, and every classifier would then be
# evaluated on raw features while having been trained on standardized ones.
X_test = scaler.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report, accuracy_score
import pickle
import sys

# Fixed seed for the stochastic estimators so that repeated runs of this cell
# report the same scores.
RANDOM_STATE = 0

# Candidate models; each one is fitted on the training split and then scored on
# the training, validation and test splits. The commented-out entries are
# currently disabled candidates.
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=7),
    SVC(kernel="linear", max_iter=1000),
    SVC(kernel="rbf", max_iter=1000),
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=3000, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=3000, max_depth=20, random_state=RANDOM_STATE),
    # MLPClassifier(),
    # MLPClassifier(alpha=1e-3, max_iter=10000),
    # GradientBoostingClassifier(),
    # AdaBoostClassifier(),
    # GaussianNB(),
    # QuadraticDiscriminantAnalysis(),
]

for clf in classifiers:
    print(clf)
    clf.fit(X_train, y_train)
    # Mean accuracy on the training and validation splits.
    print(clf.score(X_train, y_train))
    print(clf.score(X_val, y_val))

    # print(clf.score(X_test, y_test))
    # Note that in binary classification, recall of the positive class is also known as “sensitivity”; recall of the negative class is “specificity”.
    # Time only the prediction call so the figure reflects inference speed.
    tic = time.perf_counter()
    y_pred = clf.predict(X_test)
    toc = time.perf_counter()
    elapsed = toc - tic
    # Samples predicted per second; guard against a zero reading of the timer.
    throughput = X_test.shape[0] / elapsed if elapsed > 0 else float('inf')

    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=config_data['class_label_to_name']))
    # len() of the pickled bytes is the serialized size itself; sys.getsizeof
    # would additionally count the Python bytes-object header.
    print('model size (byte):', len(pickle.dumps(clf)))
    # The reported quantity is a rate (samples per second), not a duration.
    print('test throughput (samples/s):', throughput)
    print()
    print('-----' * 3)
    print()

In [None]:
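# NOTE: every line of this cell is commented out, so nothing here is executed.
# It is kept only as reference material.
# # Code source: Gaël Varoquaux
# #              Andreas Müller
# # Modified for documentation by Jaques Grobler
# # License: BSD 3 clause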

# import numpy as np
# import matplotlib.pyplot as plt
# from matplotlib.colors import ListedColormap
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_moons, make_circles, make_classification
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# from sklearn.inspection import DecisionBoundaryDisplay

# plt.style.use('default') 
# plt.style.use('fivethirtyeight') # default, ggplot, fivethirtyeight, bmh, dark_background, classic
# plt.rcParams.update({'font.size': 11})
# plt.rcParams.update({'font.family': 'Arial'})

# names = [
#     "Nearest Neighbors",
#     "Linear SVM",
#     "RBF SVM",
#     "Gaussian Process",
#     "Decision Tree",
#     "Random Forest",
#     "Neural Net",
#     "AdaBoost",
#     "Naive Bayes",
#     "QDA",
# ]

# classifiers = [
#     KNeighborsClassifier(3),
#     SVC(kernel="linear", C=0.025),
#     SVC(gamma=2, C=1),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
#     DecisionTreeClassifier(max_depth=5),
#     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
#     MLPClassifier(alpha=1, max_iter=1000),
#     AdaBoostClassifier(),
#     GaussianNB(),
#     QuadraticDiscriminantAnalysis(),
# ]

# X, y = make_classification(
#     n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1
# )

# print(X.shape, y.shape)

# rng = np.random.RandomState(2)
# X += 2 * rng.uniform(size=X.shape)
# linearly_separable = (X, y)

# datasets = [
#     make_moons(noise=0.3, random_state=0),
#     make_circles(noise=0.2, factor=0.5, random_state=1),
#     linearly_separable,
# ]

# figure = plt.figure(figsize=(27, 9))
# i = 1
# # iterate over datasets
# for ds_cnt, ds in enumerate(datasets):
#     # preprocess dataset, split into training and test part
#     X, y = ds
#     X = StandardScaler().fit_transform(X)
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.4, random_state=42
#     )

#     x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
#     y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

#     # just plot the dataset first
#     cm = plt.cm.RdBu
#     cm_bright = ListedColormap(["#FF0000", "#0000FF"])
#     ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
#     if ds_cnt == 0:
#         ax.set_title("Input data")
#     # Plot the training points
#     ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
#     # Plot the testing points
#     ax.scatter(
#         X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"
#     )
#     ax.set_xlim(x_min, x_max)
#     ax.set_ylim(y_min, y_max)
#     ax.set_xticks(())
#     ax.set_yticks(())
#     i += 1

#     # iterate over classifiers
#     for name, clf in zip(names, classifiers):
#         ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
#         clf.fit(X_train, y_train)
#         score = clf.score(X_test, y_test)
#         DecisionBoundaryDisplay.from_estimator(
#             clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
#         )

#         # Plot the training points
#         ax.scatter(
#             X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
#         )
#         # Plot the testing points
#         ax.scatter(
#             X_test[:, 0],
#             X_test[:, 1],
#             c=y_test,
#             cmap=cm_bright,
#             edgecolors="k",
#             alpha=0.6,
#         )

#         ax.set_xlim(x_min, x_max)
#         ax.set_ylim(y_min, y_max)
#         ax.set_xticks(())
#         ax.set_yticks(())
#         if ds_cnt == 0:
#             ax.set_title(name)
#         ax.text(
#             x_max - 0.3,
#             y_min + 0.3,
#             ("%.2f" % score).lstrip("0"),
#             size=15,
#             horizontalalignment="right",
#         )
#         i += 1

# plt.tight_layout()
# plt.show()