In [17]:
import numpy as np
import pandas as pd
import random
import os
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from joblib import dump
from tqdm.notebook import tqdm

import itertools
from sklearn.utils import shuffle
from scipy import signal
%matplotlib inline


from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectFdr, chi2

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score

from utils.svm import preProcess, evaluate_set
from utils.visualize import showMe
from utils.augment import augment
from config.default import *


%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
root_path = 'C:/resources/EMG/'
post_fix = '_1s_cleaned' #'_1s_new' #
classes = settings['classes']


sessions_to_val = [] #['session_4'] # ['session_1','session_2','session_3','session_4']    #[] # 
include = ['S002', 'S004', 'S005', 'S006', 'S007', 'S008', 'S009', 'S101', 'S102']   #['S101', 'S102'] #
# use session4 for validation
train_sessions = []
val_sessions = []
for subject in os.listdir(root_path):
    #if subject not in exclude:
    if subject in include:
        for session in os.listdir(os.path.join(root_path,subject)):
            if session in sessions_to_val:
                val_sessions.append(os.path.join(root_path,subject, session))
            else:
                train_sessions.append(os.path.join(root_path,subject, session))
        #print(f"{len(os.listdir(os.path.join(root_path,subject)))} session loaded from subject: {subject}")




train_records = {}
if len(train_sessions) > 0:
    for c in classes:
        class_data = []
        for session in train_sessions:
            data = np.load(os.path.join(session,c+post_fix+'.npy'),allow_pickle=True)
            if data.shape[0] != 0:
                class_data.append(data)
            else:
                #val_records[c] = np.random.rand(1, 4, 500)
                #print(f'WARNING! CREATING RANDOM DATA FOR {c}')
                print(f"No data available for train for class {c}")
        
        train_records[c] = np.concatenate(class_data)
    print(f"{len(train_sessions)} sessions loaded for training")
else:
    print("No train session available")

val_records = {}
for c in classes:
    class_data = []
    for session in val_sessions:
        data = np.load(os.path.join(session,c+post_fix+'.npy'),allow_pickle=True)
        if data.shape[0] != 0:
            class_data.append(data)
    if len(class_data) != 0:
        val_records[c] = np.concatenate(class_data)
    else:
        #val_records[c] = np.random.rand(1, 4, 500)
        #print(f'WARNING! CREATING RANDOM DATA FOR {c}')
        print(f"No data available for validation for class {c}")

print(f"{len(val_sessions)} sessions loaded for validation")

No data available for train for class Chew
No data available for train for class Chew
No data available for train for class Chew
No data available for train for class Smile
No data available for train for class Smile
No data available for train for class Smile
No data available for train for class Smile
No data available for train for class Smile
No data available for train for class Smile
33 sessions loaded for training
No data available for validation for class Rest
No data available for validation for class Eyebrow
No data available for validation for class Chew
No data available for validation for class Smile
0 sessions loaded for validation


In [19]:
print("TRAIN SET")
for r in train_records:
    print(f'{r} -> {train_records[r].shape}')

print("VAL SET")
for r in val_records:
    print(f'{r} -> {val_records[r].shape}')

TRAIN SET
Rest -> (1515, 4, 500)
Eyebrow -> (1887, 4, 500)
Chew -> (1281, 4, 500)
Smile -> (813, 4, 500)
VAL SET


In [20]:
def create_labels(X):
    y = []
    for i, r in enumerate(X):
        l = np.ones(X[r].shape[0])*i
        y = y + l.tolist()
    y = np.array(y)
    return y


In [21]:
n_channels = train_records["Rest"].shape[1]
input_length = train_records["Rest"].shape[2]


print('Train')
train_y = create_labels(train_records)
train_X = np.concatenate((list(train_records.values())), axis=0)
print(train_X.shape)
print(train_y.shape)


print('Validation:')
val_y = create_labels(val_records)
val_X = np.concatenate((list(val_records.values())), axis=0)
print(val_X.shape)
print(val_y.shape)

Train
(5496, 4, 500)
(5496,)
Validation:


ValueError: need at least one array to concatenate

In [22]:
# Reshape to SVM
train_X = train_X.reshape(train_X.shape[0], n_channels*input_length)
val_X = val_X.reshape(val_X.shape[0], n_channels*input_length)
print(train_X.shape)
print(val_X.shape)


(5496, 2000)
(1266, 2000)


In [23]:
# Shuffle
c = list(zip(train_X, train_y))
random.seed(42)
random.shuffle(c)
train_X, train_y = zip(*c)
train_X = np.array(train_X)
train_y = np.array(train_y)

print(train_X.shape)
print(train_y.shape)


(5496, 2000)
(5496,)


In [9]:
X, y = augment(X, y)
print("Shapes After augmentation")
print(X.shape)
print(y.shape)

Shapes After augmentation
(1800, 2000)
(1800,)


In [24]:
# SMALLER C -> better fit
# HIGHER gamma -> better fit
#param_grid = {'C': [1, 10, 100,1000], 'gamma': [1,0.1,0.01,0.001,0.0001]} #acc 88 test acc 45
#param_grid = {'C': [100,1000], 'gamma': [0.01,0.001,0.0001]} #slow

#param_grid = {'C': [100000,1000000], 'gamma': [0.000001,0.0000001]} 
param_grid = {'C': [10], 'gamma': [0.01]}

In [25]:
accs = []
models = []
def grid(X_train,y_train, X_test, y_test):
    grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
    grid.fit(X_train,y_train)

    grid_predictions = grid.predict(X_test)
    acc = accuracy_score(y_test,grid_predictions)
    accs.append(acc)
    models.append(grid.best_estimator_)
    # if acc > 0.9:
    #     return True
    # else:
    #     return False


skf = StratifiedKFold(n_splits=10,random_state= 42, shuffle = True)
for train, test in skf.split(train_X, train_y):
    
    X_train = train_X[train]
    y_train = train_y[train]
    X_test = train_X[test]
    y_test = train_y[test]

    grid(X_train,y_train, X_test, y_test)
    # if grid(X_train,y_train, X_test, y_test):
    #     break
    break
    

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=10, gamma=0.01; total time=   8.8s
[CV] END ...................................C=10, gamma=0.01; total time=   9.2s
[CV] END ...................................C=10, gamma=0.01; total time=   9.2s
[CV] END ...................................C=10, gamma=0.01; total time=   8.7s
[CV] END ...................................C=10, gamma=0.01; total time=   8.7s


In [26]:
model = models[accs.index(max(accs))]
for acc in accs:
    print(acc)

0.9418181818181818


In [27]:
evaluate_set(model, train_sessions, classes, post_fix, log = False)

  0%|          | 0/33 [00:00<?, ?it/s]

No data for class Smile
No data for class Smile
No data for class Smile
No data for class Chew
No data for class Smile
No data for class Smile
No data for class Chew
No data for class Chew
No data for class Smile
Global accuracy: 98.24%
         Accuracy
Subject          
S002        96.00
S004        96.00
S005        98.50
S006        99.50
S007        96.25
S008        98.25
S009        98.75
S101        99.50
S102        99.75


In [15]:
evaluate_set(model, val_sessions, classes, post_fix, log = False)

  0%|          | 0/8 [00:00<?, ?it/s]

No data for class Smile
Global accuracy: 81.88%
         Accuracy
Subject          
S004           65
S005           83
S006           74
S007           82
S008           82
S009           87
S101           91
S102           91


In [28]:
dump(model, 'saved_models/svm_9subj_no_val.joblib') 

['saved_models/svm_9subj_no_val.joblib']