In [7]:
import mne
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, make_scorer, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
%matplotlib inline

path_out = 'C:/skoltech_hand_writing'
subj = 'S102'


In [8]:
epochs_list_concated = mne.read_epochs(f'{path_out}/hand_writing/{subj}/5_files_for_model/epochs_eeg_2-epo.fif', preload = True)
epochs_eeg=epochs_list_concated.copy().resample(1000)
epochs_eeg_init=epochs_list_concated.copy().resample(1000)

Reading C:\skoltech_hand_writing\hand_writing\S102\5_files_for_model\epochs_eeg_2-epo.fif ...
Isotrak not found
    Found the data of interest:
        t =   -2000.00 ...    7000.00 ms
        0 CTF compensation matrices available
Not setting metadata
210 matching events found
No baseline correction applied
0 projection items activated


In [9]:
# DEFINE CHANNELS TO PICK FOR THE MODEL IF DON'T WANT TO AVERAGE ALL EEG CHANNELS
# picks = [i for i in epochs_eeg.ch_names if 'L.vis' in i]
# picks = ['EEG L.vis_1', 'EEG L.vis_3', 'EEG L.lob_3', 'EEG L.lob_4', 'EEG L.lob_5', 'EEG L.lob_6', 'EEG L.lob_8', 'EEG R.vis_1',
        #  'EEG R.vis_5', 'EEG R.vis_6', 'EEG R.lob_6', 'EEG R.lob_7']
picks = [i for i in epochs_eeg.ch_names if 'EEG' in i]
epochs_filt = epochs_eeg.copy().pick(picks).filter(l_freq = 60, h_freq = 120)

Setting up band-pass filter from 60 - 1.2e+02 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 60.00
- Lower transition bandwidth: 15.00 Hz (-6 dB cutoff frequency: 52.50 Hz)
- Upper passband edge: 120.00 Hz
- Upper transition bandwidth: 30.00 Hz (-6 dB cutoff frequency: 135.00 Hz)
- Filter length: 221 samples (0.221 s)



[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  71 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 287 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 647 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 881 tasks      | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done 1151 tasks      | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done 1457 tasks      | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 2177 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done 2591 tasks      | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done 3041 tasks      | elapsed:    2.2s
[Parallel(n_jobs=1)]: Done 3527 tasks      | elapsed:    2.4s
[Parallel(n_jobs=1)]: Done 4049 tasks      | elapsed:    2.7s
[Parallel(n_jobs=1)]: Done 4607 tasks      | elapsed:    3.0s
[Parallel(n_job

In [10]:
epochs_filt

0,1
Number of events,210
Events,0: 210
Time range,-2.000 – 6.999 s
Baseline,off


In [17]:
# define X (eeg data of shape (n_epochs, n_times)) and y (target numbers that had to be written)
X = pd.DataFrame(epochs_filt.copy().get_data().mean(axis = 1))
y = pd.read_csv(f'{path_out}/hand_writing/{subj}/Digits_trials_final_2.xlsx')['Digit_text'].values
# random train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)

  X = pd.DataFrame(epochs_filt.copy().get_data().mean(axis = 1))


In [18]:
# PCA to reduce dimensionality
pca = PCA(n_components=30, random_state = 42)
pca.fit(X_train)
X_train_dec = pca.transform(X_train)
X_test_dec = pca.transform(X_test)
X_train_dec.shape, X_test_dec.shape

((168, 30), (42, 30))

### Make pipeline with a classifier, CV and grid search and metrics to find the best model

In [19]:
auc_scorer = make_scorer(
    score_func=roc_auc_score,
    needs_proba=True
)

estimators = [('gb', GradientBoostingClassifier(random_state = 42))]
param_grid = dict(gb__learning_rate = [.01, .1, 1, 10],
                  gb__n_estimators = [10, 50, 100],
                  gb__ccp_alpha = [0, .01, .1, 1],
                  gb__random_state = [42],
                  )
pipe = Pipeline(estimators)
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=5,
    scoring = 'roc_auc_ovo_weighted', # THIS CAN BE CHANGED    
    verbose=10
)
grid_search.fit(X_train_dec, y_train)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

best_estimator = grid_search.best_estimator_
best_estimator.fit(X_train_dec, y_train)
y_pred = best_estimator.predict_proba(X_test_dec)
print(f'ROC-AUC on a test sample: {roc_auc_score(y_test, y_pred, average='weighted', multi_class='ovo')}')



Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5; 1/48] START gb__ccp_alpha=0, gb__learning_rate=0.01, gb__n_estimators=10, gb__random_state=42
[CV 1/5; 1/48] END gb__ccp_alpha=0, gb__learning_rate=0.01, gb__n_estimators=10, gb__random_state=42;, score=0.492 total time=   0.1s
[CV 2/5; 1/48] START gb__ccp_alpha=0, gb__learning_rate=0.01, gb__n_estimators=10, gb__random_state=42
[CV 2/5; 1/48] END gb__ccp_alpha=0, gb__learning_rate=0.01, gb__n_estimators=10, gb__random_state=42;, score=0.432 total time=   0.2s
[CV 3/5; 1/48] START gb__ccp_alpha=0, gb__learning_rate=0.01, gb__n_estimators=10, gb__random_state=42
[CV 3/5; 1/48] END gb__ccp_alpha=0, gb__learning_rate=0.01, gb__n_estimators=10, gb__random_state=42;, score=0.534 total time=   0.1s
[CV 4/5; 1/48] START gb__ccp_alpha=0, gb__learning_rate=0.01, gb__n_estimators=10, gb__random_state=42
[CV 4/5; 1/48] END gb__ccp_alpha=0, gb__learning_rate=0.01, gb__n_estimators=10, gb__random_state=42;, score=0.518 total tim