In [1]:
import mne
from mne.io import concatenate_raws, read_raw_edf
import glob

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from mne.decoding import CSP
from sklearn.preprocessing import LabelEncoder

from mne.decoding import (
    CSP,
)
import numpy as np

from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Subject ids and recording list; neither name is referenced by the other visible cells.
subjects, recordings = [1], [1, 1]

In [4]:
def get_data(subject=1, max_recordings=1):
    """Load PSG recording(s) of one subject with their hypnogram annotations attached.

    Files are looked up under ``./files`` with the patterns
    ``SC4<subject>*-PSG.edf`` and ``SC4<subject>*-Hypn*.edf``. Both file lists
    are sorted so that the i-th PSG file is paired with the i-th hypnogram
    file regardless of the order in which the filesystem returns them.

    Parameters
    ----------
    subject : int
        Subject number; zero-padded to two digits to build the file pattern.
    max_recordings : int | None
        Maximum number of (PSG, hypnogram) pairs to load, taken in sorted
        order. The default of 1 keeps the historical behaviour of loading a
        single recording. ``None`` loads every pair that was found.

    Returns
    -------
    The loaded recordings, annotated and concatenated with ``concatenate_raws``.

    Raises
    ------
    ValueError
        If ``max_recordings`` is smaller than 1, if the number of PSG and
        hypnogram files differ, or if no file matches the subject.
    """
    if max_recordings is not None and max_recordings < 1:
        raise ValueError('max_recordings must be a positive integer or None.')

    # pad with 0 so that single-digit subjects become two characters ("1" -> "01")
    subject = f"{subject:02d}"
    # Channels dropped at load time (previously the keys of a name -> type dict).
    to_exclude = ['EOG horizontal', 'Resp oro-nasal', 'EMG submental', 'Temp rectal', 'Event marker']
    # glob gives no ordering guarantee: sort both lists to keep the pairs aligned.
    files_data = sorted(glob.glob(f'./files/SC4{subject}*-PSG.edf'))
    files_anot = sorted(glob.glob(f'./files/SC4{subject}*-Hypn*.edf'))

    if len(files_data) != len(files_anot):
        raise ValueError('Number of PSG and Hypnogram files do not match.')

    if len(files_data) == 0:
        raise ValueError('No PSG or Hypnogram files found.')

    raws = []
    # Slicing with None keeps every pair.
    for psg_path, hypnogram_path in list(zip(files_data, files_anot))[:max_recordings]:
        raw = read_raw_edf(psg_path, exclude=to_exclude, preload=True)
        annot = mne.read_annotations(hypnogram_path)
        raw.set_annotations(annot, emit_warning=False)
        raws.append(raw)

    return concatenate_raws(raws)

In [5]:
# Load the recording of subject 1 (PSG signal with its hypnogram annotations attached).
raw = get_data(subject=1)

Extracting EDF parameters from /Users/owalid/42/post_intership/total-perspective-vortex/notebooks/other_datasets/sleep_edf/files/SC4011E0-PSG.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


Reading 0 ... 8405999  =      0.000 ... 84059.990 secs...


In [7]:
# Band-pass the recording to 8-30 Hz so that only alpha and beta waves remain.
# The filter is applied to `raw` itself; no copy is made here.
low_cutoff, high_cutoff = 8, 30
raw.filter(l_freq=low_cutoff, h_freq=high_cutoff, fir_design='firwin')

# Convert the annotations into an events array plus a description -> event code mapping.
events, event_dict = mne.events_from_annotations(raw)

Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 8 - 30 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 8.00
- Lower transition bandwidth: 2.00 Hz (-6 dB cutoff frequency: 7.00 Hz)
- Upper passband edge: 30.00 Hz
- Upper transition bandwidth: 7.50 Hz (-6 dB cutoff frequency: 33.75 Hz)
- Filter length: 165 samples (1.650 sec)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


Used Annotations descriptions: ['Sleep stage 1', 'Sleep stage 2', 'Sleep stage 3', 'Sleep stage 4', 'Sleep stage ?', 'Sleep stage R', 'Sleep stage W']


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s finished


In [8]:
# Display the annotation description -> integer event code mapping.
event_dict

{'Sleep stage 1': 1,
 'Sleep stage 2': 2,
 'Sleep stage 3': 3,
 'Sleep stage 4': 4,
 'Sleep stage ?': 5,
 'Sleep stage R': 6,
 'Sleep stage W': 7}

In [11]:
# Short stage names keyed by the integer event code (1..7) obtained from the annotations.
new_labels_events = dict(zip(
    range(1, 8),
    ('sleep_1', 'sleep_2', 'sleep_3', 'sleep_4', 'sleep_?', 'sleep_REM', 'sleep_WAKE'),
))

# Rebuild annotations from the events with the short names and attach them to the recording.
new_annot = mne.annotations_from_events(
    events,
    sfreq=raw.info['sfreq'],
    event_desc=new_labels_events,
)
raw.set_annotations(new_annot)

0,1
Measurement date,"March 29, 1989 16:49:00 GMT"
Experimenter,Unknown
Digitized points,Not available
Good channels,2 EEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,100.00 Hz
Highpass,8.00 Hz
Lowpass,30.00 Hz


In [12]:
# Re-extract the events so that the mapping uses the renamed annotation descriptions,
# then display that mapping.
events, event_dict = mne.events_from_annotations(raw)
event_dict

Used Annotations descriptions: ['sleep_1', 'sleep_2', 'sleep_3', 'sleep_4', 'sleep_?', 'sleep_REM', 'sleep_WAKE']


{'sleep_1': 1,
 'sleep_2': 2,
 'sleep_3': 3,
 'sleep_4': 4,
 'sleep_?': 5,
 'sleep_REM': 6,
 'sleep_WAKE': 7}

0 => N1: light sleep

1 => N2: deeper sleep

2 => N3: deep sleep

3 => N4: deep sleep

4 => ?: unknown stage ('Sleep stage ?'), excluded from the classification

5 => R: rapid eye movement

6 => W: wakefulness

In [15]:
# Stages kept for epoching: every stage except the unknown one ('sleep_?', event code 5).
events_to_select = {
    label: code
    for label, code in (
        ('sleep_1', 1),
        ('sleep_2', 2),
        ('sleep_3', 3),
        ('sleep_4', 4),
        ('sleep_REM', 6),
        ('sleep_WAKE', 7),
    )
}

In [16]:
# Cut a window from 0.5 s before to 0.5 s after each selected stage event.
# Only the stages listed in `events_to_select` are epoched; data is loaded eagerly.
epochs = mne.Epochs(
    raw,
    events,
    event_id=events_to_select,
    tmin=-0.5,
    tmax=0.5,
    preload=True,
)

Not setting metadata
125 matching events found
Setting baseline interval to [-0.5, 0.0] sec
Applying baseline correction (mode: mean)
0 projection items activated
Using data from preloaded Raw for 125 events and 101 original time points ...
1 bad epochs dropped


In [21]:
# Epoch data used as model input.
X = epochs.get_data()
# Labels: the event code stored in the last column of the events array, shifted by one
# so that the first stage (code 1) becomes 0.
y = epochs.events[:, -1] - 1
y

array([0, 1, 0, 1, 0, 6, 0, 1, 0, 6, 0, 1, 2, 1, 2, 1, 2, 3, 2, 3, 2, 3,
       2, 3, 2, 3, 2, 1, 6, 0, 1, 6, 0, 1, 6, 0, 1, 0, 1, 5, 1, 2, 1, 2,
       1, 2, 3, 2, 3, 1, 0, 1, 0, 1, 0, 6, 0, 6, 0, 1, 6, 0, 6, 0, 6, 0,
       6, 0, 1, 0, 1, 5, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       3, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 5, 1, 6, 0, 1, 5, 0, 5, 0, 5,
       1, 0, 1, 0, 1, 0, 1, 6, 0, 1, 5, 0, 5, 6])

In [22]:
# Re-encode the labels as consecutive integers (the excluded unknown stage leaves a gap at 4).
le = LabelEncoder().fit(y)
y = le.transform(y)

In [24]:
# Fixed seed for the estimators that involve randomness, so that repeated runs of the
# grid search report the same scores (same value as the cross-validation splitter's seed).
MODEL_SEED = 42

# Candidate classifiers as (display name, estimator, parameter grid) triples.
# Grid keys carry the 'model__' prefix because each estimator is used as the
# 'model' step of a Pipeline.
models = [
    ('Gradient Boosting', GradientBoostingClassifier(random_state=MODEL_SEED), {'model__n_estimators': [50, 100]}),
    ('Linear discriminant analysis', LinearDiscriminantAnalysis(), {'model__solver': ['svd', 'lsqr', 'eigen'], 'model__tol': [0.0001, 0.00001]}),
    ('SVM', SVC(), {'model__C': [0.5, 1, 3], 'model__kernel': ['linear']}),
    ('KNN', KNeighborsClassifier(), {'model__n_neighbors': [4, 5, 6]}),
    ('Random Forest', RandomForestClassifier(random_state=MODEL_SEED), {'model__n_estimators': [50, 100]}),
    ('MLP', MLPClassifier(random_state=MODEL_SEED), {'model__hidden_layer_sizes': [(100, 50), (200, 100)]}),
    ('Decision Tree', DecisionTreeClassifier(random_state=MODEL_SEED), {'model__max_depth': [50, 100]}),
]

In [25]:
# Cross-validation scheme shared by every grid search: 7 random 80/20 splits, fixed seed.
shuffle_split = ShuffleSplit(n_splits=7, test_size=0.2, random_state=42)

# CSP feature extraction placed in front of every classifier.
csp = CSP()

# Extra grid searched for every model, on top of its own parameters.
# NOTE(review): the recording above reports only 2 EEG channels; CSP presumably cannot
# return more components than channels, so these values may all yield the same
# features -- confirm and shrink the grid if so.
csp_grid = {'csp__n_components': [5, 6, 7, 8, 9, 10, 15, 20, 30, 40]}

# One (name, pipeline, grid) triple per candidate model. The grid is a merged copy,
# so the dictionaries stored in `models` are left untouched.
pipelines = [
    (name, Pipeline([('csp', csp), ('model', model)]), {**param_grid, **csp_grid})
    for name, model, param_grid in models
]

In [None]:
# Run an exhaustive grid search per pipeline on all cores and keep the fitted
# search object next to the model name.
results = [
    (
        name,
        GridSearchCV(pipeline, param_grid=param_grid, cv=shuffle_split, n_jobs=-1).fit(X, y),
    )
    for name, pipeline, param_grid in pipelines
]

In [27]:
res_grid = []
# Report the best parameter combination and mean cross-validated accuracy of each model.
for name, grid_search in results:
    print(
        f"Model: {name}",
        f"Best Parameters: {grid_search.best_params_}",
        f"Best Cross-Validated Accuracy: {grid_search.best_score_:.2f}",
        "\n",
        sep="\n",
    )

Model: Gradient Boosting
Best Parameters: {'csp__n_components': 5, 'model__n_estimators': 50}
Best Cross-Validated Accuracy: 0.30


Model: Linear discriminant analysis
Best Parameters: {'csp__n_components': 5, 'model__solver': 'svd', 'model__tol': 0.0001}
Best Cross-Validated Accuracy: 0.33


Model: SVM
Best Parameters: {'csp__n_components': 5, 'model__C': 1, 'model__kernel': 'linear'}
Best Cross-Validated Accuracy: 0.36


Model: KNN
Best Parameters: {'csp__n_components': 5, 'model__n_neighbors': 5}
Best Cross-Validated Accuracy: 0.34


Model: Random Forest
Best Parameters: {'csp__n_components': 30, 'model__n_estimators': 100}
Best Cross-Validated Accuracy: 0.32


Model: MLP
Best Parameters: {'csp__n_components': 7, 'model__hidden_layer_sizes': (100, 50)}
Best Cross-Validated Accuracy: 0.37


Model: Decision Tree
Best Parameters: {'csp__n_components': 8, 'model__max_depth': 50}
Best Cross-Validated Accuracy: 0.29




In [37]:
# Binary target: 1 for wake (shifted stage index 6), 0 for every other kept stage.
stage_idx = epochs.events[:, -1] - 1
y = np.where(stage_idx >= 6, 1, 0)
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Repeat the grid search of every pipeline against the current labels `y`.
results = [
    (
        name,
        GridSearchCV(pipeline, param_grid=param_grid, cv=shuffle_split, n_jobs=-1).fit(X, y),
    )
    for name, pipeline, param_grid in pipelines
]

In [39]:
res_grid = []
# Report the best parameter combination and mean cross-validated accuracy of each model.
for name, grid_search in results:
    print(
        f"Model: {name}",
        f"Best Parameters: {grid_search.best_params_}",
        f"Best Cross-Validated Accuracy: {grid_search.best_score_:.2f}",
        "\n",
        sep="\n",
    )

Model: Gradient Boosting
Best Parameters: {'csp__n_components': 6, 'model__n_estimators': 50}
Best Cross-Validated Accuracy: 0.89


Model: Linear discriminant analysis
Best Parameters: {'csp__n_components': 5, 'model__solver': 'svd', 'model__tol': 0.0001}
Best Cross-Validated Accuracy: 0.89


Model: SVM
Best Parameters: {'csp__n_components': 5, 'model__C': 1, 'model__kernel': 'linear'}
Best Cross-Validated Accuracy: 0.89


Model: KNN
Best Parameters: {'csp__n_components': 5, 'model__n_neighbors': 5}
Best Cross-Validated Accuracy: 0.90


Model: Random Forest
Best Parameters: {'csp__n_components': 30, 'model__n_estimators': 100}
Best Cross-Validated Accuracy: 0.90


Model: MLP
Best Parameters: {'csp__n_components': 40, 'model__hidden_layer_sizes': (100, 50)}
Best Cross-Validated Accuracy: 0.90


Model: Decision Tree
Best Parameters: {'csp__n_components': 10, 'model__max_depth': 100}
Best Cross-Validated Accuracy: 0.87




In [31]:
# Three-way target built from the shifted stage index (event code - 1):
#   0 -> sleep stages with index below 5, 1 -> REM (index 5), 2 -> wake (index 6).
stage_idx = epochs.events[:, -1] - 1
y = np.select(
    [stage_idx < 5, stage_idx == 5, stage_idx == 6],
    [0, 1, 2],
    default=stage_idx,
)
le = LabelEncoder().fit(y)
y = le.transform(y)
y

array([0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 0,
       2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1, 2])

In [None]:
# Repeat the grid search of every pipeline against the current labels `y`.
results = [
    (
        name,
        GridSearchCV(pipeline, param_grid=param_grid, cv=shuffle_split, n_jobs=-1).fit(X, y),
    )
    for name, pipeline, param_grid in pipelines
]

In [33]:
res_grid = []
# Report the best parameter combination and mean cross-validated accuracy of each model.
for name, grid_search in results:
    print(
        f"Model: {name}",
        f"Best Parameters: {grid_search.best_params_}",
        f"Best Cross-Validated Accuracy: {grid_search.best_score_:.2f}",
        "\n",
        sep="\n",
    )

Model: Gradient Boosting
Best Parameters: {'csp__n_components': 6, 'model__n_estimators': 50}
Best Cross-Validated Accuracy: 0.78


Model: Linear discriminant analysis
Best Parameters: {'csp__n_components': 5, 'model__solver': 'lsqr', 'model__tol': 0.0001}
Best Cross-Validated Accuracy: 0.81


Model: SVM
Best Parameters: {'csp__n_components': 5, 'model__C': 3, 'model__kernel': 'linear'}
Best Cross-Validated Accuracy: 0.81


Model: KNN
Best Parameters: {'csp__n_components': 5, 'model__n_neighbors': 5}
Best Cross-Validated Accuracy: 0.81


Model: Random Forest
Best Parameters: {'csp__n_components': 5, 'model__n_estimators': 50}
Best Cross-Validated Accuracy: 0.81


Model: MLP
Best Parameters: {'csp__n_components': 5, 'model__hidden_layer_sizes': (100, 50)}
Best Cross-Validated Accuracy: 0.81


Model: Decision Tree
Best Parameters: {'csp__n_components': 15, 'model__max_depth': 50}
Best Cross-Validated Accuracy: 0.75


