In [1]:
import warnings
from pathlib import Path, PureWindowsPath

warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import mne
import numpy as np
import pandas as pd
import scipy.io
from sklearn.ensemble import IsolationForest

  from numpy.core.umath_tests import inner1d


In [2]:
# Input locations for subject 601, each normalised through pathlib and stored
# as a plain string: the EEGLAB epochs file, the rejection-marker .mat file
# and the sleep-stage .mat file.
file_path, mat_reject, mat_stage = (
    str(Path(location))
    for location in (
        r'eeg-data/601/Rew_601_rest_bb_epoch.set',
        r'eeg-data/601/Rew_601_rest_reject_rmm.mat',
        r'eeg-data/601/Rew_601_rest_stages.mat',
    )
)

In [3]:
# Load the EEGLAB .set file. It is first read as epoched data; if that fails
# for any ordinary error, the same file is read as continuous (raw) data
# instead, so `epochs` may hold either kind of object.
try:
    epochs = mne.io.read_epochs_eeglab(file_path)
except Exception:
    # `Exception` (not a bare `except:`) so that KeyboardInterrupt/SystemExit
    # still interrupt the cell instead of silently triggering the fallback.
    # NOTE(review): narrow this to the specific error MNE raises for
    # non-epoched files once confirmed for the installed version.
    epochs = mne.io.read_raw_eeglab(file_path)

# Load sleep stages (optional). When the stages file is missing, `sleep` and
# `sleep_file` are explicitly None rather than left undefined, so later cells
# can test for them instead of hitting a NameError.
sleep_file = None
sleep = None
try:
    sleep_file = scipy.io.loadmat(mat_stage)
    sleep = sleep_file['stages'].flatten()
except FileNotFoundError:
    pass

# Load rejection markers (required): a missing file raises here.
reject_file = scipy.io.loadmat(mat_reject)
reject = reject_file['reject'].flatten()

At least one epoch has multiple events. Only the latency of the first event will be retained.
Extracting parameters from eeg-data/601/Rew_601_rest_bb_epoch.set...
6476 matching events found
No baseline correction applied
Not setting metadata
0 projection items activated
Ready.


In [4]:
# Convert to and clean DataFrame:
df = epochs.to_data_frame()

# Signal (channel) columns, sorted by name. Bookkeeping names are excluded so
# this also works if to_data_frame() already returns 'time'/'epoch'/'condition'
# as regular columns rather than as index levels.
bookkeeping = ('time', 'epoch', 'condition')
columns = sorted(name for name in df.columns if name not in bookkeeping)

# Move any index levels into ordinary columns.
df = df.reset_index()

# 'condition' is not used downstream; errors='ignore' makes the drop a no-op
# when the column is absent, without hiding unrelated errors.
df = df.drop(['condition'], axis=1, errors='ignore')

# Final column order: time, epoch (only when present), then the channels.
cleaned_columns = ['time']
if 'epoch' in df.columns:
    cleaned_columns.append('epoch')
cleaned_columns += columns

df = df[cleaned_columns]
# Independent working copy used by the following cells.
df_ = df.copy()

Converting time column to int64...


In [5]:
df_.head(5)

signal,time,epoch,C3,C4,CZ,F3,F4,F7,F8,FZ,...,Fp2,O1,O2,P3,P4,PZ,T3,T4,T5,T6
0,0,0,-4.101171,-5.968149,-4.743762,-3.682605,-4.446309,-1.441384,-5.15716,-2.841753,...,-5.491191,-5.813361,-10.619266,-5.897866,-7.994251,-6.512607,-2.072411,-2.857092,-1.348035,-6.712998
1,2,0,-5.183039,-3.539983,-4.891985,-5.510036,-2.73158,-4.127728,-2.581909,-2.712256,...,-4.066828,-8.367174,-10.431769,-7.622222,-6.328388,-7.43875,-3.432891,-0.377738,-3.379942,-5.844828
2,5,0,-6.473076,-1.935451,-5.402406,-7.127059,-1.392381,-6.066678,-0.504993,-2.933984,...,-2.69985,-10.873877,-10.9336,-9.510497,-5.507238,-8.70678,-4.636084,1.354425,-5.225135,-5.669819
3,8,0,-8.008626,-1.666145,-6.403526,-8.324136,-0.666902,-6.764383,0.725579,-3.599645,...,-1.47836,-13.169887,-12.535002,-11.537941,-6.059763,-10.455081,-5.518118,1.881818,-6.69757,-6.624831
4,10,0,-9.597174,-2.770913,-7.707548,-8.924362,-0.604117,-6.133142,0.986625,-4.461777,...,-0.552566,-14.950358,-15.219418,-13.436047,-8.015812,-12.51721,-5.916834,1.181734,-7.626414,-8.737988


In [6]:
# Select values from columns for IForest:
# keep every column except the 'time' and 'epoch' bookkeeping columns, so only
# the signal columns are passed to the model. The names are read from `df_`,
# the same frame that is sliced below.
value_columns = [name for name in df_.columns if name not in ('time', 'epoch')]

df_values = df_[value_columns]

In [7]:
df_values.head(5)

signal,C3,C4,CZ,F3,F4,F7,F8,FZ,Fp1,Fp2,O1,O2,P3,P4,PZ,T3,T4,T5,T6
0,-4.101171,-5.968149,-4.743762,-3.682605,-4.446309,-1.441384,-5.15716,-2.841753,-3.508248,-5.491191,-5.813361,-10.619266,-5.897866,-7.994251,-6.512607,-2.072411,-2.857092,-1.348035,-6.712998
1,-5.183039,-3.539983,-4.891985,-5.510036,-2.73158,-4.127728,-2.581909,-2.712256,-3.882948,-4.066828,-8.367174,-10.431769,-7.622222,-6.328388,-7.43875,-3.432891,-0.377738,-3.379942,-5.844828
2,-6.473076,-1.935451,-5.402406,-7.127059,-1.392381,-6.066678,-0.504993,-2.933984,-4.071187,-2.69985,-10.873877,-10.9336,-9.510497,-5.507238,-8.70678,-4.636084,1.354425,-5.225135,-5.669819
3,-8.008626,-1.666145,-6.403526,-8.324136,-0.666902,-6.764383,0.725579,-3.599645,-3.855127,-1.47836,-13.169887,-12.535002,-11.537941,-6.059763,-10.455081,-5.518118,1.881818,-6.69757,-6.624831
4,-9.597174,-2.770913,-7.707548,-8.924362,-0.604117,-6.133142,0.986625,-4.461777,-3.055052,-0.552566,-14.950358,-15.219418,-13.436047,-8.015812,-12.51721,-5.916834,1.181734,-7.626414,-8.737988


In [None]:
# Run IForest:
# Fit an Isolation Forest on the signal columns and flag anomalous samples.
X = df_values
clfIF = IsolationForest(random_state=42, contamination=0.00001, n_jobs=3)
clfIF.fit(X)

# The "train" and "test" predictions are both made on the same X, so the
# model is scored once and the result copied instead of predicting twice.
pred_train = clfIF.predict(X)
pred_test = pred_train.copy()

# (labels, counts) pairs; labels are -1 for anomalies and 1 for normal points.
count_train = np.unique(pred_train, return_counts=True)
count_test = np.unique(pred_test, return_counts=True)

# Positional row numbers of the samples flagged as anomalies.
index_train = np.flatnonzero(pred_train == -1).tolist()
index_test = np.flatnonzero(pred_test == -1).tolist()

# Positions (not labels) are used for the lookup, hence .iloc.
df_IF = df_.iloc[index_test]

# Count by label value rather than by position in the np.unique output, so the
# numbers stay correct (and no IndexError occurs) when only one label appears.
num_anomalies = int((pred_train == -1).sum()), int((pred_test == -1).sum())
# NOTE(review): despite its name, `total_pts` holds the number of points
# predicted as normal (label 1), matching the original computation.
total_pts = int((pred_train == 1).sum()), int((pred_test == 1).sum())

In [None]:
# Report the detection summary. Each value is a (train, test) pair.
print(f"{num_anomalies} artifacts detected")
# `total_artifacts` was never defined; the defined counterpart is `total_pts`,
# which counts the points predicted as normal (label 1).
print(f"{total_pts} non-artifact points")