For the feature extraction, I would like to thank Tuur Smolders, who helped me epoch the data and explained why it is done in this particular way. He also helped me calculate the frontal alpha asymmetry. In addition, I would like to thank Anne van Duijvenbode for sharing her code for setting the montage based on channel names and locations.

# Feature Extraction

In [None]:
pip install mne

Collecting mne
  Downloading mne-1.7.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mne
Successfully installed mne-1.7.0


In [None]:
import numpy as np
import pandas as pd
import sklearn
import pickle
import mne
import os
import matplotlib.pyplot as plt
from mne.time_frequency import tfr_multitaper

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# prevent extensive logging: only MNE messages at WARNING level or above are shown
mne.set_log_level('WARNING')

In [None]:
# Mount Google Drive at /content/drive; the data paths used in this notebook
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the participants table from its pickle on Drive and discard the
# 'diagnosis_group' column in a single chained expression.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
df_participants = (
    pd.read_pickle('/content/drive/MyDrive/TD-BRAIN/TDBRAIN_participants_V2_data/df_participants.pkl')
    .drop(columns=['diagnosis_group'])
)

# Report the resulting shape as a check that the column is gone
print(f'all participants after removing diagnosis_group: {df_participants.shape}')

# Show five randomly chosen rows for a visual check
df_participants.sample(5)


all participants after removing diagnosis_group: (370, 12)


Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,age,gender,sessID,nrSessions,EC,EO,diagnosis
250,sub-87974973,DISCOVERY,HEALTHY,HEALTHY,,24.1,1,1,1,True,True,HEALTHY
811,sub-88036037,DISCOVERY,MDD,MDD,MDD-rTMS,53.0,1,1,1,True,True,MDD
799,sub-88035229,DISCOVERY,MDD,UNKNOWN,,65.17,0,1,1,True,True,MDD
842,sub-88039773,DISCOVERY,MDD,MDD,MDD-rTMS,24.18,1,1,1,True,True,MDD
1024,sub-88052285,DISCOVERY,MDD,UNKNOWN,,51.53,0,1,1,True,True,MDD


In [None]:
# Report the DataFrame's shape and the number of distinct participant IDs.
# A unique-ID count below the row count means some IDs occur more than once.
participants_shape = df_participants.shape
n_unique_ids = df_participants['participants_ID'].nunique()

print("Voor aanpassing:")
print("Vorm van de DataFrame:", participants_shape)
print("Unieke IDs:", n_unique_ids)


Voor aanpassing:
Vorm van de DataFrame: (370, 12)
Unieke IDs: 362


In [None]:
## Channel definitions for the montage, based on the channel names and locations provided in
## Van Dijk et al. (2022). This code is adapted from Anne van Duijvenbode.

# Channel types, aligned one-to-one with `ch_names`:
# 26 EEG channels, 4 EOG channels, then 'Erbs' (ecg), 'OrbOcc' (eog) and 'Mass' (emg).
ch_types = ['eeg'] * 26 + ['eog'] * 4 + ['ecg', 'eog', 'emg']

ch_names = ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC3', 'FCz', 'FC4', 'T7', 'C3', 'Cz', 'C4', 'T8', 'CP3',
            'CPz', 'CP4', 'P7', 'P3', 'Pz', 'P4', 'P8', 'O1', 'Oz', 'O2', 'VPVA', 'VNVB', 'HPHL', 'HNHR', 'Erbs',
            'OrbOcc', 'Mass']

# channel name -> channel type
dict_eeg_channels = dict(zip(ch_names, ch_types))

# Reference table of [x, y, z] electrode positions (presumably millimetres - TODO confirm
# against the source publication). The two scaled tables below are derived from this one,
# so it is the only place where coordinates are typed by hand.
dict_ch_pos = {'Fp1' : [-26.81, 84.06, -10.56],
               'Fp2' : [29.41, 83.74, -10.04],
               'F7'  : [-66.99, 41.69, -15.96],
               'F3'  : [-48.05, 51.87, 39.87],
               'Fz'  : [0.90, 57.01, 66.36],
               'F4'  : [50.38, 51.84, 41.33],
               'F8'  : [68.71, 41.16, -15.31],
               'FC3' : [-58.83, 21.02, 54.82],
               'FCz' : [0.57, 24.63, 87.63],
               'FC4' : [60.29, 21.16, 55.58],
               'T7'  : [-83.36, -16.52, -12.65],
               'C3'  : [-65.57, -13.25, 64.98],
               'Cz'  : [0.23, -11.28, 99.81],
               'C4'  : [66.50, -12.80, 65.11],
               'T8'  : [84.44, -16.65, -11.79],
               'CP3' : [-65.51, -48.48, 68.57],
               'CPz' : [-0.42, -48.77, 98.37],
               'CP4' : [65.03, -48.35, 68.57],
               'P7'  : [-71.46, -75.17, -3.70],
               'P3'  : [-55.07, -80.11, 59.44],
               'Pz'  : [-0.87, -82.23, 82.43],
               'P4'  : [53.51, -80.13, 59.40],
               'P8'  : [71.10, -75.17, -3.69],
               'O1'  : [-28.98, -114.52, 9.67],
               'Oz'  : [-1.41, -117.79, 15.84],
               'O2'  : [26.89, -114.68, 9.45]
              }

# `dict_ch_pos` divided by 100.
# NOTE(review): despite the `_m` suffix this is not `dict_ch_pos / 1000`; the /100 scaling of the
# original hand-typed table is preserved. Deriving it fixes the CPz x typo (-0.042 -> -0.0042).
dict_ch_pos_m = {ch: [round(coord / 100, 4) for coord in xyz]
                 for ch, xyz in dict_ch_pos.items()}

# `dict_ch_pos` divided by 1000, as numpy arrays; this is the table handed to the montage.
# Deriving it fixes two hand-typed entries that disagreed with `dict_ch_pos`:
# Cz x (0.000023 -> 0.00023) and CPz x (-0.0042 -> -0.00042).
dict_ch_pos_array = {ch: np.array([round(coord / 1000, 5) for coord in xyz])
                     for ch, xyz in dict_ch_pos.items()}

# channel groupings
frontal = ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC3', 'FCz', 'FC4']
central = ['T7', 'C3', 'Cz', 'C4', 'T8']
parietal = ['CP3','CPz', 'CP4', 'P7', 'P3', 'Pz', 'P4', 'P8']
occipital = ['O1', 'Oz', 'O2']
channel_groups = {'frontal': frontal, 'central': central, 'parietal': parietal, 'occipital': occipital}

# define (5) frequencies of interest for TFR per frequency band
delta = np.array([1, 1.5, 2, 2.5, 3]) # starting at one because of high-pass filter
theta = np.array([4, 4.75, 5.5, 6.25, 7])
alpha = np.array([8, 9, 10, 11, 12])
beta = np.array([13, 17.25, 21.5, 25.75, 30])
gamma = np.array([42, 54, 66, 78, 90])
bands = {'delta': delta, 'theta': theta, 'alpha': alpha, 'beta': beta, 'gamma': gamma}


## Create montage
# Build a digitization montage from the per-channel position arrays,
# declared to be in the 'head' coordinate frame.
montage = mne.channels.make_dig_montage(ch_pos = dict_ch_pos_array, coord_frame = 'head')

# Create info object for MNE
# Describes all 33 channels in ch_names/ch_types at a 500 Hz sampling rate.
info = mne.create_info(ch_names=ch_names, ch_types=ch_types, sfreq=500)
# Attach the montage; on_missing='raise' asks MNE to raise rather than continue
# when channel positions are missing from the montage.
info.set_montage(montage=montage, on_missing= 'raise')
# Print the Info summary as a visual check of channel counts and digitization points.
print(info)

<Info | 8 non-empty values
 bads: []
 ch_names: Fp1, Fp2, F7, F3, Fz, F4, F8, FC3, FCz, FC4, T7, C3, Cz, C4, T8, ...
 chs: 26 EEG, 5 EOG, 1 ECG, 1 EMG
 custom_ref_applied: False
 dig: 29 items (3 Cardinal, 26 EEG)
 highpass: 0.0 Hz
 lowpass: 250.0 Hz
 meas_date: unspecified
 nchan: 33
 projs: []
 sfreq: 500.0 Hz
>


# Feature Extraction and storing in df

In [None]:
import scipy  # NOTE(review): not used in this cell; kept in case later cells rely on it

# Compute the frontal alpha asymmetry (right-minus-left alpha power) per fixed-length epoch
# for every selected recording, and store the eyes-open (`_eo`) and eyes-closed (`_ec`)
# values side by side in one DataFrame keyed by participant ID and epoch number.
eeg_dir = "/content/drive/MyDrive/TD-BRAIN/preprocessed"

sample_ids = df_participants['participants_ID'].tolist() # list of participants to include
sample_id_set = set(sample_ids) # exact, O(1) membership test instead of a substring scan per file

# One row of demographics per participant. `participants_ID` is not guaranteed unique in
# `df_participants`, so the first row per ID is used.
demographic_cols = ['age', 'gender', 'diagnosis']
demographics = (
    df_participants
    .drop_duplicates(subset='participants_ID', keep='first')
    .set_index('participants_ID')[demographic_cols]
)

# Frontal channels whose alpha power is needed, and the (right, left) pairs to subtract.
frontal_picks = ['Fp1', 'Fp2', 'F7', 'F8', 'F3', 'F4', 'FC3', 'FC4']
asymmetry_pairs = [('Fp2', 'Fp1'), ('F8', 'F7'), ('F4', 'F3'), ('FC4', 'FC3')]


def _recording_condition(filename):
    """Return 'EO' or 'EC' according to the marker found in `filename`, or None if neither is present."""
    # 'EO' is tested first so a name containing both markers resolves to 'EO'.
    for marker in ('EO', 'EC'):
        if marker in filename:
            return marker
    return None


def _is_selected_recording(filename):
    """Return True for first-session, non-BAD .npy recordings of an included participant."""
    # Assumes file names start with '<participants_ID>_'.
    return (
        filename.endswith('.npy')
        and 'ses-1' in filename
        and 'BAD' not in filename
        and filename.split('_')[0] in sample_id_set
        and _recording_condition(filename) is not None
    )


def _epoch_asymmetry(filepath, info, cond):
    """Return one row per epoch with 'ID', 'epoch' and the four asymmetry values for `cond`.

    Parameters
    ----------
    filepath : str
        Path to a preprocessed recording that exposes its samples under the 'data' key.
    info : mne.Info
        Channel description matching the recording's channels.
    cond : str
        'EO' or 'EC'; its lower-cased form becomes the column suffix.
    """
    participant_id = os.path.basename(filepath).split('_')[0]

    # NOTE(review): allow_pickle=True can execute arbitrary code on load - only use trusted files.
    preprocessed_eeg = np.load(filepath, allow_pickle = True)
    raw = mne.io.RawArray(np.squeeze(preprocessed_eeg['data']), info)

    # epoch the data
    epochs = mne.make_fixed_length_epochs(raw, duration = 9.95, overlap = 0)

    # PSD per epoch for the frontal channels, restricted to 8-13 Hz
    spectrum = epochs.compute_psd(method='welch', fmin=8, fmax=13, picks=frontal_picks)
    psds, freqs = spectrum.get_data(return_freqs=True)

    # Mean alpha power per epoch and channel (average over the frequency axis)
    psd_alpha = np.mean(psds, axis=2)
    n_epochs = psd_alpha.shape[0]

    # Label each column with the channel name reported by the spectrum itself rather than
    # assuming the output follows the order of `frontal_picks`.
    alpha_power = dict(zip(spectrum.ch_names, psd_alpha.T))

    features = {
        'ID': [participant_id] * n_epochs,
        'epoch': list(range(1, n_epochs + 1)),
    }
    suffix = cond.lower()
    for right, left in asymmetry_pairs:
        # NOTE(review): frontal alpha asymmetry is commonly defined on log-transformed power
        # (ln right - ln left); the raw difference and the 1e12 scaling are kept unchanged
        # here - confirm this is the intended definition and unit.
        features[f'{right}-{left}_{suffix}'] = (alpha_power[right] - alpha_power[left]) * 1e12
    return pd.DataFrame(features)


# Walk the directory tree once; sorting makes the processing order deterministic.
eeg_files = sorted(
    os.path.join(subdir, filename)
    for subdir, _, filenames in os.walk(eeg_dir)
    for filename in filenames
    if _is_selected_recording(filename)
)
total_files = len(eeg_files)
if not eeg_files:
    raise FileNotFoundError(f'No matching .npy recordings found under {eeg_dir}')

# needs specific info object, because the recordings have one less channel (the last one is omitted).
# Built once outside the loop; note that this rebinds the notebook-level `info`.
info = mne.create_info(ch_names=ch_names[:32], ch_types=ch_types[:32], sfreq=500)
info.set_montage(montage=montage, on_missing= 'raise')

# Collect per-file frames and concatenate once at the end (avoids quadratic concat in the loop).
feature_frames = {'EC': [], 'EO': []}
for count, filepath in enumerate(eeg_files, start=1):
    cond = _recording_condition(os.path.basename(filepath))
    feature_frames[cond].append(_epoch_asymmetry(filepath, info, cond))
    print(f'\rProgress: {count}/{total_files} files processed.', end = '')

missing_conditions = [cond for cond, frames in feature_frames.items() if not frames]
if missing_conditions:
    raise ValueError(f'No recordings found for condition(s): {missing_conditions}')

df_ec_features = pd.concat(feature_frames['EC'], ignore_index = True)
df_eo_features = pd.concat(feature_frames['EO'], ignore_index = True)

# merge EO and EC dataframes; the outer join keeps epochs that exist in only one condition
df_features = pd.merge(df_eo_features, df_ec_features, how='outer', on=['ID', 'epoch'])
del df_ec_features, df_eo_features # remove dataframes to free up memory

# Attach demographics by participant ID after the merge, so rows that exist only in the
# EC data still carry age, gender and diagnosis.
for position, col in enumerate(demographic_cols, start=1):
    df_features.insert(position, col, df_features['ID'].map(demographics[col]))

print(f'\n{df_features.shape = }')
df_features.sample(12)

Progress: 898/898 files processed.
df_features.shape = (7632, 13)


Unnamed: 0,ID,age,gender,diagnosis,epoch,Fp2-Fp1_eo,F8-F7_eo,F4-F3_eo,FC4-FC3_eo,Fp2-Fp1_ec,F8-F7_ec,F4-F3_ec,FC4-FC3_ec
1938,sub-88034149,36.65,1.0,MDD,11,56235790000.0,642780600000.0,189846800000.0,285493900000.0,-1644044000000.0,-1541106000000.0,-3622478000000.0,-2067090000000.0
2060,sub-88035049,58.79,0.0,MDD,6,1063323000000.0,-128942900000.0,463210600000.0,-1534573000000.0,324128500000.0,-243959100000.0,265916900000.0,-1129906000000.0
6987,sub-88075769,67.4,0.0,MDD,4,-347114800000.0,462324500000.0,446251700000.0,272857900000.0,-225294900000.0,-120975200000.0,-476770000000.0,-234837900000.0
5512,sub-88050169,26.63,1.0,MDD,5,98916300000.0,-200860600000.0,66035190000.0,-748262900000.0,-358927700000.0,-38423110000.0,-914523500000.0,-2378076000000.0
6037,sub-88059169,49.06,0.0,MDD,2,41595850000.0,199394700000.0,-688075400000.0,-983201800000.0,-381184000000.0,-969646300000.0,-356274000000.0,-221590900000.0
1974,sub-88034733,58.34,1.0,MDD,8,-195688400000.0,-419923800000.0,-23765950000.0,-169912500000.0,-141018700000.0,-36304220000.0,-49104970000.0,59123350000.0
2960,sub-88040449,56.1,0.0,MDD,3,68308330000.0,-824303200000.0,-93220700000.0,337786700000.0,-186708400000.0,-393033200000.0,-209353900000.0,-609317200000.0
6847,sub-88072573,71.17,1.0,MDD,8,-90236680000.0,73428400000.0,-67947080000.0,49795730000.0,-120976800000.0,404100400000.0,161072200000.0,122940000000.0
2008,sub-88034645,49.14,0.0,MDD,5,-123489300000.0,-543267300000.0,-566213100000.0,-1186329000000.0,1040059000000.0,-500959500000.0,376437900000.0,-1695746000000.0
2897,sub-88039773,24.18,1.0,MDD,11,166794700000.0,-34842080000.0,-144993200000.0,44559580000.0,376264600000.0,-3194833000.0,231479300000.0,177108100000.0


In [None]:
# Split the feature columns by recording condition. Matching on the '_ec' / '_eo' suffix
# (rather than any occurrence of the letters) keeps unrelated columns out of the selection.
# NOTE(review): the names `ec` and `df_eo` are inconsistent; both are kept as-is in case
# other cells refer to them.
ec = df_features.loc[:, df_features.columns.str.endswith('_ec')]
df_eo = df_features.loc[:, df_features.columns.str.endswith('_eo')]

In [None]:
# Show three randomly chosen rows of the merged feature table as a visual check.
df_features.sample(3)

Unnamed: 0,ID,age,gender,diagnosis,epoch,Fp2-Fp1_eo,F8-F7_eo,F4-F3_eo,FC4-FC3_eo,Fp2-Fp1_ec,F8-F7_ec,F4-F3_ec,FC4-FC3_ec
4208,sub-88045989,29.63,1.0,MDD,3,326390900000.0,-410130700000.0,-204128400000.0,-1276234000000.0,141656600000.0,-368617000000.0,309836800000.0,-236406300000.0
5570,sub-88049905,46.02,1.0,MDD,1,125626400000.0,-254950400000.0,66019910000.0,-61263430000.0,509636400000.0,1026067000000.0,444066400000.0,697039200000.0
6851,sub-88072573,71.17,1.0,MDD,12,139099700000.0,364548500000.0,222203500000.0,294891200000.0,-36255470000.0,-189278000000.0,636980700000.0,666249200000.0


In [None]:
# Count missing values per column. Because the EO and EC tables were combined with an
# outer merge, epochs present in only one condition appear as NaN in the other's columns.
df_features.isna().sum()

ID              0
age           144
gender         24
diagnosis      24
epoch           0
Fp2-Fp1_eo     24
F8-F7_eo       24
F4-F3_eo       24
FC4-FC3_eo     24
Fp2-Fp1_ec     48
F8-F7_ec       48
F4-F3_ec       48
FC4-FC3_ec     48
dtype: int64

In [None]:
# NOTE(review): self-assignment - this statement has no effect (presumably a leftover from a
# removed filtering step) and can be deleted.
df_features = df_features


In [None]:
# Persist the merged feature table. The target directory is created first so that a missing
# folder does not make the save fail after the long extraction run.
features_path = '/content/drive/MyDrive/TD-BRAIN/extracted_features/df_stat_features.pkl'
os.makedirs(os.path.dirname(features_path), exist_ok=True)
df_features.to_pickle(features_path)