In [1]:
from modules.constants import *

In [None]:
# Define a function to create directories if they do not exist.
def create_dir(path):
    """Create the directory ``path``, including any missing parent directories.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the same directory in between.
    os.makedirs(path, exist_ok=True)

# Create directories for training and validation datasets.
create_dir(train_dir)
create_dir(val_dir)

# Loop over subjects (num_sub) and sessions (num_sess) to process individual EEG files.
# Each recording is filtered, resampled, repaired (bad channels), passed through ICA,
# given an average-reference projector, and saved as a FIF file.
for i in range(1, num_sub + 1):
    # Zero-padded subject label (e.g. 'sub-001') shared by directory and file names.
    sub_label = f'sub-{i:03d}'
    # The first 16 subjects go to the training directory, all others to validation.
    out_dir = train_dir if i <= 16 else val_dir

    for j in range(1, num_sess + 1):
        # Zero-padded session label (e.g. 'ses-01'); the run number itself is not padded.
        ses_label = f'ses-{j:02d}'
        data_path = (
            f'../../ds003774/{sub_label}/{ses_label}/eeg/'
            f'{sub_label}_{ses_label}_task-MusicListening_run-{j}_eeg.set'
        )
        # Read the EEG data using the MNE library, preloading it into memory.
        raw = read_raw_eeglab(data_path, preload=True)

        # Apply a high-pass filter with a cutoff at 0.2 Hz to remove slow drifts in the data.
        raw.filter(l_freq=0.2, h_freq=None)

        # Apply a notch filter at 50 Hz to remove electrical line noise.
        raw.notch_filter(freqs=[50])

        # Downsample the data to 256 Hz to reduce data size and computational complexity.
        raw.resample(256)

        # Select EEG channels (including ones already marked bad) and retrieve their data.
        picks = pick_types(raw.info, eeg=True, exclude=[])
        data = raw.get_data(picks=picks)

        # Calculate power spectral densities (PSDs) using Welch's method, 2 to 40 Hz.
        # Row k of `psds` belongs to channel picks[k], not to channel index k.
        psds, freqs = psd_array_welch(data, sfreq=raw.info['sfreq'], fmin=2, fmax=40)

        # Per-channel mean PSD and threshold (3 times the standard deviation over frequency).
        # NOTE(review): each channel is compared against its own spread rather than
        # against the other channels -- confirm this is the intended criterion.
        psd_mean = psds.mean(axis=-1)
        psd_threshold = 3 * np.std(psds, axis=-1)

        # Identify bad channels. Iterate picks in lockstep with the PSD statistics so the
        # right channel name is used even when non-EEG channels shift the indices, and
        # skip channels that are already marked bad to avoid duplicate entries.
        already_bad = set(raw.info['bads'])
        bad_channels = []
        for ch_idx, ch_mean, ch_threshold in zip(picks, psd_mean, psd_threshold):
            ch_name = raw.ch_names[ch_idx]
            if ch_mean > ch_threshold and ch_name not in already_bad:
                bad_channels.append(ch_name)
        raw.info['bads'] += bad_channels
        # Interpolate data for the bad channels.
        raw.interpolate_bads()

        # Apply Independent Component Analysis (ICA) for artifact rejection.
        # NOTE(review): no components are added to `ica.exclude` before apply(), so
        # this presumably removes nothing -- confirm whether component selection is missing.
        ica = ICA(n_components=20, random_state=99, method='fastica')
        ica.fit(raw)
        ica.apply(raw)

        # Re-reference the EEG data to the average reference.
        # NOTE(review): with projection=True the reference is added as a projector;
        # verify that consumers of the saved file apply it (e.g. raw.apply_proj()).
        raw.set_eeg_reference('average', projection=True)

        # Save the preprocessed EEG data in FIF format, overwriting any existing file.
        pre_path = os.path.join(out_dir, f'pre_eeg_{sub_label}_{ses_label}_eeg.fif')
        raw.save(pre_path, overwrite=True)
