In [1]:
""" 
Preprocess EEG data from BrainVision
Saves in .mat format, segmented by triggers
"""

################
## Imports
################
import mne
import os
import glob
import librosa
import numpy as np
import pandas as pd
from scipy.io import savemat
from scipy.signal import correlate
import matplotlib.pyplot as plt
from mne.preprocessing import ICA

from IPython.display import Audio, display

In [2]:
################
## EDIT THIS PART
################

# Recording to preprocess and the CSV log describing the trials of the session.
# file = '../EEG_data_test/trig_test_2/Untitled2.vhdr'
file = '../../Data/Samet/multi1.vhdr'
log_path = '../../Data/Samet/trial_arrow_log.csv'
trial_log_csv = pd.read_csv(log_path)

# Recording name without directory or extension, e.g. 'multi1'.
filename = file.split('/')[-1].split('.')[0]
exp_type = filename.split('_')[-1]
# exp_type = 'mixed' 

# Rows of the trial log that belong to each recording (half-open row ranges),
# keyed by recording name.
recording_rows = {
    'multi1': slice(0, 10),
    'multi2': slice(10, 19),
    'multi3': slice(19, 29),
    'multi4': slice(29, 38),
}
if filename not in recording_rows:
    # Fail loudly instead of leaving `trial_log` undefined (or stale from an
    # earlier run of this cell with a different `file`).
    raise ValueError(
        f"No trial-log rows defined for recording {filename!r}; "
        f"expected one of {sorted(recording_rows)}"
    )
trial_log = trial_log_csv[recording_rows[filename]]
print(trial_log.shape)

# Directory receiving the preprocessing record and the segmented .mat files.
output_dir = '../../Data/Samet/Preprocessed/preprocessed_mixed_01_15Hz'
os.makedirs(output_dir, exist_ok=True)

# overwrite = False

(10, 8)


In [3]:
# Display the trial log that was read from `log_path`.
trial_log_csv

Unnamed: 0,Trial,Round,SpeechFile,MusicFile,LeftFile,RightFile,FirstHalfAttend,SecondHalfAttend
0,1,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Music,Speech
1,2,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Music,Speech
2,3,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Speech,Music
3,4,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Music,Speech
4,5,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Music,Speech
5,6,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Speech,Music
6,7,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Speech,Music
7,8,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Music,Speech
8,9,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Music,Speech
9,10,1,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/piano_only_long_cropped_22kHz/piano_4_...,Stimuli/speech_only_short_22kHz/jane_eyre_05_p...,Music,Speech


In [4]:
################
## Parameters
################
# When True, each preprocessing stage also opens a diagnostic plot.
plot = False
# FS_ORIG = 25000  # Hz

# ---- Preprocessing switches and settings ----

# Notch filter: on/off flag and the frequency to remove (Hz)
notch_applied = False
freq_notch = 50

# Band-pass filter: on/off flag, cut-offs (Hz) and IIR design settings
bpf_applied = True
freq_low, freq_high = 0.1, 15
bandpass = f'{freq_low}-{freq_high}'
ftype, order = 'butter', 3

# Bad-channel interpolation
int_applied = False
interpolation = 'spline'

# Re-referencing: either the average reference or an explicit channel list
reref_applied = True
reref_type = 'average'  # 'channels' or 'average'
reref_channels = None   # only read when reref_type == 'channels'

# Downsampling: target rate in Hz, or the placeholder 'N/A' when disabled
down_applied = True
downfreq = 128 if down_applied else 'N/A'

# ICA: 'eog' or 'manual'; 'eog' requires EOG channels in the data
ica_applied = False
ica_method = 'manual'


################
## Read and crop data
################
# Load the whole BrainVision recording into memory.
raw = mne.io.read_raw_brainvision(file, preload=True)

# Recording metadata reused by the later stages.
ch_names = raw.ch_names
FS_ORIG = raw.info['sfreq']

# Turn the annotations into an events array plus the description -> id mapping.
events, event_id = mne.events_from_annotations(raw)
print(events)

# Cropping to the first event is disabled; the full recording is kept.
# exp_start = events[0][0] / FS_ORIG
# exp_end = raw.times[-1]
# eeg = raw.copy().crop(tmin = exp_start,
#                       tmax = exp_end)

print('loaded')
################
## Preprocess
################
# One-row table recording which preprocessing stages ran and with which settings;
# it is filled in stage by stage below.
df_pre = pd.DataFrame()

## -------------
## Select channels
## -------------
# Keep the first 31 channels for EEG processing. Channel index 31 of `raw` is
# read separately later on as the recorded audio signal.
eeg_channels = ch_names[0:31]
# `pick` is the current replacement for the legacy `pick_channels`. Work on a
# copy so that `raw` keeps every channel.
eeg = raw.copy().pick(eeg_channels)
if plot:
    eeg.plot(start=100, duration=10, n_channels=len(eeg.ch_names))

## -------------
## Notch filtering
## -------------
# Always record whether the stage ran; the notch frequency is only logged when it did.
df_pre['notch_applied'] = [notch_applied]
if notch_applied:
    eeg = eeg.notch_filter(freqs=freq_notch)
    df_pre['notch'] = [freq_notch]
    if plot:
        eeg.plot()

## -------------
## BPFiltering
## -------------
df_pre['bpf_applied'] = [bpf_applied]
if bpf_applied:
    # IIR design settings shared by the optional response plot and the actual filtering.
    iir_params = dict(order=order, ftype=ftype)

    if plot:
        # The explicit filter design is only used to visualise the response, so
        # build it only when plotting. Doing it unconditionally copied the whole
        # data array and designed (and logged) the same filter twice.
        filter_params = mne.filter.create_filter(eeg.get_data(), eeg.info['sfreq'],
                                                 l_freq=freq_low, h_freq=freq_high,
                                                 method='iir', iir_params=iir_params)
        flim = (1., eeg.info['sfreq'] / 2.)  # frequencies
        dlim = (-0.001, 0.001)  # delays
        kwargs = dict(flim=flim, dlim=dlim)
        mne.viz.plot_filter(filter_params, eeg.info['sfreq'], compensate=True, **kwargs)
        # plt.savefig(os.path.join(output_dir, 'bpf_ffilt_shape.png'))

    eeg = eeg.filter(l_freq=freq_low, h_freq=freq_high, method='iir', iir_params=iir_params)
    # Log the design settings and both cut-off frequencies.
    df_pre['bandpass'] = [iir_params]
    df_pre['HPF'] = [freq_low]
    df_pre['LPF'] = [freq_high]
    if plot:
        eeg.plot()

## -------------
## Interpolation
## -------------
df_pre['int_applied'] = [int_applied]
if int_applied: 
    # reset_bads=False keeps the interpolated channels listed in info['bads'],
    # so they can still be reported below.
    eeg = eeg.interpolate_bads(reset_bads=False)  #, method=interpolation

    # info['bads'] holds channel *names*; derive the positional indices from
    # them (indexing ch_names with the names themselves raises TypeError).
    interp_names = list(eeg.info['bads'])
    interp_inds = [eeg.ch_names.index(name) for name in interp_names]

    # Print the number and names of the interpolated channels
    print(f'{len(interp_inds)} channels interpolated: {interp_names}')

    df_pre['interpolation'] = [interpolation]
    df_pre['interp_inds'] = [interp_inds]
    df_pre['interp_names'] = [interp_names]

    if plot:
        eeg.plot()

## -------------
## Rereferencing
## -------------
df_pre['reref_applied'] = [reref_applied]
if reref_applied:
    # Resolve the reference to use from the configured type.
    if reref_type == 'average':
        ref_channels = 'average'
    elif reref_type == 'channels':
        ref_channels = reref_channels
    else:
        # Previously an unrecognised type skipped re-referencing silently while
        # the record still claimed reref_applied=True.
        raise ValueError(
            f"Unknown reref_type {reref_type!r}; expected 'average' or 'channels'"
        )

    eeg = eeg.set_eeg_reference(ref_channels=ref_channels)
    # Log which reference was applied.
    df_pre['reref_type'] = [reref_type]
    df_pre['reref_channels'] = [ref_channels]
    if plot:
        eeg.plot()

## -------------
## Resampling
## -------------
# The resampling settings are logged whether or not the stage runs.
df_pre['down_applied'] = [down_applied]
df_pre['downfreq'] = [downfreq]
if down_applied:
    eeg = eeg.resample(sfreq=downfreq)
    print(eeg.info)
    if plot:
        eeg.plot()



#--------------------------------------------
#               ICA
#--------------------------------------------
# Log the ICA settings in the single-row record.
df_pre['ica_applied'] = [ica_applied]
df_pre['ica_method'] = [ica_method]

if ica_applied:
    ica = ICA(max_iter="auto", random_state=97)
    ica.fit(eeg)

    # Choose the components to remove.
    if ica_method == 'eog':
        # Automatic: components flagged by find_bads_eog.
        eog_indices, eog_scores = ica.find_bads_eog(eeg)
        print('Rejecting components:', eog_indices)
        ica.exclude = eog_indices
    elif ica_method == 'manual':
        # Manual: show the components and read the indices typed by the user;
        # entries that are not plain digits are ignored.
        ica.plot_components()
        to_exclude = input('Select components to exclude (comma separated): ')
        stripped_entries = (entry.strip() for entry in to_exclude.split(','))
        ica.exclude = [int(entry) for entry in stripped_entries if entry.isdigit()]
        # ica.exclude = [0] #test

    # reconst_raw = raw.copy()
    # ica.apply(reconst_raw)
    ica.apply(eeg)

## -------------
## Save preprocessing stages
## -------------
pp_record_path = os.path.join(output_dir, f'{filename}_pp_record.csv')
df_pre.to_csv(pp_record_path, index=False)


################
## SEGMENT DATA
################

# Each trial is located by cross-correlating its stimulus audio file with the
# audio channel of the recording. The matching EEG span is then split into two
# consecutive 30 s halves, each saved to its own .mat file together with the
# stimulus that was attended during that half.
# trial_starts = events[events[:, 2] == 1]
# print(trial_starts)

# Position of each recording's first trial within the full trial log; added to
# the per-recording trial index so output file numbers do not collide.
trial_index_offsets = {'multi1': 0, 'multi2': 10, 'multi3': 19, 'multi4': 29}
if filename not in trial_index_offsets:
    # Fail before writing anything instead of hitting an undefined (or stale)
    # output file name inside the loop.
    raise ValueError(
        f"No output index offset defined for recording {filename!r}; "
        f"expected one of {sorted(trial_index_offsets)}"
    )
trial_offset = trial_index_offsets[filename]

# Audio channel of the recording (channel index 31 of `raw`). It is sampled at
# the recording's own rate, so use that rate rather than a hard-coded value.
# Extracted and normalised once, since it is the same for every trial.
long_sr = FS_ORIG
long_audio = raw.get_data(picks=[31])[0]
long_audio = (long_audio - np.mean(long_audio)) / np.std(long_audio)

# Sampling rate of the preprocessed EEG. Read from the data itself because
# `downfreq` is the string 'N/A' when downsampling is disabled.
eeg_sfreq = eeg.info['sfreq']
first_half_end = int(30 * eeg_sfreq)
second_half_end = int(60 * eeg_sfreq)

n_trials = len(trial_log)
for i in range(n_trials):
    print(f'Segmenting trial {i+1}/{n_trials}')
    trial_row = trial_log.iloc[i]

    # Load the stimulus audio at its native rate. The path is built outside the
    # f-string because nesting the same quote type inside an f-string is a
    # syntax error before Python 3.12.
    left_file = trial_row['LeftFile']
    short_audio_path = '../../' + left_file[:8] + 'Cindy/' + left_file[8:]
    short_audio, short_sr = librosa.load(short_audio_path, sr=None)

    # Resample the stimulus to the rate of the recorded audio channel
    short_audio_resampled = librosa.resample(short_audio, orig_sr=short_sr, target_sr=long_sr)

    # Normalize the stimulus the same way as the recorded audio
    short_audio_resampled = (short_audio_resampled - np.mean(short_audio_resampled)) / np.std(short_audio_resampled)

    # Cross-correlation to find best match
    correlation = correlate(long_audio, short_audio_resampled, mode='valid')
    best_match_index = np.argmax(correlation)
    end_index = best_match_index + len(short_audio_resampled)

    print(f"Best match found at index range: {best_match_index} to {end_index}")

    # Convert sample indices (at the original rate) to seconds. Clamp the end to
    # the last available EEG time so crop() cannot be asked for data past the end.
    trial_tmin = best_match_index / FS_ORIG
    trial_tmax = min(end_index / FS_ORIG, eeg.times[-1])
    eeg_trial = eeg.copy().crop(tmin = trial_tmin, tmax = trial_tmax)


    ################
    ## Crop into different trials
    ################
    eeg_trial_np = eeg_trial.get_data()
    print(eeg_trial_np.shape)
    if eeg_trial_np.shape[1] < second_half_end:
        print(f'Warning: trial {i+1} has only {eeg_trial_np.shape[1]} samples; '
              f'expected at least {second_half_end}, so the saved halves will be short')

    # (start sample, end sample, trial-log column naming the attended stimulus)
    crop_params = [
        (0, first_half_end, 'FirstHalfAttend'),
        (first_half_end, second_half_end, 'SecondHalfAttend')
    ]

    # Stimulus file names without directory or extension; same for both halves.
    music_file = trial_row['MusicFile'].split('/')[-1].split('.')[0]
    speech_file = trial_row['SpeechFile'].split('/')[-1].split('.')[0]

    for tmin, tmax, stim_key in crop_params:
        stim = trial_row[stim_key]

        eeg_cropped = eeg_trial_np[:, int(tmin):int(tmax)]
        filename_out = f"samet_{stim}_{i + trial_offset}.mat"
        filepath = os.path.join(output_dir, filename_out)

        savemat(filepath, {'eeg_data': eeg_cropped,
                           'stimuli_music': music_file,
                           'stimuli_speech': speech_file,
                           'stim_attended': stim,
                           'stim_attended_pos': stim_key})

Extracting parameters from ../../Data/Samet/multi1.vhdr...
Setting channel info structure...
Reading 0 ... 623239  =      0.000 ...   623.239 secs...
Used Annotations descriptions: ['New Segment/', 'Stimulus/S  1', 'Stimulus/S  2', 'Stimulus/S  3']
[[     0      0  99999]
 [  7664      0      1]
 [ 37232      0      3]
 [ 68689      0      1]
 [ 98198      0      3]
 [129682      0      1]
 [159180      0      2]
 [190704      0      1]
 [220247      0      3]
 [251781      0      1]
 [281347      0      3]
 [312869      0      1]
 [342377      0      2]
 [373887      0      1]
 [403448      0      2]
 [434942      0      1]
 [464460      0      3]
 [495951      0      1]
 [525496      0      3]
 [556968      0      1]
 [586455      0      3]]
loaded
NOTE: pick_channels() is a legacy function. New code should use inst.pick(...).


  raw = mne.io.read_raw_brainvision(file, preload=True)
['Soundwave']
Consider setting the channel types to be of EEG/sEEG/ECoG/DBS/fNIRS using inst.set_channel_types before calling inst.set_montage, or omit these channels when creating your montage.
  raw = mne.io.read_raw_brainvision(file, preload=True)


Setting up band-pass filter from 0.1 - 15 Hz

IIR filter parameters
---------------------
Butterworth bandpass zero-phase (two-pass forward and reverse) non-causal filter:
- Filter order 12 (effective, after forward-backward)
- Cutoffs at 0.10, 15.00 Hz: -6.02, -6.02 dB

Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.1 - 15 Hz

IIR filter parameters
---------------------
Butterworth bandpass zero-phase (two-pass forward and reverse) non-causal filter:
- Filter order 12 (effective, after forward-backward)
- Cutoffs at 0.10, 15.00 Hz: -6.02, -6.02 dB

EEG channel type selected for re-referencing
Applying average reference.
Applying a custom ('EEG',) reference.
<Info | 9 non-empty values
 bads: []
 ch_names: Fp1, Fz, F3, F7, FT9, FC5, FC1, C3, T7, TP9, CP5, CP1, Pz, P3, ...
 chs: 31 EEG
 custom_ref_applied: True
 dig: 34 items (3 Cardinal, 31 EEG)
 highpass: 0.1 Hz
 lowpass: 15.0 Hz
 meas_date: 2025-07-10 11:22:05 UTC
 nchan: 31
 projs: []
 sfreq: 128.0 Hz
>