# Imports

In [1]:
import pyxdf
import pandas as pd
import numpy as np
import sounddevice as sd
from glob import glob
from tqdm import tqdm
import datetime

# Load Data

In [3]:
sub_files = glob('/Users/camilla.strauss/Desktop/CUNY_Data/Data/*/*.xdf')
sub_files

['/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-P5318014/sub-P5318014_ses-S001_task-CUNY_run-001_mobi.xdf',
 '/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-P5942694/sub-P5942694_ses-S001_task-CUNY_run-001_mobi.xdf',
 '/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-5182010/sub-P5182010_ses-S001_task-CUNY_run-001_mobi.xdf',
 '/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-P5287460/sub-P5287460_ses-S001_task-CUNY_run-001_mobi.xdf',
 '/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-P5447527/sub-P5447527_ses-S001_task-CUNY_run-001_mobi.xdf',
 '/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-5958030/sub-P5958030_ses-S001_task-CUNY_run-001_mobi.xdf',
 '/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-P5899288/sub-P5899288_ses-S001_task-CUNY_run-001_mobi.xdf',
 '/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-P5813427/sub-P5813427_ses-S001_task-CUNY_run-001_mobi.xdf',
 '/Users/camilla.strauss/Desktop/CUNY_Data/Data/sub-P5070899/sub-P5070899_ses-S001_task-CUNY_run-001_mobi.

In [4]:
xdf_path = sub_files[1]
data, header = pyxdf.load_xdf(xdf_path)
streams_collected = [stream['info']['name'][0] for stream in data]


'''
HELPERS
'''
def get_event_data(event, df, stim_df):
    return df.loc[(df.lsl_time_stamp >= stim_df.loc[stim_df.event == 'Onset_'+event, 'lsl_time_stamp'].values[0]) & 
                  (df.lsl_time_stamp <= stim_df.loc[stim_df.event == 'Offset_'+event, 'lsl_time_stamp'].values[0])]

def get_secs_between_triggers(trigger1, trigger2, stim_df):
    return stim_df.loc[stim_df.trigger == trigger1, 'time'].values[0] - stim_df.loc[stim_df.trigger == trigger2, 'time'].values[0]

                  
streams_collected

['Stimuli_Markers',
 'WebcamStream',
 'OpenSignals',
 'Tobii',
 'EGI NetAmp 0',
 'Microphone']

# Stimulus

In [5]:
stim_dat = data[streams_collected.index('Stimuli_Markers')]
stim_df = pd.DataFrame(stim_dat['time_series'])
stim_df.rename(columns={0: 'trigger'}, inplace=True)
events = {
    200: 'Onset_Experiment',
    10: 'Onset_RestingState',
    11: 'Offset_RestingState',
    500: 'Onset_StoryListening',
    501: 'Offset_StoryListening',
    100: 'Onset_10second_rest',
    101: 'Offset_10second_rest', 
    20: 'Onset_CampFriend',
    21: 'Offset_CampFriend',
    30: 'Onset_FrogDissection',
    31: 'Offset_FrogDissection',
    40: 'Onset_DanceContest',
    41: 'Offset_DanceContest',
    50: 'Onset_ZoomClass',
    51: 'Offset_ZoomClass',
    60: 'Onset_Tornado',
    61: 'Offset_Tornado',
    70: 'Onset_BirthdayParty',
    71: 'Offset_BirthdayParty',
    300: 'Onset_subjectInput',
    301: 'Offset_subjectInput',
    302: 'Onset_FavoriteStory',
    303: 'Offset_FavoriteStory',
    304: 'Onset_WorstStory',
    305: 'Offset_WorstStory',
    400: 'Onset_impedanceCheck',
    401: 'Offset_impedanceCheck',
    80: 'Onset_SocialTask',
    81: 'Offset_SocialTask',
    201: 'Offset_Experiment',
}

story_onsets = [20, 30, 40, 50, 60, 70]

# relabel the event if the trigger is in the events dictionary, else if 
stim_df['event'] = stim_df['trigger'].apply(lambda x: events[x] if x in events.keys() else 'Bx_input')

# relabel the event as a psychopy timestamp if the trigger is greater than 5 digits
stim_df.loc[stim_df.trigger.astype(str).str.len() > 5, 'event'] = 'psychopy_time_stamp'
stim_df['lsl_time_stamp'] = stim_dat['time_stamps']
stim_df['time'] = (stim_dat['time_stamps'] - stim_dat['time_stamps'][0])/1000
stim_df

Unnamed: 0,trigger,event,lsl_time_stamp,time
0,2.000000e+02,Onset_Experiment,400235.007511,0.000000e+00
1,1.734384e+09,psychopy_time_stamp,400235.007525,1.439976e-08
2,1.000000e+01,Onset_RestingState,400256.042775,2.103526e-02
3,1.734384e+09,psychopy_time_stamp,400256.042785,2.103527e-02
4,1.100000e+01,Offset_RestingState,400556.042791,3.210353e-01
...,...,...,...,...
256,1.734387e+09,psychopy_time_stamp,402853.174749,2.618167e+00
257,2.010000e+02,Offset_Experiment,402858.180502,2.623173e+00
258,1.734387e+09,psychopy_time_stamp,402858.180511,2.623173e+00
259,4.000000e+00,Bx_input,402858.180585,2.623173e+00


# Eye Tracking Data

In [6]:
ET = data[streams_collected.index('Tobii')]
et_dat = ET['time_series']
# Get the column names
column_labels = [ET['info']['desc'][0]['channels'][0]['channel'][i]['label'][0] for i in range(len(ET['info']['desc'][0]['channels'][0]['channel']))]

et_df = pd.DataFrame(data=et_dat, columns=column_labels)
et_df['lsl_time_stamp'] = ET['time_stamps']
et_df['time'] = (ET['time_stamps'] - ET['time_stamps'][0])/1000
et_df.columns

Index(['device_time_stamp', 'left_gaze_origin_validity',
       'right_gaze_origin_validity',
       'left_gaze_origin_in_user_coordinate_system_0',
       'left_gaze_origin_in_user_coordinate_system_1',
       'left_gaze_origin_in_user_coordinate_system_2',
       'right_gaze_origin_in_user_coordinate_system_0',
       'right_gaze_origin_in_user_coordinate_system_1',
       'right_gaze_origin_in_user_coordinate_system_2',
       'left_gaze_origin_in_trackbox_coordinate_system_0',
       'left_gaze_origin_in_trackbox_coordinate_system_1',
       'left_gaze_origin_in_trackbox_coordinate_system_2',
       'right_gaze_origin_in_trackbox_coordinate_system_0',
       'right_gaze_origin_in_trackbox_coordinate_system_1',
       'right_gaze_origin_in_trackbox_coordinate_system_2',
       'left_gaze_point_validity', 'right_gaze_point_validity',
       'left_gaze_point_in_user_coordinate_system_0',
       'left_gaze_point_in_user_coordinate_system_1',
       'left_gaze_point_in_user_coordinate_s

In [7]:
et_df

Unnamed: 0,device_time_stamp,left_gaze_origin_validity,right_gaze_origin_validity,left_gaze_origin_in_user_coordinate_system_0,left_gaze_origin_in_user_coordinate_system_1,left_gaze_origin_in_user_coordinate_system_2,right_gaze_origin_in_user_coordinate_system_0,right_gaze_origin_in_user_coordinate_system_1,right_gaze_origin_in_user_coordinate_system_2,left_gaze_origin_in_trackbox_coordinate_system_0,...,left_gaze_point_on_display_area_0,left_gaze_point_on_display_area_1,right_gaze_point_on_display_area_0,right_gaze_point_on_display_area_1,left_pupil_validity,right_pupil_validity,left_pupil_diameter,right_pupil_diameter,lsl_time_stamp,time
0,1.389011e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,400235.316109,0.000000
1,1.389011e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,400235.324443,0.000008
2,1.389011e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,400235.332777,0.000017
3,1.389011e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,400235.341110,0.000025
4,1.389011e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,400235.349444,0.000033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322441,1.391698e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,402922.416040,2.687100
322442,1.391698e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,402922.424373,2.687108
322443,1.391698e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,402922.432707,2.687117
322444,1.391698e+12,0.0,0.0,,,,,,,,...,,,,,0.0,0.0,,,402922.441041,2.687125


# Physio Data

In [8]:
PS = data[streams_collected.index('OpenSignals')]
# Get the column names
column_labels = [PS['info']['desc'][0]['channels'][0]['channel'][i]['label'][0] for i in range(len(PS['info']['desc'][0]['channels'][0]['channel']))]
ps_df = pd.DataFrame(data=PS['time_series'], columns=column_labels)
ps_df['lsl_time_stamp'] = PS['time_stamps']
ps_df['time'] = (PS['time_stamps'] - PS['time_stamps'][0])/1000

# Microphone Data 

In [9]:
mic_data = data[streams_collected.index('Microphone')]
mic_df = pd.DataFrame(mic_data['time_series'], columns=['int_array'])
mic_df['bytestring'] = mic_df['int_array'].apply(lambda x: np.array(x).tobytes())
mic_df['duration'] = (mic_data['time_stamps'] - mic_data['time_stamps'][0])/441000

mic_df['lsl_time_stamp'] = mic_data['time_stamps']
mic_df['time_from_last'] = mic_df['lsl_time_stamp'].diff()

mic_df.head()

Unnamed: 0,int_array,bytestring,duration,lsl_time_stamp,time_from_last
0,-25,b'\xe7\xff\xff\xff\xff\xff\xff\xff',0.0,400573.836485,
1,-30,b'\xe2\xff\xff\xff\xff\xff\xff\xff',5.14198e-11,400573.836508,2.3e-05
2,-28,b'\xe4\xff\xff\xff\xff\xff\xff\xff',1.028397e-10,400573.83653,2.3e-05
3,-29,b'\xe3\xff\xff\xff\xff\xff\xff\xff',1.542595e-10,400573.836553,2.3e-05
4,-31,b'\xe1\xff\xff\xff\xff\xff\xff\xff',2.056793e-10,400573.836576,2.3e-05


# Video Data

In [10]:
# Identify the frames we're interested in
cam_data = data[streams_collected.index('WebcamStream')]

#cam_df = pd.DataFrame(cam['time_series'], columns=['frame'])
#cam_df['lsl_timestamps'] = cam['time_stamps']
#cam_df['time'] = cam_df['lsl_timestamps'] - cam_df['lsl_timestamps'][0]
cam_data['time_series'][0]
frame_nums = [int(i[0]) for i in cam_data['time_series']]
time_pre = [float(i[1]) for i in cam_data['time_series']]
time_evnt_ms = [float(i[2]) for i in cam_data['time_series']]
time_post = [float(i[3]) for i in cam_data['time_series']]


cam_df = pd.DataFrame({'frame_num': frame_nums, 
                    'time_pre': time_pre, 
                    'cap_time_ms': time_evnt_ms,
                    'time_post': time_post,
                    'lsl_time_stamp': cam_data['time_stamps']})



cam_df['frame_time_sec'] = (cam_df.cap_time_ms - cam_df.cap_time_ms[0])/1000
cam_df['lsl_time_sec'] = cam_df.lsl_time_stamp #- cam_df.lsl_time_stamp[0]
cam_df['time'] = (cam_df.lsl_time_stamp - cam_df.lsl_time_stamp[0])/1000

# EEG Data

In [11]:
eeg_dat = data[streams_collected.index('EGI NetAmp 0')]
eeg_df = pd.DataFrame(eeg_dat['time_series'])
eeg_df['lsl_time_stamp'] = eeg_dat['time_stamps']


# Durations for Each Experiment Part

In [12]:
streams = ['et', 'ps', 'mic', 'cam', 'eeg']

# get durations of certain experiment arm
def get_durations(ExperimentPart):

    # find expected duration
    exp_start = stim_df.loc[stim_df.event == 'Onset_'+ExperimentPart, 'lsl_time_stamp'].values[0]
    exp_end = stim_df.loc[stim_df.event == 'Offset_'+ExperimentPart, 'lsl_time_stamp'].values[0]
    exp_dur = round(exp_end - exp_start, 4)

    # expected mm:ss
    exp_dt = datetime.timedelta(seconds=exp_dur)
    exp_dt_dur = str(datetime.timedelta(seconds=round(exp_dt.total_seconds())))

    # make + populate df
    df = pd.DataFrame(columns = ['stream', 'duration', 'mm:ss', 'percent'])
    for i, stream in enumerate(streams):
        # don't include mic in resting state
        if ExperimentPart == 'RestingState' and stream == 'mic':
            continue
        # grab data for stream + experiment part
        event_data = get_event_data(ExperimentPart, globals()[stream+'_df'], stim_df)

        # print if no data
        if event_data.empty:
            df.loc[i] = [stream, 0, str(datetime.timedelta(seconds=0)), '0.00%']
            print(stream + ' has no ' + ExperimentPart + ' data') 
            continue
        # calculate duration
        start = event_data['lsl_time_stamp'].values[0]
        stop = event_data['lsl_time_stamp'].values[-1]
        dur = round(stop - start, 4)

        # calculate hh:mm:ss
        dt = datetime.timedelta(seconds=dur)
        dt_dur = str(datetime.timedelta(seconds=round(dt.total_seconds())))

        # calculate percent 
        percent = '{}%'.format(round(dur/exp_dur * 100, 2))
             
        df.loc[i] = [stream, dur, dt_dur, percent]

    # print which are short
    for i in df.iterrows():
        if i[1]['duration'] == 0:
            continue
        if i[1]['duration'] < (exp_dur - 5): # 5 second margin
            print(i[1]['stream'] + ' is shorter than expected for ' + ExperimentPart + ' by ' + str(round(exp_dur - i[1]['duration'], 2)) + ' seconds')
    
    # print df
    df.loc[df.index.max() + 1] = ['expected', exp_dur, exp_dt_dur, '100.0%']
    df.sort_values(by='duration', inplace=True)
    print('\n' + ExperimentPart + ' DataFrame')
    return df
    

In [13]:
get_durations('Experiment')

mic is shorter than expected for Experiment by 338.83 seconds

Experiment DataFrame


Unnamed: 0,stream,duration,mm:ss,percent
2,mic,2284.344,0:38:04,87.08%
0,et,2622.8561,0:43:43,99.99%
3,cam,2623.1281,0:43:43,100.0%
1,ps,2623.171,0:43:43,100.0%
4,eeg,2623.1721,0:43:43,100.0%
5,expected,2623.173,0:43:43,100.0%


In [14]:
get_durations('RestingState')


RestingState DataFrame


Unnamed: 0,stream,duration,mm:ss,percent
0,et,299.9852,0:05:00,100.0%
3,cam,299.991,0:05:00,100.0%
1,ps,299.9967,0:05:00,100.0%
4,eeg,299.9984,0:05:00,100.0%
5,expected,300.0,0:05:00,100.0%


In [15]:
get_durations('StoryListening')

mic is shorter than expected for StoryListening by 17.79 seconds

StoryListening DataFrame


Unnamed: 0,stream,duration,mm:ss,percent
2,mic,1185.8071,0:19:46,98.52%
3,cam,1203.5666,0:20:04,100.0%
0,et,1203.5994,0:20:04,100.0%
1,ps,1203.5997,0:20:04,100.0%
4,eeg,1203.5998,0:20:04,100.0%
5,expected,1203.6008,0:20:04,100.0%


In [16]:
get_durations('SocialTask')


SocialTask DataFrame


Unnamed: 0,stream,duration,mm:ss,percent
3,cam,300.458,0:05:00,99.98%
0,et,300.5019,0:05:01,100.0%
1,ps,300.5105,0:05:01,100.0%
4,eeg,300.5114,0:05:01,100.0%
2,mic,300.5124,0:05:01,100.0%
5,expected,300.5124,0:05:01,100.0%


In [17]:
get_durations('CampFriend')


CampFriend DataFrame


Unnamed: 0,stream,duration,mm:ss,percent
3,cam,140.405,0:02:20,99.98%
0,et,140.4215,0:02:20,99.99%
4,eeg,140.4299,0:02:20,100.0%
1,ps,140.4302,0:02:20,100.0%
2,mic,140.4314,0:02:20,100.0%
5,expected,140.4314,0:02:20,100.0%


In [18]:
streams = ['et', 'ps', 'mic', 'cam', 'eeg']

# get duration of entire recording for each stream
def whole_durations():
    df = pd.DataFrame(columns = ['stream', 'duration', 'mm:ss'])
  
    # populate df
    for i, stream in enumerate(streams):  
        duration = globals()[stream+'_df']['lsl_time_stamp'].iloc[-1]- globals()[stream+'_df']['lsl_time_stamp'].iloc[0]
        duration = round(duration, 4)
        # convert to mm:ss
        whole_dt = datetime.timedelta(seconds=duration)
        whole_dt_dur = str(datetime.timedelta(seconds=round(whole_dt.total_seconds())))
        df.loc[i] = [stream, duration, whole_dt_dur]
    
    df.sort_values(by = 'duration', inplace = True)

    # percent
    max_dur = df.duration.max()
    # df['percent'] = '{}%'.format(round((df.duration)/max_dur * 100, 2))
    df['percent'] = round(df['duration']/max_dur*100, 2).astype(str) + '%'
    # df['percent'] = df['duration'].map(lambda x: '{:.2%}'.format(x/max_dur))

    # print which are short
    for i in df.iterrows():
        if i[1]['duration'] == 0:
            continue
        if i[1]['duration'] < (max_dur - 30): # 30 second margin
            print(i[1]['stream'] + ' is shorter than expected by ' + str(round(max_dur - i[1]['duration'], 2)) + ' seconds')
    
        
    df.sort_values(by = 'duration', inplace = True)
    return(df)


whole_durations()

mic is shorter than expected by 386.9 seconds
cam is shorter than expected by 43.47 seconds
et is shorter than expected by 43.01 seconds


Unnamed: 0,stream,duration,mm:ss,percent
2,mic,2343.2425,0:39:03,85.83%
3,cam,2686.6756,0:44:47,98.41%
0,et,2687.1333,0:44:47,98.42%
1,ps,2729.6173,0:45:30,99.98%
4,eeg,2730.1419,0:45:30,100.0%
