### Read in data

The purpose of this notebook is to read in the raw data files produced by the VR/eye-tracking recording.
A second helper file per subject lists the videos, which gives us the order in which the videos were shown.

#### load relevant modules

In [1]:
import os, glob, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from scipy.stats import zscore

import numpy as np
from scipy import interpolate

def fill_nan(A):
    '''
    Fill the non-finite entries (NaN, +/-inf) of a 1-D array by linear
    interpolation between the neighbouring finite samples.

    Parameters
    ----------
    A : np.ndarray
        1-D numeric array; it is not modified.

    Returns
    -------
    np.ndarray
        Copy of ``A`` in which every non-finite entry that lies between two
        finite samples is replaced by the interpolated value. Entries before
        the first or after the last finite sample stay NaN, because the
        interpolator is built with ``bounds_error=False`` and therefore
        returns NaN outside the range of the finite samples.
    '''
    positions = np.arange(A.shape[0])
    is_valid = np.isfinite(A)
    # Build the interpolator from the finite samples only.
    interpolator = interpolate.interp1d(positions[is_valid], A[is_valid], bounds_error=False)
    # Keep finite samples untouched; substitute interpolated values elsewhere.
    return np.where(is_valid, A, interpolator(positions))


In [4]:
# Record the Python interpreter version used for this run.
import sys
sys.version


'3.10.12 (main, Jul  5 2023, 15:02:25) [Clang 14.0.6 ]'

In [5]:
# Show which Python executable (and hence which environment) the kernel runs in.
sys.executable

'/Users/schmaelz/miniconda3/envs/vr_pupil_study_env/bin/python'

#### setup vars and subjs

In [6]:
# Nominal onset time of each video, in seconds of the eye-tracking clock:
# 30 values spaced 39 s apart (39, 78, ..., 1170).
video_onset_times = (np.arange(39,1200,39))
print(video_onset_times)

# Every subject has its own folder under 00_raw_data; the folder name
# (e.g. 'sub001') doubles as the subject ID.
folders = sorted(glob.glob("../data/00_raw_data/sub*"))

# Take the ID from the final path component instead of slicing a fixed number
# of characters, and skip stray files that merely match the 'sub*' pattern.
subjs = [os.path.basename(os.path.normpath(f)) for f in folders if os.path.isdir(f)]

print(len(subjs))
subjs[:3]

[  39   78  117  156  195  234  273  312  351  390  429  468  507  546
  585  624  663  702  741  780  819  858  897  936  975 1014 1053 1092
 1131 1170]
59


['sub001', 'sub002', 'sub003']

#### read in files

We loop over subjects, read in each subject's eye-tracking file, split it into one segment per video based on the video onset times, assign the video names, and save everything to a new folder "01_parsed_video_data". That folder contains one sub-folder per subject, holding that subject's individual video files, each containing, in particular, a time code and a pupil-dilation value.

In [7]:
# Length (in seconds) of the window cut out of the recording for each video;
# equal to the 39 s spacing between the entries of video_onset_times.
segment_length_s = 39

# Columns of the raw eye-tracking file that are not written to the parsed files.
unused_columns = ['seconds ', 'position x', 'position y', 'position z']

for curr_sub in range(len(subjs)):

    sub_id = subjs[curr_sub]
    print(sub_id)

    # assemble filepaths for the subject
    sub_dir = '../data/00_raw_data/' + sub_id + '/'
    curr_sub_eye_file = sub_dir + sub_id + '_tracking_data_trial_1.txt'
    curr_sub_video_file = sub_dir + sub_id + '_videolist.csv'

    # Read the tab-separated eye-tracking log. 'Unnamed: 9' is dropped if present
    # (presumably an empty column created by a trailing tab -- TODO confirm).
    eye_df = pd.read_csv(curr_sub_eye_file, sep='\t')
    eye_df = eye_df.drop(columns=['Unnamed: 9'], errors='ignore')

    # The video list gives the order in which this subject saw the videos.
    video_file_names = pd.read_csv(curr_sub_video_file)

    # One output folder per subject; create it once, before writing any video.
    out_path = '../data/01_parsed_video_data/' + sub_id + '/'
    os.makedirs(out_path, exist_ok=True)

    for curr_video in range(len(video_onset_times)):

        curr_video_name = video_file_names['filename'].iloc[curr_video]
        # Output file is named after the video, without its file extension.
        current_video_name = os.path.splitext(curr_video_name)[0]
        out_file_name = out_path + current_video_name + '.csv'

        # Use the recorded timestamp closest to the nominal onset as the actual onset.
        nearest_idx = (eye_df['seconds '] - video_onset_times[curr_video]).abs().idxmin()
        curr_video_onset = eye_df.loc[nearest_idx, 'seconds ']
        curr_video_offset = curr_video_onset + segment_length_s

        # Explicit copy: the segment is modified below, and writing to a slice of
        # eye_df is unreliable (silently ignored under pandas Copy-on-Write).
        in_window = eye_df['seconds '].between(curr_video_onset, curr_video_offset)
        curr_video_df = eye_df.loc[in_window].copy()

        # Re-reference time so that every segment starts at 0 s.
        curr_video_df['seconds '] = curr_video_df['seconds '] - curr_video_df['seconds '].iloc[0]

        # A pupil diameter of exactly 0 is treated as missing data.
        curr_video_df['pupil diameter'] = curr_video_df['pupil diameter'].replace(0, np.nan)

        # Resampling needs a datetime-like index; duplicated timestamps are
        # collapsed to their first occurrence because resampling requires them unique.
        curr_video_df['time'] = pd.to_datetime(curr_video_df['seconds '], unit='s')
        curr_video_df = curr_video_df.set_index('time')
        curr_video_df = curr_video_df[~curr_video_df.index.duplicated(keep='first')]

        # Put the segment on a regular 50 ms grid, carrying the last sample forward.
        curr_video_df_resampled = curr_video_df.resample('50ms').ffill()

        # Express the new grid as seconds elapsed since the first grid point.
        elapsed = curr_video_df_resampled.index - curr_video_df_resampled.index[0]
        curr_video_df_resampled['time'] = np.asarray(elapsed.total_seconds())

        curr_video_df_resampled = (
            curr_video_df_resampled
            .drop(columns=unused_columns)
            .reset_index(drop=True)
        )

        # Move 'time' to the front by name rather than by hard-coded column positions.
        remaining_columns = [c for c in curr_video_df_resampled.columns if c != 'time']
        curr_video_df_resampled = curr_video_df_resampled[['time'] + remaining_columns]

        curr_video_df_resampled.to_csv(out_file_name, index=False)
        del curr_video_df_resampled, curr_video_df
    # Release the (large) raw recording before loading the next subject.
    del eye_df


sub001
sub002
sub003
sub004
sub005
sub006
sub007
sub008
sub009
sub010
sub011
sub012
sub013
sub014
sub015
sub016
sub017
sub018
sub019
sub020
sub021
sub022
sub023
sub024
sub025
sub026
sub027
sub028
sub029
sub030
sub031
sub032
sub033
sub034
sub035
sub037
sub038
sub039
sub040
sub041
sub042
sub043
sub044
sub045
sub046
sub047
sub048
sub049
sub050
sub051
sub052
sub053
sub054
sub055
sub056
sub057
sub058
sub059
sub060
