Analysis of eye tracking files from Spiro study.
Gaze dispersion metric based on Christoforou et al, 2015: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4428128/pdf/fpsyg-06-00579.pdf
The objective is as follows:
(1) compute a within-subject dispersion metric, based on short sections of the film (250 ms windows shifted by 50 ms, i.e. 80% overlap between consecutive windows)
(2) get sections with extreme dispersion score


In [1]:
# analyse film data, v3 Oct/Nov 2020

# read and prep Tobii .tsv files from CortEx study
import os
from datetime import datetime
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import nanmedian, nanmean, nanstd
from scipy.spatial import distance
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

Utils

In [137]:
from scipy.stats import norm

def prepare_segment(x_gaze,y_gaze,start_num, stop_num):
    """
    Cut a segment out of the gaze data and zero out off-screen samples.

    Parameters
    ----------
    x_gaze, y_gaze: pandas objects holding gaze positions in x and y
        (must support slicing, ``.where`` and ``.mask``)
    start_num, stop_num: slice bounds, applied as ``[start_num:stop_num]``

    Returns
    -------
    Tuple (x segment, y segment). A sample is kept only if 0 < x < 1280 and
    0 < y < 1024; otherwise it is set to 0 in BOTH outputs. NaNs fail these
    comparisons and therefore also end up as 0.
    """
    def _zero_outside(values, upper):
        # keep values strictly inside (0, upper), replace everything else by 0
        return values.where((values > 0) & (values < upper), 0)

    clean_x = _zero_outside(x_gaze[start_num:stop_num], 1280)
    clean_y = _zero_outside(y_gaze[start_num:stop_num], 1024)

    # validity masks: 0 where the coordinate was zeroed above, 1 elsewhere
    valid_x = clean_x.mask(clean_x != 0, 1)
    valid_y = clean_y.mask(clean_y != 0, 1)

    # a sample only survives if both of its coordinates are valid
    return clean_x * valid_y, clean_y * valid_x

def score_scenes(x_scene, y_scene):
    """
    Score every row of the scene by the pairwise distances between its points.

    For row k, the values of ``x_scene.iloc[k, :]`` and ``y_scene.iloc[k, :]``
    are paired up into (x, y) points (one point per column) and all pairwise
    Euclidean distances between these points are computed.

    Parameters
    ----------
    x_scene, y_scene: pandas dataframes
        x and y positions; presumably one row per sample/frame and one column
        per participant (TODO confirm against caller)

    Returns
    -------
    comp_ps: list
        each pairwise distance of the LAST row divided by the number of
        points (NaN distances count as 0); empty list if there are no rows
    scene_scores: list
        one score per row: sum of that row's ``comp_ps`` divided by the
        number of points
    """
    scene_scores = []
    # initialised up front so that empty input returns ([], []) instead of
    # failing on an unbound name at the return statement
    comp_ps = []
    for k in range(len(x_scene)):
        # one (x, y) point per column of row k
        concat_ps = pd.concat([x_scene.iloc[k, :], y_scene.iloc[k, :]], axis=1)
        n_points = len(concat_ps)
        dist_ps = distance.pdist(concat_ps, metric='euclidean')
        # nansum of a scalar turns a NaN distance (missing point) into 0
        comp_ps = [np.nansum(dist) / n_points for dist in dist_ps]
        scene_scores.append(sum(comp_ps) / n_points)
    return comp_ps, scene_scores
# get participant numbers

def get_pnums(infiles):
    """
    Get participant numbers from filenames.

    A filename is split on '_' and every piece that consists of digits only
    is taken as a participant number.

    Parameters
    ----------
    infiles: array of input file names

    Returns
    -------
    flat list of participant numbers (int), in the order of the input files;
    a filename without an all-digit piece contributes nothing
    """
    pnums = []
    for filename in infiles:
        pnums.extend(int(s) for s in filename.split('_') if s.isdigit())
    return pnums

def flatten(list_name):
    """
    Flatten a list of lists by one level.
    Source: https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists

    Parameters
    ----------
    list_name: iterable of iterables
        The nested list to flatten (the list itself, not its name)

    Returns
    -------
    New list holding the items of every sublist, in order
    """
    flattened = []
    for sublist in list_name:
        flattened.extend(sublist)
    return flattened



def get_random_samps(df, frame_num_max, id):
    """
    Simulate gaze data for one participant.

    A normal distribution is fitted separately to the participant's
    'gaze_in_x' and 'gaze_in_y' values; random samples are then drawn from
    each fitted distribution.

    Parameters
    ----------
    df: pandas dataframe
        eye tracking data with columns 'id', 'gaze_in_x' and 'gaze_in_y'

    frame_num_max: int
        number of samples to draw per axis

    id: int
        participant id; only rows with df.id == id are used for the fit

    Returns
    -------
    Tuple (simulated x, simulated y), each of size frame_num_max.
    """
    participant_rows = df.id == id

    # (mean, std) of the fitted normal distribution, first for x, then for y
    fitted = [
        norm.fit(df.loc[participant_rows, column])
        for column in ('gaze_in_x', 'gaze_in_y')
    ]

    # draw x before y so the random stream is consumed in a fixed order
    sim_in_x, sim_in_y = (
        norm.rvs(loc = mu, scale = sigma, size = frame_num_max)
        for mu, sigma in fitted
    )
    return sim_in_x, sim_in_y




In [2]:
class Tobii_file:
    def __init__(self, filename, film_dur_s, screenres_x = None, screenres_y = None, s_rate= None):
        """
        Hold the recording parameters that belong to one Tobii file.

        Parameters
        ----------
        filename: str
            Tobii file name; every '_'-separated piece that consists of
            digits only is stored in self.pnum
        film_dur_s:
            film duration (presumably in seconds, going by the name); stored as given
        screenres_x, screenres_y: int, optional
            screen resolution; default to 1280 and 1024 when not given
        s_rate: int, optional
            sampling rate; defaults to 60 when not given
        """
        # Resolve the defaults first so that every attribute below is set
        # consistently, whether or not the caller passed a value.
        # (Falsy values such as 0 fall back to the default.)
        screenres_x = screenres_x or 1280
        screenres_y = screenres_y or 1024
        s_rate = s_rate or 60

        self.filename = filename
        self.screen_resx = screenres_x
        self.screen_resy = screenres_y
        self.screen_res = [screenres_x, screenres_y]
        self.s_rate = s_rate
        self.film_dur_s = film_dur_s
        # list of all-digit filename pieces, e.g. '12_merged.tsv' -> [12]
        self.pnum = [int(s) for s in filename.split('_') if s.isdigit()]

    def get_calibration(self, calibration_filename, calibration_dir):
        """
        Get calibration details from the calibration file.

        Parameters
        ----------
        calibration_filename: str
            Calibration filename
        calibration_dir: str
            path to calibration file directory

        Returns
        -------
        None. Sets self.calibration_filename, and sets self.calibration to
        the rows whose 'used' column equals 'used', or to the string
        'unused' if there are no such rows.
        """
        self.calibration_filename = calibration_filename
        # NOTE(review): read_csv uses ',' as delimiter by default; if the
        # calibration files are tab-separated, sep='\t' is needed - confirm.
        calibration_df = pd.read_csv(os.path.join(calibration_dir,calibration_filename))
        used_rows = calibration_df.loc[calibration_df['used'] == 'used',:]
        self.calibration = 'unused' if used_rows.empty else used_rows



(i) get time stamps from message file
- discard everything but the messages containing frame nr.
(ii) load data file and add column for frame nr
(iii) label rows between message (framenr) time stamps with the appropriate frame nr

In [13]:
# Location of the raw Tobii exports.
rawfilepath = r"P:\Spironolactone\eye_tracking\Tobii"


def _files_containing(marker, suffix = ''):
    """Names in rawfilepath that contain `marker` and end with `suffix`."""
    return [name for name in os.listdir(rawfilepath) if marker in name and name.endswith(suffix)]


# merged Tobii files - these have both event info and eye gaze data
Tobii_files = _files_containing('merged')
# calibration files
calibration_files = _files_containing('calib', '.tsv')
# message files
msg_files = _files_containing('msg', '.tsv')

In [62]:
# For every merged Tobii file: keep the rows between the first and the last
# 'FRAME' message, label each row with its frame number, keep only rows where
# both eyes have validity == 1, and average the remaining rows per frame.
# The per-file results are collected into the four lists below.
frames = []
gaze_in_x = []
gaze_in_y = []
pnums = []

for filename in Tobii_files:
    # read file
    tobii_file = pd.read_table(os.path.join(rawfilepath, filename))
    # get participant number (all-digit piece of the file name) and add as column
    tobii_file['pnum'] = np.repeat([int(s) for s in filename.split('_') if s.isdigit()], tobii_file.shape[0])
    # row index of each frame presentation message
    event_index = tobii_file.loc[tobii_file.msg.str.contains('FRAME',na = False),'msg'].index
    # drop everything before the first frame message and after the last one;
    # copy so the assignments below do not write into a slice of the original
    tobii_file = tobii_file.loc[event_index[0]:event_index[-1],:].copy()
    # extract frame number: keep the text before the first ';' and join its digits
    tobii_file.loc[event_index, 'msg'] = [f[0] for f in tobii_file.loc[event_index,'msg'].str.split(';', n = 1)]
    tobii_file['frame_num'] = tobii_file.loc[event_index,'msg'].apply(lambda x: re.findall(r'\d',x)).apply(''.join).astype('int')
    # fill the rows between event markers with the appropriate frame number
    # (.ffill() replaces the deprecated fillna(method='ffill'))
    tobii_file['frame_num'] = tobii_file['frame_num'].ffill()
    # drop event markers
    tobii_file = tobii_file.drop(labels = event_index,axis = 0)
    # discard samples that are not valid for both eyes
    both_valid = (tobii_file.right_gaze_point_validity == 1)&(tobii_file.left_gaze_point_validity == 1)
    tobii_file = tobii_file.loc[both_valid,:]
    # group by frame number and calculate mean; numeric_only=True skips the
    # text column 'msg', which makes mean() raise a TypeError in pandas >= 2.0
    tobii_file = tobii_file.groupby('frame_num').mean(numeric_only = True).reset_index()
    # collect data into lists
    pnums.append(tobii_file.pnum.values)
    # NOTE(review): x is taken from the RIGHT eye and y from the LEFT eye -
    # confirm this mix is intended (rather than one eye or the mean of both).
    gaze_in_x.append(tobii_file.right_gaze_point_on_display_area_x.values)
    gaze_in_y.append(tobii_file.left_gaze_point_on_display_area_y.values)
    frames.append(tobii_file.frame_num.values)
    

We create one big dataframe from the above.
This is already ~half a million rows for only 24 participants (with at most one row per frame). This will become unwieldy for a significantly larger number of participants; we may need to think about restructuring the data, parallel computing, or cloud resources.

In [78]:
# create dataframe
et_df = pd.DataFrame({'id':flatten(pnums),'gaze_in_x':flatten(gaze_in_x),'gaze_in_y':flatten(gaze_in_y),'frame_num':flatten(frames)})

We now have valid data for left/right gaze positions (on display area) for all participants in a single data frame. We now want to get a measure of divergence. We need to bear in mind a couple of things.
First of all, calibration accuracy/precision will be different between participants. The first thing to check is to make sure that the precision is roughly the same. If not, this could really affect the divergence measure in a fairly unpredictable way, so we would need to exclude those subjects. Next, we want to have a look at accuracy. This is less of an issue because it is basically just an offset. We could try to correct gaze point data for each participant, but this will not be needed as divergent frames are identified based on comparing to a 'random' sequence of gaze data (which we will simulate below). Unless the offset is extreme, this should therefore not affect the results.
Because different participants are missing different frames and because we don't expect huge variation between individual frames, given a frame rate of ~30 fps, we will look at 250 ms windows, and we will use a sliding window approach, shifting the window 50 ms each time (resulting in 80% overlap between windows).
For each window and participant, we will calculate two Euclidean distances: (a) between the participant in question and every other participant, and (b) between the participant in question and the random sequence. For any given window, we will then check whether distance (a) is greater than distance (b). If so, the window is identified as 'divergent' for that participant.

First, let's simulate x/y data by sampling from a normal distribution, based on each participant's gaze data.

In [138]:
%%timeit
# add empty columns that will receive the simulated gaze positions
for sim_column in ('gaze_in_x_sim', 'gaze_in_y_sim'):
    et_df[sim_column] = np.nan

# per participant: draw as many simulated x/y samples as the participant has
# rows and write them into that participant's rows
for pnum in et_df.id.unique():
    is_pnum = et_df.id == pnum
    pnum_df = et_df.loc[is_pnum, ['id','gaze_in_x','gaze_in_y']]
    simx, simy = get_random_samps(pnum_df, pnum_df.shape[0], pnum)
    et_df.loc[is_pnum, 'gaze_in_x_sim'] = simx
    et_df.loc[is_pnum, 'gaze_in_y_sim'] = simy

268 ms ± 2.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# if we want to recreate 250 ms windows with a 50 ms shift, then we need to look at 15 samples at a time, and shift by 3
# get max number of windows
#stop_ind = et_df.frame_num.max()-15

#for i in(np.arange(0,stop_ind,3)):
    

# for each window and participant, first figure out the percentage of missing data in a given window.
# if >10%, set NaN.

In [174]:
from scipy.spatial import distance

# Reshape the long dataframe into wide tables: one row per frame number, one
# column per participant id. One table each for the measured x/y gaze
# positions and for the simulated x/y gaze positions.
gaze_x, gaze_y, gaze_x_sim, gaze_y_sim = (
    et_df.pivot(index = 'frame_num', columns = 'id', values = value_column)
    for value_column in ('gaze_in_x', 'gaze_in_y', 'gaze_in_x_sim', 'gaze_in_y_sim')
)

# x/y gaze positions of one participant at a time, indexed by frame number
# (frame_act is overwritten on every iteration)
for pnum in et_df.id.unique():
    frame_act = pd.DataFrame({'x': gaze_x[pnum], 'y': gaze_y[pnum]})
#et_df.loc[:,['frame_num','gaze_in_x','gaze_in_y']].groupby('frame_num').rolling(window = 15, min_periods = 3).apply(distance.euclidean)

In [175]:
gaze_y

id,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,20,21,23,24,25,26
frame_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.470329,0.518831,0.501687,,0.491041,0.472667,0.509080,0.709627,0.682568,,...,0.972367,0.511570,0.534296,,0.532877,,0.515072,0.541451,0.528649,
1.0,0.564896,0.527396,0.499992,,0.485849,0.469331,0.510573,0.718187,0.682286,,...,0.956490,0.541580,0.546304,,0.540626,,0.519624,0.542194,0.565981,
2.0,0.478805,0.523807,0.504027,,0.489227,0.470017,0.511068,0.701729,0.683303,,...,0.925161,0.556362,0.535093,,0.537728,,0.520258,0.536387,0.593265,
3.0,0.447756,0.521365,0.504930,0.533887,0.485558,0.470927,0.512056,,0.678651,,...,,0.549584,0.544182,,0.535042,,0.518806,0.540277,0.594708,
4.0,0.482590,0.518729,0.501289,0.520385,0.487901,0.468066,0.510920,0.614341,0.681754,,...,,0.546567,0.537078,,0.532966,,0.523836,0.539729,0.567019,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24744.0,0.485601,0.538325,0.530856,0.452097,0.424921,0.440020,0.527678,0.695945,0.604979,0.534278,...,0.507252,0.468647,0.490312,0.496327,0.482349,0.496616,0.499997,0.524720,0.322521,
24745.0,0.485444,0.534574,0.529286,0.450701,0.440218,0.444028,0.521384,0.695466,0.604251,0.526230,...,0.501755,0.474360,0.486934,0.493653,0.481731,0.495913,0.499571,0.514646,0.328214,
24746.0,0.496992,0.537616,0.528466,0.453763,0.430738,0.444538,0.529510,0.697995,0.601344,0.531610,...,0.493240,0.476332,0.487245,0.502314,0.481616,0.500057,0.502699,0.516043,0.325097,
24747.0,0.495039,0.537064,0.529382,0.456095,0.438591,0.442055,0.518159,0.700697,0.600067,0.519275,...,0.505096,0.471270,0.472297,0.509348,0.484370,0.499030,0.500089,0.515221,0.327979,
