In [1]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn.cluster import DBSCAN

In [291]:
def flatten(list_name):
    """ 
    Function that flattens list of lists.

    Parameters
    ----------
    list_name: str
        Name of the list to flatten
    
    Returns
    -------
    Flattened list
    """
    flattened = [item for sublist in list_name for item in sublist]
    return flattened

# some data points are invalid, ie they are <-1 or >1. This is usually because the Tobii has lost track of one or both eyes and the data points around the edges are odd.
# We will just set these values to the max value that is considered on screen (1/-1).
def set_to_min_max(gaze_df, gaze_dat_col, upper_bound, lower_bound):
    """
    Set eye tracking data to min/max value
    where gaze point data have been recorded
    as being off the screen (>1, <-1)

    Parameters
    ----------
    gaze_df:    pd DataFrame
        dataframe containing gaze point data
    gaze_dat_col:   list[str]
        name(s) of columns to be checked
        eg [gaze_in_x, gaze_in_y]
    upper_bound:    int
        value indicating max on screen coordinate
    lower_bound:    int
        value indicating min on screen coordinate
    
    Returns
    -------
    gaze_df with values capped at upper_bound/lower_bound
    """
    for col in gaze_dat_col:
        et_df.loc[:,col] = et_df[col].where(et_df[col].between(lower_bound,upper_bound),np.nan)
        num_inval_rows_after = et_df.shape[0]-et_df[et_df[col].between(lower_bound,upper_bound)].shape[0]
    return gaze_df

def remove_duplicates_patch(in_list):
    """
    patch to remove duplicates from
    list of lists.
    Uses set.

    Parameters
    ----------
    in_list:    list[list]
        list of lists with duplicate lists
    
    Returns
    -------
    list of lists w/o duplicates.
    """
    new_list = [
                list(mytuple)
                for mytuple in set(tuple(val) 
                for val in in_list)
                ]
    return new_list

class ODetect:
    """Trying out different ways of 
        detecting outliers in the gaze data"""

    def __init__(self, method_name,gaze_col_names):
        self.method_name = method_name
        self.gaze_cols = gaze_col_names
    
    def use_lof(self,nneigh, contam, X):
        """
        Outlier detection using LOF.
        
        Parameters
        ----------
        nneigh: int
            number of nearest neighbours
        contam: float or "auto"
            contamination
        X:  pd DataFrame
            input dataframe containing
            gaze data in x and y
        """
        clf = LocalOutlierFactor(n_neighbors = nneigh, contamination = contam)
        self.y_pred_lof = clf.fit_predict(X.loc[:,self.gaze_cols])
        self.X_scores_lof = clf.negative_outlier_factor_
        
    
    def use_dbscan(self,epsilon,min_points,X):
        """
        Use dbscan for outlier detection.

        Parameters
        ----------
        epsilon,min_points:    float,int
            dbscan parameters controlling
            density and minimum number of points
            in cluster
        X:  pd DataFrame
            input dataframe containing
            gaze data in x and y
        """
        dbscan = DBSCAN(eps = epsilon,min_samples = min_points).fit(X.loc[:,self.gaze_cols])
        self.y_pred_dbscan = dbscan.labels_

Read in the tobii files for all available participants. Remember to specify the correct file path and adjust the substrings filtered for in the list comprehensions, if needed.
As new participants will be added over time, one thing that's worth thinking about is to just run this script on the new participants. Since the files are pretty large, you my not want to reprocess data from participants whose data have already been processed to get the new features we wanted to use in the analysis later on.

In [None]:
home = 0
if home:
    rawfilepath = r"C:\Users\Luzia T\Eye-gaze-divergence"
    Tobii_files = [f for f in os.listdir(rawfilepath) if 'merged' in f]
    calibration_files = [f for f in os.listdir(rawfilepath) if 'calib' in f]
else:
    # first we need to establish where to find the files and read them in.
    rawfilepath = r"P:\Spironolactone\eye_tracking\Tobii"
    intrusions_dir = r"P:\Eye-gaze-divergence"
    # get merged Tobii files - these have both event info and eye gaze data.
    Tobii_files = [f for f in os.listdir(rawfilepath) if 'merged' in f]
    # get_calibration files
    calibration_files = [f for f in os.listdir(rawfilepath) if 'calib' in f and f.endswith('.tsv')]
    msg_files = [f for f in os.listdir(rawfilepath) if 'msg' in f and f.endswith('.tsv')]

In [23]:
# get pnum, left/right gaze point on display area, event messages 
# keep only columns between framenum 0 and the last frame
# set invalid samples to NaN (validity == 0)
frames = []
gaze_in_x = []
gaze_in_y = []
pnums = []
# threshold for missing data. If more than missing_thresh gaze point data missing,
# do not use record for this participant.
missing_thresh = 0.3

print("Now processing tobii files....")
for i,filename in enumerate(Tobii_files):
    # read file
    tobii_file = pd.read_table(os.path.join(rawfilepath, filename))
    # get participant number and add as column
    tobii_file['pnum'] = np.repeat([int(s) for s in filename.split('_') if s.isdigit()], tobii_file.shape[0])
    # get row index of frame each frame presentation
    event_index = tobii_file.loc[tobii_file.msg.str.contains('FRAME',na = False),'msg'].index
    # drop everything before first frame index (ie frame 0) and after last frame
    tobii_file = tobii_file.loc[event_index[0]:event_index[-1],:]
    # extract frame number from event time stamps
    tobii_file.loc[event_index, 'msg'] = [f[0] for f in tobii_file.loc[event_index,'msg'].str.split(';', n = 1)]
    tobii_file['frame_num'] = tobii_file.loc[event_index,'msg'].apply(lambda x: re.findall(r'\d',x)).apply(''.join).astype('int')
    # fill the rows between event markers with the appropriate frame number
    tobii_file.loc[:,'frame_num'] = tobii_file.loc[:,'frame_num'].fillna(method = 'ffill')
    # drop event markers
    tobii_file = tobii_file.drop(labels = event_index,axis = 0)
    tobii_file = tobii_file.groupby('frame_num').mean()
    tobii_file = tobii_file.reset_index()
    # check if participant has excessive missing data
    if tobii_file["left_gaze_point_on_display_area_x"].isna().sum()/tobii_file.shape[0]>missing_thresh:
        print(f"Participant {tobii_file.pnum.unique()} has >30% missing data. Omitting.")
    # collect data into lists
    pnums.append(tobii_file.pnum.values)
    gaze_in_x.append(tobii_file.left_gaze_point_on_display_area_x.values)
    gaze_in_y.append(tobii_file.left_gaze_point_on_display_area_y.values)
    frames.append(tobii_file.frame_num.values)
    print(f"{round(((i+1)/len(Tobii_files))*100,2)}% done.", end = "\r")
    
# create dataframe
et_df = pd.DataFrame({'id':flatten(pnums),'gaze_in_x':flatten(gaze_in_x),'gaze_in_y':flatten(gaze_in_y),'frame_num':flatten(frames)})

Now processing tobii files....
Participant [13] has >30% missing data. Omitting.
Participant [18] has >30% missing data. Omitting.
Participant [26] has >30% missing data. Omitting.
100.0% done.

Here, I didn't restrict the rows to only those with a "1" validity rating. The reason for this was that I wanted to retain all frames for all participants, as I would have had to manually insert nans later on due to the approach I chose. You can of course go back to filtering out invalid rows, but here I included them. When doing this, one thing that's important to remember is to set both x and y coordinates to nan, even if only x or y is nan. This sometimes happens when the eye tracker is just about to lose track of the eyes/when it is just about to find them again, but the data are typically not trustworthy and it's best to exclude.

In [24]:
# if gaze in x is nan, set gaze in y nan & vice versa.
et_df.loc[(et_df.gaze_in_x.isnull())|(et_df.gaze_in_y.isnull()),["gaze_in_x","gaze_in_y"]] = np.nan
# calculate % missing data (this will be for all participants and frames, *not* per participant)
perc_overall_missing = et_df[et_df.gaze_in_x.isnull()].shape[0]/et_df.shape[0]
print(f"Across frames and participants, {round(perc_overall_missing*100,2)}% gaze point data are missing.")

Across frames and participants, 17.16% gaze point data are missing.


Another thing that happens is that data may exceed the maximum or minimum screen value (1,-1). This also typically happens when the eye tracker has lost one or both eyes. For these values, we will set them to the min/max. Later on, I will also set nan vals to -1. I did this because I thought in this case it was the best approach to deal with missing values. For shorter sections (that likely correspond to blinks), some form of imputation may be appropriate. However, overall we want to know when people likely weren't looking at the screen, or at least not where most other people were looking at that time. As -1,-1 would be an extreme score unlikely to be classed as not outlying, this should work pretty well in detecting off-screen gaze/closed eyes. A note on blinking: Since everyone blinks (albeit at different times) and we are later going to use the number of 'outlying' gaze points across frames of the video, blinks are unlikely to sway the results of a classifier or regression model. 

In [25]:
# remove data points that are <-1 or >1 (this would correspond to gaze data that is off screen)
# usually this happens shortly before/after the eye tracker loses track of the eyes
et_df = set_to_min_max(et_df, ["gaze_in_x","gaze_in_y"],1,-1)

Let's calculate the median gaze path across the individual video frames. Assuming that the vast majority of participants will have looked at roughly the same areas of the screen throughout, this will help us in getting an idea of the 'typical' gaze path within our sample. Note that some changes in gaze position are expected as the action shifts from one place to another.

In [8]:
# calculate median per frame for x and y gaze positions.
median_et_df = et_df.loc[:,["frame_num","gaze_in_x","gaze_in_y"]].groupby("frame_num").median().reset_index()
# get the rolling average - this just smoothes things out a bit.
median_et_df["rolling_x"]= median_et_df.loc[:,["gaze_in_x"]].rolling(90).mean()
median_et_df["rolling_y"] = median_et_df.loc[:,["gaze_in_y"]].rolling(90).mean()

In [9]:
# plot
fig = px.line(median_et_df, x = "frame_num", y = ["gaze_in_x","gaze_in_y","rolling_x","rolling_y"], template = "plotly_dark")
fig.show()

Now we'll replace nans with -1s. See above for reasoning.

In [33]:
et_df = et_df.replace(np.nan, -1)
# check that no nans remain
print(f"Number of NaNs in each column after processing:\n{et_df.isna().sum()}")

Number of NaNs in each column:
id           0
gaze_in_x    0
gaze_in_y    0
frame_num    0
dtype: int64


We are now going to identify outlying gaze data for each frame. Here, I'm using local outlier factor to do this. I have also tried DBSCAN (a clustering method), but had concerns that epsilon would need tuning, but was unsure just how much and how often so would have had to write separate functions to do that, adding complexity.
I played around with it a little bit and LOF seemed to work "better" overall.
As an output, we get the negative outlier factor and we also get labels (-1 for outlier). This is what we're going to be using later on to create our feature.

In [396]:
ofscores = []
pnums = []
ypreds = []
frame_nums = []
#od = ODetect("dbscan", ["gaze_in_x","gaze_in_y"])
od = ODetect("lof", ["gaze_in_x","gaze_in_y"])

for frame in et_df.frame_num[500:].unique():
    X = et_df.loc[et_df.frame_num == frame,:]
    od.use_lof(15,"auto",X)
    #y_pred = od.y_pred_dbscan#dbscan.labels_
    y_pred = od.y_pred_lof
    X_scores = od.X_scores_lof
    frames = np.tile(frame, len(y_pred))
    frame_nums.append(frames)
    ypreds.append(y_pred)
    ofscores.append(X_scores)
    pnums.append(X.id)

Put everything into a dataframe.

In [401]:
my_dict = {
            "frame_num": [vals for sublist in frame_nums for vals in sublist], # can use np.ravel here as well
            "id":[vals for sublist in pnums for vals in sublist],
            #"ofscores":[vals for sublist in ofscores for vals in sublist],
            "labels":[vals for sublist in ypreds for vals in sublist]
            }
scores_labels_df = pd.DataFrame(data = my_dict)

We now want to use the information about outlier status in our remaining analysis. One question we might ask is whether the number of frames during which a participant's gaze data was labelled as an outlier is linked to the number of/vividness and/or distress associated with intrusions. As such, we can generate a new feature that gives us the number of 'outlier' labels for each participant across all frames of the video.

In [402]:
# get the number of outlier labels for each participant
num_of_labels_df = scores_labels_df[
                                    scores_labels_df.labels == -1
                                    ].groupby("id")["labels"].sum()
# take the absolute value
num_of_labels_df = num_of_labels_df.abs()
print(num_of_labels_df.describe())

count       24.000000
mean      5255.375000
std       6535.609587
min        490.000000
25%       1210.500000
50%       2688.500000
75%       4979.250000
max      22396.000000
Name: labels, dtype: float64


We can try visualizing the results for individual frames. Feel free to choose a few different frames and have a look at the points identified as outliers.

In [405]:
x2 = scores_labels_df[scores_labels_df.frame_num == 10394]
x2 = x2.join(et_df.loc[et_df.frame_num == 10394, ["id","gaze_in_x","gaze_in_y"]].set_index("id"),on = "id")
print(x2.head())

        frame_num  id  labels  gaze_in_x  gaze_in_y
237456    10394.0  10       1   0.604681   0.591334
237457    10394.0  11       1   0.475530   0.567799
237458    10394.0  12       1   0.530954   0.570619
237459    10394.0  13      -1  -1.000000  -1.000000
237460    10394.0  14       1   0.576889   0.554961


In [406]:
fig = px.scatter(x2, x = "gaze_in_x", y = "gaze_in_y", color = "labels" ,template = "plotly_dark")
fig.show()