# Subject Data PreProcessing
* find subject data
* concatenate the inter-session data
* concatenate the intra-session data

# TO DO
* Adapt the cleaning cell to also consider empty data points and mark them as noData in the DataFrame
* Write a gaze-definition cell that counts consecutive hits on the same collider; a run of 21 consecutive hits is then defined as a gaze caused by an actual fixation.


## Configuration

In [1]:
# General configuration
import os

# data_directory: str
#     Directory in which data is stored.
data_directory = '.'

# install_missing_packages: bool
#     Whether packages that turn out to be missing should be installed
#     automatically.
install_missing_packages = True

# use_conda: bool
#     Whether conda (True) or pip (False) performs software installation.
#     Defaults to conda when the CONDA_EXE environment variable is set,
#     i.e. when jupyter is run in a conda environment.
use_conda = os.environ.get('CONDA_EXE') is not None

## Checking for missing packages

In [2]:
import importlib

def check_package(package, pip_pkg: str = None, conda_pkg: str = None):
    """Check if a given package is installed and install it if it is missing.

    Parameters
    ----------
    package: str
        Import name of the package to look for.
    pip_pkg: str, optional
        Name passed to pip when it differs from the import name.
    conda_pkg: str, optional
        Name passed to conda when it differs from the import name.

    Raises
    ------
    RuntimeError
        If the package is missing and the global flag
        `install_missing_packages` is False.

    Installation uses conda or pip depending on the global flag `use_conda`.
    """
    # `import importlib` alone does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly here.
    import importlib.util

    try:
        spec = importlib.util.find_spec(package)
    except ModuleNotFoundError:
        # Raised for dotted names whose parent package is not installed.
        spec = None

    if spec is not None:
        return  # ok, package is already installed

    if not install_missing_packages:
        raise RuntimeError(f"{package} is not installed!")

    if use_conda:
        import conda.cli
        conda.cli.main('conda', 'install',  '-y', conda_pkg or package)
    else:
        import subprocess
        import sys
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_pkg or package])

    # Make the freshly installed package visible to the running kernel.
    importlib.invalidate_caches()
        
# Raise this to leave a cell without an error traceback (purely cosmetic)
class StopExecution(Exception):
    """Exception whose traceback rendering hook produces nothing."""

    def _render_traceback_(self):
        # Deliberately render nothing.
        return None

## Creating the required environment (skip if already done)

Running the following cell will create a file `graphs.yml` that can be used to set up a conda environment containing the required packages. If you already downloaded the file from my GitHub, skip the next cell and create the environment directly from it.

In [3]:
%%writefile graphs.yml
# Conda environment for this notebook.
name: graphs
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.6
  - jupyter
  - imageio
  - imageio-ffmpeg
  - matplotlib
  - scikit-image
  # scikit-learn and scipy are imported by the notebook's import cell
  - scikit-learn
  - scipy
  - opencv
  - networkx
  # pd.json_normalize requires pandas 1.0 or newer
  - pandas>=1.0
  - statsmodels

Writing graphs.yml


### Environment Creation
To create the environment, open a terminal, go to the directory where you stored the `graphs.yml` file (the directory of the notebook) and run:

    conda env create -f graphs.yml

After running this command, activate the environment (Linux/macOS: `conda activate graphs`, Windows: `activate graphs`) and then reopen the notebook in that environment.

## Main Part

### Imports and directory information

In [2]:
import os
import json
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nX
import glob
import scipy.cluster.vq as clusters
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.preprocessing import normalize
from pandas.plotting import autocorrelation_plot as AC_plot 
#from statsmodels.graphics import tsaplots
#from statsmodels.tsa.stattools import acf
from skimage.filters import gaussian
from mpl_toolkits.mplot3d import Axes3D 
import time


# Base directory (not referenced by the cells visible in this notebook)
OG_DATA_PATH = './'
# Directory whose entries are listed to find the subject files and from which
# the raw JSON files are opened
DATA_PATH = './Data Single/'

# Directory the combined experiment JSON and the hitpoint CSV files are written to
PROCESSED_DATA_PATH = './Results/'


### Extracting all subject IDs from the data folder

In [3]:
# Visible (non-hidden) entries of the data folder, ordered case-insensitively
DATA_FOLDER = sorted(
    (entry for entry in os.listdir(DATA_PATH) if not entry.startswith('.')),
    key=str.lower,
)

# Entries that begin with a digit contribute their first four characters
# as an integer subject ID; every other entry is ignored.
subIDs = []
for sub in DATA_FOLDER:
    if not sub[0].isdigit():
        continue
    subIDs.append(int(sub[:4]))

# Reduce to the sorted set of distinct IDs
subIDs = np.unique(subIDs)
print(subIDs)

[1023]


# Combining Single Pathfinding Experiment Files per Subject

In [11]:
# Attributes that have to be identical in all experiment files of one subject
EXPT_FILE_ATTRS = ['ParticipantID', 'PathsReversed', 'IsDyadic', 'IsLeader']
# Number of paths a complete single-pathfinding experiment has to contain
EXPECTED_PATH_COUNT = 10
subIDs = [1023] # remove to do for all subIDs


def map_dict(a_dictionary, attrs=None):
    """Return the entries of `a_dictionary` whose keys are listed in `attrs`.

    Parameters
    ----------
    a_dictionary: dict
        Dictionary to filter.
    attrs: iterable of str, optional
        Keys to keep. Defaults to `EXPT_FILE_ATTRS`.

    Returns
    -------
    dict
        A new dictionary holding only the selected entries.
    """
    # NOTE(review): the original body filtered by `EYE_FILE_ATTRS`, a name that
    # is not defined anywhere visible in this notebook, so every call raised a
    # NameError. Confirm that `EXPT_FILE_ATTRS` is the intended default.
    if attrs is None:
        attrs = EXPT_FILE_ATTRS
    return {key: value for key, value in a_dictionary.items() if key in attrs}


# For each subject in subject ID folder, combine all experiment files
# and save as a unified final experiment file in Result folder
for sub in subIDs:
    # Get all files for subject. The ID has to appear as a number of its own:
    # a plain substring test would also match IDs that happen to occur inside
    # a longer digit run such as the timestamp in the file name.
    sub_pattern = re.compile(r'(?<!\d)' + re.escape(str(sub)) + r'(?!\d)')
    SUB_FILES = [x for x in DATA_FOLDER if sub_pattern.search(x)]

    # Sort files into experiment files and eyetracking files
    EYETRACKING_FILES = sorted(x for x in SUB_FILES if 'EyeTracking' in x)
    EXPT_FILES = sorted(x for x in SUB_FILES if 'SinglePathFinding' in x)

    EXPT_DATA = {}

    # For each experiment file
    for expt_file in EXPT_FILES:

        # Read JSON data; files that cannot be parsed are reported and skipped
        with open(DATA_PATH + expt_file) as f:
            try:
                subject_session = json.load(f)
            except ValueError:  # includes json.JSONDecodeError
                print("\tJSON file " + expt_file + " is not valid!")
                continue

        # If there are more than one experiment files, combine the data of the paths if the
        # other trial and subject information matches
        if len(EXPT_DATA) > 0:
            if not all(subject_session[x] == EXPT_DATA[x] for x in EXPT_FILE_ATTRS):
                raise Exception("Experiment File Data " + expt_file + " Does Not Match!")
            EXPT_DATA['Concatenated'] = True
            EXPT_DATA['TrialData'] += subject_session['TrialData']
        else:
            EXPT_DATA = subject_session
            EXPT_DATA['Concatenated'] = False

    # Without a single readable experiment file there is nothing to combine
    if not EXPT_DATA:
        raise Exception("No valid experiment file found for Subject " + str(sub))

    # Check if correct number of paths present in experiment file
    if len(EXPT_DATA['TrialData']) != EXPECTED_PATH_COUNT:
        raise Exception("Incorrent number of paths in file for Subject " + str(sub))

    # Save the final combined file in results to be accessed by eyetracking collection script
    os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
    expt_file_name = PROCESSED_DATA_PATH + str(sub) + "_SinglePathfinding_Final.json"
    with open(expt_file_name, 'w') as fp:
        json.dump(EXPT_DATA, fp)
    

### Combine the data
* Loop through all subjects
* extract the session data
* combine the data
* save it

In [37]:
# --------- Preparation ---------

# Set to True to additionally save the combined hitpoints of a subject as
# '<subject>_CompleteSession_SinglePF_Hitpoints.csv'
Session_save_bool = False
# Set to True to save the complete exploration of a subject as csv
Exploration_save_bool = True

# Value added to the path number of data points that lie outside of the
# start/end time of their path; this marks them as excluded
EXCLUDED_PATH_OFFSET = 100

# column name list for dataframe
col_names =  ['PathNumber',
          'timeStampDataPointStart',
          'timeStampDataPointEnd',
          'hitObjectColliderName',
          'ordinalOfHit',
          'BitMask',
          'hitPointOnObject.x',
          'hitPointOnObject.y',
          'hitPointOnObject.z',
          'hitObjectColliderBoundsCenter.x',
          'hitObjectColliderBoundsCenter.y',
          'hitObjectColliderBoundsCenter.z']


# Placeholder row used for data points without any raycast hit
NoHit_dict = {'hitPointOnObject': {'x': 'NaN',
                                   'y': 'NaN',
                                   'z': 'NaN'},
              'hitObjectColliderName': 'NoHit',
              'hitObjectColliderBoundsCenter': {'x': 'NaN',
                                                'y': 'NaN',
                                                'z': 'NaN'},
              'ordinalOfHit': 'NaN',
              'Session': 'NaN',
              'timeStampDataPointStart': 'NaN',
              'timeStampDataPointEnd': 'NaN',
              'BitMask': 'NaN'
              }


def _collect_hitpoints(trial, path_start_time, path_end_time):
    """Flatten the data points of one eye tracking trial into a list of hit dicts.

    Every data point contributes its first and, if present, second combined-eye
    raycast hit. Each kept hit dict is extended in place with the path number,
    the time stamps relative to the start of the trial measurement and the gaze
    validity bitmask. Data points without a hit contribute `NoHit_dict`.

    Parameters
    ----------
    trial: dict
        One entry of the 'trials' list of an eye tracking JSON file.
    path_start_time, path_end_time:
        Start and end time stamp of the path, taken from the experiment file.
        Data points starting outside of this range get `EXCLUDED_PATH_OFFSET`
        added to their path number.

    Returns
    -------
    list of dict
        One dictionary per kept hit (or no-hit placeholder), in recording order.
    """
    start_time = trial['timeTrialMeasurementStarted']
    path_num = trial['trialId']

    hitpoint_list = []
    for each in trial['dataPoints']:
        hits = each['rayCastHitsCombinedEyes']

        # account for noHits; the placeholder is only read, never modified
        if not hits:
            hitpoint_list.append(NoHit_dict)
            continue

        # Mark the data point as excluded if it lies outside of the times for a path
        point_start = each['timeStampDataPointStart']
        point_path_num = path_num
        if point_start < path_start_time or point_start > path_end_time:
            point_path_num += EXCLUDED_PATH_OFFSET

        # keep the first and, if it exists, the second raycast hit
        for hit in hits[:2]:
            hit['PathNumber'] = point_path_num
            hit['timeStampDataPointStart'] = point_start - start_time
            hit['timeStampDataPointEnd'] = each['timeStampDataPointEnd'] - start_time
            hit['BitMask'] = each['combinedGazeValidityBitmask']
            hitpoint_list.append(hit)

    return hitpoint_list


# --------------------------- MAIN PART ---------------------------


# --------- first layer - subject loop ---------

for subcount, subject in enumerate(subIDs, start=1):
    print('Subject ' + str(subject) + ' started - ' + str(subcount) + '/' + str(len(subIDs)))

    # get the data files according to the subject
    subject_folder = sorted([f for f in DATA_FOLDER
                             if f.startswith(str(subject)+'_EyeTrackingData_')],
                            key=str.lower)
    if not subject_folder:
        raise Exception("No eye tracking files found for subject " + str(subject))

    # Get the experiment JSON file for the subject
    sub_expt_file_name = PROCESSED_DATA_PATH + str(subject) + "_SinglePathfinding_Final.json"
    try:
        with open(sub_expt_file_name, 'r') as fp:
            sub_expt_data = json.load(fp)
    except (OSError, ValueError) as err:
        raise Exception("Could not read experiment file for subject " + str(subject)) from err

    # --------- second layer - eye tracking session loop ---------
    session_frames = []
    for s, ET_session in enumerate(subject_folder, start=1):

        # open the JSON file as dictionary; skip files that cannot be parsed
        with open(DATA_PATH + ET_session) as f:
            try:
                subject_session = json.load(f)
            except ValueError:  # includes json.JSONDecodeError
                print("\tJSON file " + ET_session + " is not valid!")
                continue

        trial = subject_session['trials'][0]

        # check if trial is valid:
        if not trial['trialIsValid']:
            raise Exception("Excluding " + ET_session + " because trial is not valid!")

        if str(trial['participantId']) != str(subject):
            raise Exception("Excluding " + ET_session + " because subject IDs don't match!")

        # the trial id is the path number and indexes the experiment trial data
        path_trial = sub_expt_data['TrialData'][trial['trialId']]
        hitpoint_list = _collect_hitpoints(trial,
                                           path_trial['StartMachineTimeStamp'],
                                           path_trial['EndMachineTimeStamp'])

        # normalize the hitpoint dictionary to get dataframe
        session_frames.append(pd.json_normalize(hitpoint_list))

        print("\tET: " + str(s) + " normalized")

    # Combine all sessions at once. The empty template frame puts the columns
    # listed in col_names first; any further columns follow.
    complete_hitpoints_df = pd.concat([pd.DataFrame(columns = col_names)] + session_frames,
                                      sort=False)

    # --------- Saving the combined sessions ---------

    # If you want to save the combined session data separately, set 'Session_save_bool' to True
    if Session_save_bool == True:
        try:
            if len(complete_hitpoints_df) > 0:
                complete_hitpoints_df.to_csv(PROCESSED_DATA_PATH
                                             + str(subject)
                                             + "_CompleteSession"
                                             + "_SinglePF"
                                             + "_Hitpoints.csv")
                print("\t"
                      + str(subject)
                      + " session "
                      + "_SinglePF"
                      + " saved ")
            else:
                print("\t"
                      + str(subject)
                      + " - Session "
                      + "_SinglePF"
                      + " is empty!")
        except OSError:
            print("\tCould not save subject "
                  + str(subject)
                  + " session "
                  + "_SinglePF"
                  + "!")

    # fill the complete exploration dataframe with the separate session data (combining the sessions)
    complete_exploration_df = pd.concat([pd.DataFrame(columns = col_names),
                                         complete_hitpoints_df],
                                        sort=False)

    # --------- Saving the Exploration ---------

    # If you want to save the exploration file, set 'Exploration_save_bool' to True
    if Exploration_save_bool == True:
        # saving the complete exploration
        try:
            complete_exploration_df.to_csv(PROCESSED_DATA_PATH + str(subject) + "_CompleteSingle_Hitpoints.csv")
            print("\t" + str(subject) + " single eyetracking data saved")
        except OSError:
            print("\tCould not save subject " + str(subject) + " single eyetracking data!")


print('Done')

Subject 1023 started - 1/1
0 Data Pt. Start: 1620115810.2498112
0 Data Pt. End: 1620115810.255763
Path Start: 1620115810.2562592
Path End: 1620115833.936262


4362 Data Pt. Start: 1620115833.9377499
4362 Data Pt. End: 1620115833.9407258
Path Start: 1620115810.2562592
Path End: 1620115833.936262


4364 Data Pt. Start: 1620115833.9407258
4364 Data Pt. End: 1620115833.9407258
Path Start: 1620115810.2562592
Path End: 1620115833.936262


4366 Data Pt. Start: 1620115833.958086
4366 Data Pt. End: 1620115833.9610617
Path Start: 1620115810.2562592
Path End: 1620115833.936262


4367 Data Pt. Start: 1620115833.9610617
4367 Data Pt. End: 1620115833.9610617
Path Start: 1620115810.2562592
Path End: 1620115833.936262


4368 Data Pt. Start: 1620115833.9694939
4368 Data Pt. End: 1620115833.9724698
Path Start: 1620115810.2562592
Path End: 1620115833.936262


4369 Data Pt. Start: 1620115833.9724698
4369 Data Pt. End: 1620115833.9724698
Path Start: 1620115810.2562592
Path End: 1620115833.936262


4370 Dat

	ET: 2 normalized
0 Data Pt. Start: 1620116162.3984616
0 Data Pt. End: 1620116162.4019332
Path Start: 1620116162.4019332
Path End: 1620116225.7698226


12570 Data Pt. Start: 1620116225.7708144
12570 Data Pt. End: 1620116225.7737904
Path Start: 1620116162.4019332
Path End: 1620116225.7698226


12572 Data Pt. Start: 1620116225.7737904
12572 Data Pt. End: 1620116225.7737904
Path Start: 1620116162.4019332
Path End: 1620116225.7698226


12574 Data Pt. Start: 1620116225.7812304
12574 Data Pt. End: 1620116225.7847023
Path Start: 1620116162.4019332
Path End: 1620116225.7698226


12576 Data Pt. Start: 1620116225.7921426
12576 Data Pt. End: 1620116225.7956142
Path Start: 1620116162.4019332
Path End: 1620116225.7698226


12578 Data Pt. Start: 1620116225.8035505
12578 Data Pt. End: 1620116225.8065264
Path Start: 1620116162.4019332
Path End: 1620116225.7698226


12580 Data Pt. Start: 1620116225.8144624
12580 Data Pt. End: 1620116225.8174384
Path Start: 1620116162.4019332
Path End: 1620116225.769822

	ET: 4 normalized
0 Data Pt. Start: 1620116343.524044
0 Data Pt. End: 1620116343.52702
Path Start: 1620116343.52702
Path End: 1620116738.0662575


83656 Data Pt. Start: 1620116738.0672493
83656 Data Pt. End: 1620116738.0702252
Path Start: 1620116343.52702
Path End: 1620116738.0662575


83658 Data Pt. Start: 1620116738.0702252
83658 Data Pt. End: 1620116738.0702252
Path Start: 1620116343.52702
Path End: 1620116738.0662575


83660 Data Pt. Start: 1620116738.0786572
83660 Data Pt. End: 1620116738.081633
Path Start: 1620116343.52702
Path End: 1620116738.0662575


83662 Data Pt. Start: 1620116738.081633
83662 Data Pt. End: 1620116738.081633
Path Start: 1620116343.52702
Path End: 1620116738.0662575


83664 Data Pt. Start: 1620116738.0890734
83664 Data Pt. End: 1620116738.0920491
Path Start: 1620116343.52702
Path End: 1620116738.0662575


83666 Data Pt. Start: 1620116738.1004815
83666 Data Pt. End: 1620116738.1039531
Path Start: 1620116343.52702
Path End: 1620116738.0662575


83668 Data Pt. S

	ET: 6 normalized
0 Data Pt. Start: 1620116836.426407
0 Data Pt. End: 1620116836.4293828
Path Start: 1620116836.4293828
Path End: 1620116941.787092


21666 Data Pt. Start: 1620116941.788084
21666 Data Pt. End: 1620116941.79106
Path Start: 1620116836.4293828
Path End: 1620116941.787092


21668 Data Pt. Start: 1620116941.79106
21668 Data Pt. End: 1620116941.79106
Path Start: 1620116836.4293828
Path End: 1620116941.787092


21670 Data Pt. Start: 1620116941.798996
21670 Data Pt. End: 1620116941.801972
Path Start: 1620116836.4293828
Path End: 1620116941.787092


21672 Data Pt. Start: 1620116941.809908
21672 Data Pt. End: 1620116941.8128839
Path Start: 1620116836.4293828
Path End: 1620116941.787092


21674 Data Pt. Start: 1620116941.821316
21674 Data Pt. End: 1620116941.824292
Path Start: 1620116836.4293828
Path End: 1620116941.787092


21676 Data Pt. Start: 1620116941.824292
21676 Data Pt. End: 1620116941.824292
Path Start: 1620116836.4293828
Path End: 1620116941.787092


21678 Data Pt. Sta

	ET: 8 normalized
0 Data Pt. Start: 1620117004.37279
0 Data Pt. End: 1620117004.375766
Path Start: 1620117004.375766
Path End: 1620117104.6822176


20311 Data Pt. Start: 1620117104.6832094
20311 Data Pt. End: 1620117104.6861856
Path Start: 1620117004.375766
Path End: 1620117104.6822176


20313 Data Pt. Start: 1620117104.6861856
20313 Data Pt. End: 1620117104.6861856
Path Start: 1620117004.375766
Path End: 1620117104.6822176


20315 Data Pt. Start: 1620117104.7045376
20315 Data Pt. End: 1620117104.7080092
Path Start: 1620117004.375766
Path End: 1620117104.6822176


20317 Data Pt. Start: 1620117104.7080092
20317 Data Pt. End: 1620117104.7080092
Path Start: 1620117004.375766
Path End: 1620117104.6822176


20319 Data Pt. Start: 1620117104.7268574
20319 Data Pt. End: 1620117104.7298331
Path Start: 1620117004.375766
Path End: 1620117104.6822176


20321 Data Pt. Start: 1620117104.7298331
20321 Data Pt. End: 1620117104.7298331
Path Start: 1620117004.375766
Path End: 1620117104.6822176


20323 

	ET: 10 normalized
	1023 single eyetracking data saved
Done


In [38]:
# Read the saved hitpoint table of subject 1023 back in for inspection
result_csv_path = PROCESSED_DATA_PATH + '1023_CompleteSingle_Hitpoints.csv'
Result_df = pd.read_csv(result_csv_path)

In [39]:
# Show the reloaded table as the cell output
Result_df

Unnamed: 0.1,Unnamed: 0,PathNumber,timeStampDataPointStart,timeStampDataPointEnd,hitObjectColliderName,ordinalOfHit,BitMask,hitPointOnObject.x,hitPointOnObject.y,hitPointOnObject.z,hitObjectColliderBoundsCenter.x,hitObjectColliderBoundsCenter.y,hitObjectColliderBoundsCenter.z,Session
0,0,100.0,0.001984,0.007936,StartZoneA0,1.0,3.0,251.315308,-0.877272,170.721466,251.851456,-2.287690,171.080597,
1,1,100.0,0.001984,0.007936,road_base_network.004,2.0,3.0,252.663849,-2.399850,172.152802,39.257874,-4.408301,51.768860,
2,2,0.0,0.008432,0.008432,StartZoneA0,1.0,3.0,251.315308,-0.877272,170.721466,251.851456,-2.287690,171.080597,
3,3,0.0,0.008432,0.008432,road_base_network.004,2.0,3.0,252.663849,-2.399850,172.152802,39.257874,-4.408301,51.768860,
4,4,0.0,0.011904,0.014880,road_base_network.004,1.0,3.0,252.691284,-2.400710,172.184113,39.257874,-4.408301,51.768860,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248447,18489,109.0,90.206415,90.206415,terrain_C.001,2.0,3.0,-282.912537,-0.626187,-180.557816,-491.456299,18.698456,-597.844604,
248448,18490,109.0,90.225760,90.228735,pavement_C.002,1.0,3.0,-282.881683,-0.488985,-180.404495,-217.756897,-2.905127,-288.595245,
248449,18491,109.0,90.225760,90.228735,terrain_C.001,2.0,3.0,-282.919189,-0.627347,-180.598740,-491.456299,18.698456,-597.844604,
248450,18492,109.0,90.228735,90.228735,pavement_C.002,1.0,3.0,-282.881683,-0.488985,-180.404495,-217.756897,-2.905127,-288.595245,


In [40]:
# Count the rows whose path number is above 20 (True) versus all other rows (False)
print("data points excluded for 1023: ")
Result_df['PathNumber'].gt(20).value_counts()


data points excluded for 1023: 


False    247808
True        644
Name: PathNumber, dtype: int64