This notebook shows how the final data used for data analysis and hypothesis testing is built from all the available data files stored in eye_data and input_data, respectively.

In [8]:
import os
import math
import numpy as np
import pandas as pd
from ast import literal_eval
import itertools
import arviz as az
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as  mpatches
pd.options.mode.chained_assignment = None  # default='warn'

from helper_functions import *

# Reference point of the agent: obstacle distances in this notebook are measured from
# (agent_center_x, agent_center_y), in the same coordinate system as the obstacle positions.
# NOTE(review): presumably screen pixel coordinates - confirm against the experiment setup.
agent_center_x = 972
agent_center_y = 288

We first identify all data files that belong to a successful play-through of a trial (indicated by the string `done` in the file name). We only use successful runs for better comparability.

In [2]:
# Locations and file-name markers used to find the recorded data.
root_dir = os.getcwd()
data_dir = "/experimental_data/"

target_string = "output"
target_string_eye_tracking = "eye_tracking"
done_string = "done"

# Names of all files below the data directory whose name contains the "done" marker.
successfull_runs = [
    file_name
    for _subdir, _dirs, file_names in os.walk(root_dir + data_dir)
    for file_name in file_names
    if done_string in file_name
]

# Every data file name carries a triplet that identifies the condition:
# level, drift (True vs. False), input noise (None vs. Weak vs. Strong)

In [3]:
# Display the collected file names as a sanity check.
successfull_runs

['OK01UE_output_5FN_done_10.csv',
 'OK01UE_output_3FN_done_25.csv',
 'OK01UE_output_6FN_done_16.csv',
 'OK01UE_output_2FS_done_27.csv',
 'OK01UE_output_4FS_done_07.csv',
 'OK01UE_output_4FW_done_36.csv',
 'OK01UE_output_3TS_done_14.csv',
 'OK01UE_output_1FW_done_06.csv',
 'OK01UE_output_6FW_done_02.csv',
 'OK01UE_output_5FS_done_40.csv',
 'OK01UE_output_4TN_done_21.csv',
 'OK01UE_output_1TS_done_32.csv',
 'OK01UE_output_0TS_done_00.csv',
 'OK01UE_output_3FW_done_22.csv',
 'OK01UE_output_3TN_done_00.csv',
 'OK01UE_output_1FN_done_47.csv',
 'OK01UE_output_6TW_done_50.csv',
 'OK01UE_output_5FW_done_30.csv',
 'OK01UE_output_2TS_done_39.csv',
 'OK01UE_output_2FN_done_41.csv',
 'OK01UE_output_5TS_done_15.csv',
 'OK01UE_output_2TN_done_20.csv',
 'OK01UE_output_1FS_done_48.csv',
 'OK01UE_output_2FW_done_18.csv',
 'OK01UE_output_1TN_done_33.csv',
 'OK01UE_output_3TW_done_28.csv',
 'OK01UE_output_2TW_done_31.csv',
 'OK01UE_output_4TW_done_13.csv',
 'OK01UE_output_3FS_done_19.csv',
 'OK01UE_outpu

Split each file name in `successfull_runs` by `_` to get the participant code (`[0]`) and the trial number (`[-1]`, with `.csv` cut off). These are then used to read the eye-tracking data.

In [9]:
# For every successful run keep the participant code (first "_"-separated token) and the
# first two characters of the last token (the trial number with ".csv" cut off).
snippets = [
    [tokens[0], tokens[-1][:2]]
    for tokens in (name.split("_") for name in successfull_runs)
]

Convert snippets to pandas df

In [10]:
# Tabulate the (participant code, trial number) pairs.
snippets_df = pd.DataFrame(data=snippets, columns=['code', 'trial'])
snippets_df

Unnamed: 0,code,trial
0,OK01UE,10
1,OK01UE,25
2,OK01UE,16
3,OK01UE,27
4,OK01UE,07
...,...,...
854,UD06AD,16
855,UD06AD,13
856,UD06AD,35
857,UD06AD,45


## Building eye-movement dataframe

In [11]:
# Build one data frame with the pre-processed eye-tracking samples of all successful trials.
#
# The per-file frames are collected in a list and concatenated once at the end; concatenating
# inside the loop copies all previously collected rows on every iteration.
eye_data_frames = []

for id_code in np.unique(snippets_df.code):
    # Trial numbers of this participant's successful runs. A dedicated name is used so the
    # global `successfull_runs` list of file names is not overwritten (which would break a
    # re-run of the cell that parses it).
    # NOTE(review): trials are matched by trial number only; if a participant has two
    # recordings with the same trial number, both are included - confirm this is intended.
    done_trials = set(snippets_df.loc[snippets_df['code'] == id_code].trial)

    path = root_dir + data_dir + str(id_code) + "/eye_data"
    for subdir, dirs, files in os.walk(path):
        for data_file in files:
            if ".csv" not in str(data_file):
                continue

            temp = str(data_file).split("_")
            # extract features of run from file_name coding
            level = temp[4][0]
            drift = temp[4][1]
            input_noise = temp[4][2]
            exp_trial = temp[-1][:2]

            # only keep recordings that belong to a successful trial
            if exp_trial not in done_trials:
                continue

            # Read the file from the directory it was found in, so the read path always
            # agrees with the directory that is being walked.
            temp_data = pre_process_eye_data(pd.read_csv(os.path.join(subdir, data_file), index_col=False))
            temp_data["ID"] = id_code
            temp_data["level"] = level
            temp_data["drift"] = drift
            temp_data["input_noise"] = input_noise

            eye_data_frames.append(temp_data)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing matched.
eye_data_successfull_runs = pd.concat(eye_data_frames) if eye_data_frames else pd.DataFrame()

72460872 rows of data generated for final data set.

In [22]:
def func(x):
    """Map a level number to its difficulty label.

    Levels 1 and 2 are "easy", 3 and 4 are "medium", 5 and 6 are "hard";
    every other value yields 'N/A'.
    """
    if x in (1, 2):
        return "easy"
    if x in (3, 4):
        return "medium"
    if x in (5, 6):
        return "hard"
    return 'N/A'

#eye_data_successfull_runs["level_difficulty"] = eye_data_successfull_runs.level.apply(func)

# Calling func for the sub data frames for fixations and saccades individually is more efficient (less time consuming).

### Extracting fixations

In [14]:
# Keep only the samples flagged as a fixation onset; the previous index is kept as a column.
fixation_onset_mask = eye_data_successfull_runs["fixationOnset"] == 1
fix_data = eye_data_successfull_runs.loc[fixation_onset_mask].reset_index()

In [20]:
# For every fixation, look up the game state at fixation onset in the matching input file:
# number of visible obstacles, number of visible drift tiles and the distance from the
# agent centre to the closest visible obstacle (NaN if there is none or no file was found).
N_visible_obs_col = []
N_visible_drift_col = []
Dist_to_closest_obstacle_col = []

# Pre-processed input files keyed by the file-name prefix they were searched with
# (None when no file matched). Each file is located, read and pre-processed once instead
# of once per fixation.
input_data_by_prefix = {}

for index, row in fix_data.iterrows():

    input_file_name = f'{row.ID}_output_{row.level}{str(row.drift)[0]}{row.input_noise}_done_'

    if input_file_name not in input_data_by_prefix:
        path = f'experimental_data/{row.ID}/data/'
        matching_files = sorted(
            os.path.join(subdir, data_file)
            for subdir, dirs, files in os.walk(path)
            for data_file in files
            if input_file_name in str(data_file)
        )
        if matching_files:
            # Exactly one file is used per prefix, so exactly one value is appended per
            # fixation and the columns keep the same length as fix_data.
            input_data_by_prefix[input_file_name] = pre_process_input_data(
                pd.read_csv(matching_files[0], index_col=False)
            )
        else:
            input_data_by_prefix[input_file_name] = None

    input_data = input_data_by_prefix[input_file_name]

    if input_data is None:
        N_visible_obs_col.append(np.nan)
        N_visible_drift_col.append(np.nan)
        Dist_to_closest_obstacle_col.append(np.nan)
        continue

    # Frame whose time stamp is closest to the fixation; argmin returns a position
    # (suitable for iloc) and skips NaN time stamps.
    time_offsets = (input_data['time_played'] - row.time_tag).abs()
    closest_frame = input_data.iloc[time_offsets.argmin()]

    # Euclidean distance from the agent centre to every visible obstacle.
    distances = [
        np.hypot(obstacle[0] - agent_center_x, obstacle[1] - agent_center_y)
        for obstacle in closest_frame.visible_obstacles
    ]
    Dist_to_closest_obstacle = np.min(distances) if len(distances) > 0 else np.nan

    N_visible_obs_col.append(len(closest_frame.visible_obstacles))
    N_visible_drift_col.append(len(closest_frame.visible_drift_tiles))
    Dist_to_closest_obstacle_col.append(Dist_to_closest_obstacle)

In [26]:
# Cast the level to an integer and derive the difficulty label from it.
fix_data = (
    fix_data
    .astype({'level':'int'})
    .assign(level_difficulty=lambda frame: frame["level"].apply(func))
)

In [38]:
# Add the game-state columns computed per fixation. All three are inserted at position 37,
# so they end up in reverse order of insertion starting at that position.
fix_data.insert(37, 'N_visible_obstacles', N_visible_obs_col)
fix_data.insert(37, 'N_visible_drift_tiles', N_visible_drift_col)
fix_data.insert(37, 'Dist_to_closest_obstacles_in_pixel', Dist_to_closest_obstacle_col)

# Convert the pixel distance with pixel_to_degree. Applying it to the single column avoids
# building a full row object for every fixation.
fix_data['Dist_to_closest_obstacles'] = fix_data['Dist_to_closest_obstacles_in_pixel'].apply(pixel_to_degree)

# drop the columns that only describe saccades
fix_data = fix_data.drop(columns=["saccade_amplitude", "saccade_amplitude_in_pixel", "saccade_direction_y", "saccade_direction_x", "saccadeOnset", "N_saccade", "Saccade"])

# to_csv does not create missing directories
os.makedirs('eye_data', exist_ok=True)
fix_data.to_csv('eye_data/experimental_eye_data_fixations.csv', sep=',')

### Extracting saccades

In [43]:
# Keep only the samples flagged as a saccade onset; the previous index is kept as a column.
saccade_onset_mask = eye_data_successfull_runs["saccadeOnset"] == 1
sacc_data = eye_data_successfull_runs.loc[saccade_onset_mask].reset_index()

In [46]:
# For every saccade, look up the game state at saccade onset in the matching input file:
# number of visible obstacles, number of visible drift tiles and the distance from the
# agent centre to the closest visible obstacle (NaN if there is none or no file was found).
N_visible_obs_col = []
N_visible_drift_col = []
Dist_to_closest_obstacle_col = []

# Pre-processed input files keyed by the file-name prefix they were searched with
# (None when no file matched). Each file is located, read and pre-processed once instead
# of once per saccade.
input_data_by_prefix = {}

for index, row in sacc_data.iterrows():

    input_file_name = f'{row.ID}_output_{row.level}{str(row.drift)[0]}{row.input_noise}_done_'

    if input_file_name not in input_data_by_prefix:
        path = f'experimental_data/{row.ID}/data/'
        matching_files = sorted(
            os.path.join(subdir, data_file)
            for subdir, dirs, files in os.walk(path)
            for data_file in files
            if input_file_name in str(data_file)
        )
        if matching_files:
            # Exactly one file is used per prefix, so exactly one value is appended per
            # saccade and the columns keep the same length as sacc_data.
            input_data_by_prefix[input_file_name] = pre_process_input_data(
                pd.read_csv(matching_files[0], index_col=False)
            )
        else:
            input_data_by_prefix[input_file_name] = None

    input_data = input_data_by_prefix[input_file_name]

    if input_data is None:
        N_visible_obs_col.append(np.nan)
        N_visible_drift_col.append(np.nan)
        Dist_to_closest_obstacle_col.append(np.nan)
        continue

    # Frame whose time stamp is closest to the saccade; argmin returns a position
    # (suitable for iloc) and skips NaN time stamps.
    time_offsets = (input_data['time_played'] - row.time_tag).abs()
    closest_frame = input_data.iloc[time_offsets.argmin()]

    # Euclidean distance from the agent centre to every visible obstacle.
    distances = [
        np.hypot(obstacle[0] - agent_center_x, obstacle[1] - agent_center_y)
        for obstacle in closest_frame.visible_obstacles
    ]
    Dist_to_closest_obstacle = np.min(distances) if len(distances) > 0 else np.nan

    N_visible_obs_col.append(len(closest_frame.visible_obstacles))
    N_visible_drift_col.append(len(closest_frame.visible_drift_tiles))
    Dist_to_closest_obstacle_col.append(Dist_to_closest_obstacle)

In [47]:
# Cast the level to an integer and derive the difficulty label from it.
sacc_data = (
    sacc_data
    .astype({'level':'int'})
    .assign(level_difficulty=lambda frame: frame["level"].apply(func))
)

In [48]:
# Add the game-state columns computed per saccade. All three are inserted at position 37,
# so they end up in reverse order of insertion starting at that position.
sacc_data.insert(37, 'N_visible_obstacles', N_visible_obs_col)
sacc_data.insert(37, 'N_visible_drift_tiles', N_visible_drift_col)
sacc_data.insert(37, 'Dist_to_closest_obstacles_in_pixel', Dist_to_closest_obstacle_col)

# Convert the pixel distance with pixel_to_degree. Applying it to the single column avoids
# building a full row object for every saccade.
sacc_data['Dist_to_closest_obstacles'] = sacc_data['Dist_to_closest_obstacles_in_pixel'].apply(pixel_to_degree)

# drop the columns that only describe fixations
sacc_data = sacc_data.drop(columns=["fixation_duration", "exploring_fixation", "fixationOnset", "N_fixation", "Fixation"])

# to_csv does not create missing directories
os.makedirs('eye_data', exist_ok=True)
sacc_data.to_csv('eye_data/experimental_eye_data_saccades.csv', sep=',')

## Building SoC data

Here we **won't** filter for only successful runs.

In [7]:
# Locations and file-name marker of the recorded data.
root_dir = os.getcwd()

data_dir = "/experimental_data/"
target_string = "output"

# Names of all files below the data directory. Unlike for the eye-movement data, the names
# are not filtered by the "done" / "crashed" marker here.
runs = [
    file_name
    for _subdir, _dirs, file_names in os.walk(root_dir + data_dir)
    for file_name in file_names
]

NameError: name 'crashed_string' is not defined

In [None]:
# For every file keep the participant code (first "_"-separated token) and the first two
# characters of the last token (the trial number with ".csv" cut off).
snippets_runs = [
    [tokens[0], tokens[-1][:2]]
    for tokens in (name.split("_") for name in runs)
]

In [None]:
# Tabulate the (participant code, trial number) pairs.
snippets_runs_df = pd.DataFrame(data=snippets_runs, columns=['code', 'trial'])
snippets_runs_df

In [None]:
#%%capture --no-stderr

# Build one row per experimental (non-training) run with the condition, eye-movement counts,
# outcome and the final sense-of-control (SoC) value of the run.
#
# Each run is collected as a plain record and the data frame is created once at the end.
# Filling a NaN-initialised frame through chained `frame[col].iloc[0] = value` assignments
# writes to a temporary object under pandas Copy-on-Write and puts strings into float
# columns; concatenating inside the loop additionally copies all earlier rows every time.
soc_columns = ['ID', 'level', 'drift', 'input_noise', 'N_fixations',
               'N_distant_fixations', 'N_saccades', 'done', 'run', 'SoC']
soc_records = []

for id_code in np.unique(snippets_runs_df.code):
    participant_dir = root_dir + data_dir + str(id_code)

    for subdir, dirs, files in os.walk(participant_dir + "/data"):
        for data_file in files:
            if ".csv" not in str(data_file):
                continue

            temp = str(data_file).split("_")
            # extract features of run from file_name coding
            level = temp[2][0]
            drift = temp[2][1]
            input_noise = temp[2][2]
            exp_trial = temp[-1][:2]
            done = temp[3]

            if int(level) <= 0:  # excluding training trials
                continue

            opened_file = pre_process_input_data(
                pd.read_csv(os.path.join(subdir, data_file), index_col=False)
            )
            eye_file = os.path.join(
                participant_dir, "eye_data",
                f"{id_code}_eye_tracking_output_{level}{drift}{input_noise}_{exp_trial}.csv",
            )
            temp_eye_data_file = pre_process_eye_data(pd.read_csv(eye_file))

            # Outcome from the file name; stays NaN for any other marker.
            if done == "crashed":
                run_completed = False
            elif done == "done":
                run_completed = True
            else:
                run_completed = np.nan

            soc_records.append({
                'ID': id_code,
                'level': level,
                'drift': drift,
                'input_noise': input_noise,
                'N_fixations': np.max(temp_eye_data_file.N_fixation),
                'N_distant_fixations': np.sum(temp_eye_data_file.exploring_fixation),
                'N_saccades': np.max(temp_eye_data_file.N_saccade),
                'done': run_completed,
                'run': exp_trial,
                # last SoC value recorded in the run
                'SoC': opened_file["SoC"].iloc[-1],
            })

# Passing the column list keeps the expected columns even when no run was found.
soc_data = pd.DataFrame(soc_records, columns=soc_columns)

soc_data


### Adding additional columns

In [None]:
# Per participant: order the runs chronologically, flag crashed runs and count how many
# crashes happened before each run.
unique_IDs = soc_data.ID.unique()

per_participant = []
for ident in unique_IDs:
    per_participant.append(
        soc_data[soc_data.ID == ident]
        # sorting by run orders the participant's data chronologically
        .sort_values(['run'], ascending=True)
        .assign(
            # 1 where the run ended in a crash, 0 otherwise
            crashed=lambda frame: np.where(frame.done == False, 1, 0),
            # the cumulative sum already counts the current run, so its own crash flag is
            # subtracted to obtain the number of PRIOR crashes
            N_prior_crashs=lambda frame: frame.crashed.cumsum() - frame.crashed,
        )
    )

soc_df = pd.concat([pd.DataFrame(), *per_participant])

soc_df

In [None]:

# Crash-history features per run.
#
# The outcome of the previous run is taken per participant. Shifting over the whole frame
# would let a participant's first run inherit the last run of the participant stored before
# them, both for the "previous run" flags and for the streak counter below.
prev_crashed = soc_df.groupby('ID')['crashed'].shift()

# A new streak starts whenever the crash flag differs from the participant's previous run;
# a participant's first run has no previous run (NaN) and therefore always starts a streak.
streak_id = soc_df['crashed'].ne(prev_crashed).cumsum()

# Position within the current streak (1-based), reduced by the run's own crash flag.
soc_df_ = soc_df.assign(trials_since_last_crash=soc_df.groupby(streak_id).cumcount().add(1))
soc_df_.trials_since_last_crash = soc_df_.trials_since_last_crash - soc_df_.crashed

# 1 if the participant's previous run crashed
cond = (prev_crashed > 0.0)
soc_df_["crashed_in_last_trial"] = np.where(cond, 1, 0)

# 1 if the previous run crashed and the current run did not
cond = (prev_crashed > 0.0) & (soc_df_.crashed == 0)
soc_df_["consecutive_crash_success"] = np.where(cond, 1, 0)

# running count of crash -> success transitions per participant
soc_df_["N_consecutive_crash_success"] = soc_df_.groupby('ID')["consecutive_crash_success"].cumsum()

soc_df_


In [None]:
# Write the SoC data set; to_csv does not create missing directories.
os.makedirs('soc_data', exist_ok=True)
soc_df_.to_csv('soc_data/soc_data.csv', sep=',', index=False)

## Building input data