# Navigation Control B - Process Data

Note: this includes the logic for updating the database for excluded participants

In [1]:
import csv
import datetime
import json
import os
import sys
import urllib.request
from functools import lru_cache

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from msdm.algorithms import PolicyIteration
from msdm.domains import GridWorld
from vgc_project import gridutils, sampsat

In [2]:
# load experiment parameters
# Files are read inside `with` blocks so the handles are closed right away.
with open("config.json", 'r') as f:
    expConfig = json.load(f)
EXPERIMENT_CODE_VERSION = expConfig['params']['EXPERIMENT_CODE_VERSION']
cond1 = expConfig['timelines'][0]
with open("../mazes/mazes_12-15.json", "r") as f:
    basegrids = json.load(f)

with open("../credentials.json", 'r') as f:
    CREDENTIALS = json.load(f)
EXPURL = CREDENTIALS["EXPURL"]
USERNAME = CREDENTIALS["USERNAME"]
PASSWORD = CREDENTIALS["PASSWORD"]
# Show which credentials were loaded, but never echo the password: notebook
# output is saved alongside the code and is easily shared or committed.
print("Credentials:")
print({k: ("<redacted>" if k == "PASSWORD" else v) for k, v in CREDENTIALS.items()})

# Directory that receives the raw downloads; created if it does not exist yet.
DESTDIR = "./data/"
os.makedirs(DESTDIR, exist_ok=True)
# (path on the experiment server relative to EXPURL, local destination file)
sourcedest = [
    (f"data/{EXPERIMENT_CODE_VERSION}/trialdata", DESTDIR+"rawtrialdata.csv"),
    (f"data/{EXPERIMENT_CODE_VERSION}/questiondata", DESTDIR+"rawquestiondata.csv"),
    (f"data/{EXPERIMENT_CODE_VERSION}/bonusdata", DESTDIR+"rawbonusdata.csv"),
    (f"data/{EXPERIMENT_CODE_VERSION}/conditiondata", DESTDIR+"rawconditiondata.csv"),
]

# HTTP basic-auth handler that supplies USERNAME/PASSWORD for URLs under EXPURL.
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, EXPURL, USERNAME, PASSWORD)
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

In [3]:
# Install the authenticating opener once; urlretrieve() then goes through it
# for every request. The previous per-file `opener.open(...)` call fetched each
# file a second time and never closed the response, so it is dropped.
opener = urllib.request.build_opener(handler)
urllib.request.install_opener(opener)
for SOURCE, DEST in sourcedest:
    urllib.request.urlretrieve(EXPURL+SOURCE, DEST)

In [4]:
# Raise the csv module's per-field size limit to the largest value it accepts.
csv.field_size_limit(sys.maxsize)


def _read_csv_rows(path):
    """Return every row of the CSV file at *path* as a list of lists of strings.

    The file is opened with ``newline=''`` as the csv module requires and is
    closed as soon as it has been read.
    """
    with open(path, 'r', newline='') as f:
        return list(csv.reader(f))


trialdata = _read_csv_rows(sourcedest[0][1])
questiondata = _read_csv_rows(sourcedest[1][1])
bonusdata = _read_csv_rows(sourcedest[2][1])
conditiondata = _read_csv_rows(sourcedest[3][1])

In [5]:
# Parse the raw trial rows into one record per trial.
# Each row of `trialdata` is (psiturk_id, row index, timestamp in ms, JSON entry).
trials = []
# Rows whose stored index disagrees with the entry's own 'trial_index', kept as
# (AssertionError, psiturk_id, idx, timestamp, entry) tuples for inspection.
errors = []
for psiturk_id, idx, datetime_ms, entry in trialdata:
    # Skip debug sessions entirely.
    if "debug" in psiturk_id:
        continue
    timestamp = datetime.datetime.fromtimestamp(float(datetime_ms)/1000.)
    idx = int(idx)
    entry = json.loads(entry)
    # Explicit comparison instead of `assert`, so the check still runs when
    # Python is started with -O (which strips assert statements).
    if idx != entry['trial_index']:
        mismatch = AssertionError(
            f"row index {idx} != trial_index {entry['trial_index']}"
        )
        errors.append((mismatch, psiturk_id, idx, timestamp, entry))
    trials.append({
        "psiturk_id": psiturk_id,
        "idx": idx,
        "datetime_ms": timestamp,
        "data": entry.get("data", None),
        "trial_type": entry['trial_type'],
        "time_elapsed": entry['time_elapsed'],
        # second dash-separated component of the node id, as a float
        "internal_node_id": float(entry["internal_node_id"].split("-")[1])
    })
# Debug rows were already skipped in the loop, so no further filtering is needed.
trials = pd.DataFrame(trials)

In [6]:
# For each participant take the largest internal_node_id recorded, then count
# how many participants share each maximum.
max_node_per_participant = trials.groupby('psiturk_id')['internal_node_id'].max()
print(max_node_per_participant.value_counts())

96.0    101
56.0     99
1.0       3
5.0       3
2.0       1
Name: internal_node_id, dtype: int64


In [7]:
def process_navigationData(row):
    """Return a copy of ``row['navigationData']`` mapped back to the base grid.

    Each step dict is copied; its original 'state'/'nextstate' are preserved
    under 'trans_state'/'trans_nextstate', 'state'/'nextstate' are replaced by
    the un-transformed coordinates rounded to integer pairs, and 'rt' is set to
    ``response_datetime - start_datetime``.

    The grid is looked up in ``basegrids`` by ``row['grid']``; if that key is
    missing, the name ``grid-<second dash-separated part>-0`` is tried instead.
    """
    steps = row['navigationData']
    try:
        tiles = basegrids[row['grid']]
    except KeyError:
        # older coding of grid
        legacy_index = row['grid'].split('-')[1]
        tiles = basegrids[f"grid-{legacy_index}-0"]
    transform = row['transform']

    def _snap(xy):
        # round both coordinates to the nearest integer
        return (int(round(xy[0])), int(round(xy[1])))

    def _convert(step):
        untransform = gridutils.untransformState[transform]
        converted = dict(step)
        converted['trans_state'] = step['state']
        converted['trans_nextstate'] = step['nextstate']
        converted['state'] = _snap(untransform(tiles, step['state']))
        converted['nextstate'] = _snap(untransform(tiles, step['nextstate']))
        converted['rt'] = step['response_datetime'] - step['start_datetime']
        return converted

    return [_convert(step) for step in steps]

In [8]:
#breadcrumb trials
def _flatten_breadcrumb_record(d):
    """Merge a trial's 'trialparams' with its other top-level fields.

    'trialparams' and 'taskparams' themselves are dropped; top-level fields
    win over same-named entries of 'trialparams'.
    """
    top_level = {k: v for k, v in d.items() if k not in ['trialparams', 'taskparams']}
    return {**d['trialparams'], **top_level}


# keep only non-practice GridNavBreadcrumbs trials
bc_trials = trials[trials.trial_type == "GridNavBreadcrumbs"]
is_not_practice = bc_trials.data.apply(lambda d: d['trialparams']['roundtype'] != "practice")
bc_trials = bc_trials[is_not_practice]

# one row per trial: participant id next to the flattened trial fields
bc_ids = pd.DataFrame({"psiturk_id": bc_trials.psiturk_id}).reset_index(drop=True)
bc_fields = pd.DataFrame(list(bc_trials.data.apply(_flatten_breadcrumb_record).values))
bc_trials = pd.concat([bc_ids, bc_fields], axis=1)

bc_columns = ['original_gridname', 'original_sessionId', 'psiturk_id', 'sessionId', 'grid',
              'transform', 'round', "navigationData", "allCollected"]
bc_trials = bc_trials[bc_columns].reset_index(drop=True)
bc_trials['navigationData'] = bc_trials.apply(process_navigationData, axis=1)

# response-time summaries per trajectory
trajectories = bc_trials['navigationData']
bc_trials['initial_rt'] = trajectories.apply(lambda traj: traj[0]['rt'])
bc_trials['max_noninitial_rt'] = trajectories.apply(
    lambda traj: max(t['rt'] for t in traj[1:])
)
bc_trials['totaltime'] = trajectories.apply(
    lambda traj: max(t['response_datetime'] for t in traj) - min(t['start_datetime'] for t in traj)
)

In [9]:
# Breadcrumb trials whose original_sessionId is a string, reduced to the
# identifying columns.
has_original_session = bc_trials['original_sessionId'].apply(lambda sid: isinstance(sid, str))
original_sids = bc_trials[has_original_session]
original_sids = original_sids[['original_sessionId', 'sessionId', 'grid', 'round']]
original_sids = original_sids.reset_index(drop=True)

In [10]:
# get navdist metrics
from functools import lru_cache
from msdm.domains import GridWorld
from vgc_project import gridutils, utils

@lru_cache(maxsize=128)
def make_gridworld(tile_array):
    """Build a GridWorld from *tile_array*, memoized on the argument.

    *tile_array* must be hashable because it is used as the cache key.
    """
    gridworld = GridWorld(tile_array)
    return gridworld
def calc_obs_traj_dist(grid, obs, traj):
    """Return the 'mindist' result of ``gridutils.min_dist`` for one obstacle.

    The locations of feature *obs* are taken from the GridWorld built from
    ``basegrids[grid]`` and compared against the trajectory *traj*.
    """
    gridworld = make_gridworld(tuple(basegrids[grid]))
    locations = gridworld.feature_locations[obs]
    result = gridutils.min_dist(locations, traj)
    return result["mindist"]
def calc_nav_dist(row):
    """Return a Series mapping obstacle label ('0'-'9') to its trajectory distance.

    The trajectory is the sequence of 'state' values in ``row.navigationData``.
    Obstacles for which the distance lookup raises KeyError are left out.
    """
    grid_name = row.grid
    visited = [step['state'] for step in row.navigationData]
    dists = {}
    for obs in "0123456789":
        try:
            dist = calc_obs_traj_dist(grid_name, obs, visited)
        except KeyError:
            continue
        dists[obs] = dist
    return pd.Series(dists)
    
# Long-format table: one row per (sessionId, grid, round, obstacle) holding
# that obstacle's distance value for the trial.
nav_id_cols = ['sessionId', 'grid', 'round']
per_trial_dists = bc_trials.apply(calc_nav_dist, axis=1)
nav_dist = pd.concat([bc_trials[nav_id_cols], per_trial_dists], axis=1)
nav_dist = nav_dist.melt(id_vars=nav_id_cols, var_name="obstacle", value_name="nav_mindist")

In [11]:
#awareness trials
attn_trials = trials[trials.trial_type == "GridBlockAttentionQuery"]
attn_ids = pd.DataFrame({"psiturk_id": attn_trials.psiturk_id}).reset_index(drop=True)
attn_fields = pd.DataFrame(list(attn_trials.data.values))
attn_trials = pd.concat([attn_ids, attn_fields], axis=1)

# derived columns; each entry may use the ones assigned before it
attn_trials = attn_trials.assign(
    attn_resp=lambda df: df['response'].apply(int),
    attn_resp_N=lambda df: (df['attn_resp'] - 1)/(8 - 1),
    rt=lambda df: df['responsetime'] - df['starttime'],
    obstacle=lambda df: df['probeobs'],
    transform=lambda df: df['trans'],
)

attn_columns = ['psiturk_id', 'sessionId', 'grid', 'transform', 'round', 'starttime', 'responsetime',
                'queryround', 'obstacle', 'rt', 'attn_resp', 'attn_resp_N']
# attach the per-obstacle navigation distance and the original session ids
attn_trials = (
    attn_trials[attn_columns]
    .reset_index(drop=True)
    .merge(nav_dist, on=['sessionId', 'grid', 'round', 'obstacle'], how='left')
    .merge(original_sids, on=['sessionId', 'round', 'grid'], how='left')
)

In [12]:
#memory trials
# Hex colour codes that appear in the 'true_color' field; they are translated
# to the names 'Green' / 'Yellow' below.
GREEN = '#44A9A0'
YELLOW = "#DCCB5D"
mem_trials = trials[(trials.trial_type.isin(["CustomItem"]))]
# One row per CustomItem trial: participant id next to the trial's data fields.
mem_trials = pd.concat([
    pd.DataFrame({
        "psiturk_id": mem_trials.psiturk_id,
    }).reset_index(drop=True),
    pd.DataFrame(list(mem_trials.data.values))
], axis=1)
mem_trials['rt'] = mem_trials['responsetime'] - mem_trials['starttime']
# Raises KeyError if a true_color value is neither YELLOW nor GREEN.
mem_trials['true_color'] = mem_trials['true_color'].apply(lambda c: {YELLOW: 'Yellow', GREEN: "Green"}[c])
mem_trials = mem_trials[['psiturk_id', 'sessionId', 'round', 'roundtype', 'grid', 'transform', 
                           'probeobs', 'queryround', 'true_color', 'response', 'rt']].reset_index(drop=True)
# Pivot to one row per probe; the remaining value columns become two-level
# (value, roundtype) columns, which is why they are addressed by tuples below.
mem_trials = mem_trials.pivot(index=['psiturk_id', 'sessionId', 'round', 'grid', 'transform', 'probeobs', 'queryround'], columns=['roundtype']).reset_index()
mem_trials["conf_rt"] = mem_trials[('rt', 'probe_conf')]
mem_trials["mem_rt"] = mem_trials[('rt', 'probe_2afc')]
mem_trials["conf"] = mem_trials[('response', 'probe_conf')].apply(int)
# Rescale as (x - 1) / 7 -- presumably a 1-8 confidence scale mapped onto 0-1; TODO confirm.
mem_trials["conf_N"] = (mem_trials["conf"] - 1)/(8 - 1)
# NOTE(review): assumes probe_2afc responses are recorded as 'Yellow'/'Green'
# so they are comparable to the translated true_color -- confirm.
mem_trials["correct"] = mem_trials[('true_color', 'probe_2afc')] == mem_trials[('response', 'probe_2afc')]
mem_trials["mem_resp"] = mem_trials[('response', 'probe_2afc')]
mem_trials["obstacle"] = mem_trials['probeobs']
mem_trials = mem_trials[['psiturk_id', 'sessionId', 'round', 'grid', 'transform', 'obstacle',
                         'queryround', 'mem_resp', 'correct', 'mem_rt', 'conf_rt', 'conf', 'conf_N']]
# Drop the second (roundtype) column level so the frame has flat column names again.
mem_trials.columns = mem_trials.columns.droplevel(level=1)
# Attach the per-obstacle navigation distance and the original session ids.
mem_trials = mem_trials.merge(nav_dist, on=['sessionId', 'grid', 'round', 'obstacle'], how='left')
mem_trials = mem_trials.merge(original_sids, on=['sessionId', 'round', 'grid'], how='left')

In [13]:
# The memory and attention probe rows together must number exactly five per
# row of original_sids.
n_expected_probes = 5*len(original_sids)
n_observed_probes = len(mem_trials) + len(attn_trials)
assert n_expected_probes == n_observed_probes

In [14]:
#survey level data
survey_rows = trials[trials.trial_type.isin(["CustomSurvey", "SaveGlobalStore"])]
survey_ids = pd.DataFrame({"psiturk_id": survey_rows.psiturk_id}).reset_index(drop=True)
# trials without data contribute an empty record
survey_answers = pd.DataFrame([d or {} for d in survey_rows.data.values])
survey_trials = pd.concat([survey_ids, survey_answers], axis=1)

# long format (participant, question, response), keeping only answered questions
survey_trials = survey_trials.melt(id_vars='psiturk_id', var_name="question", value_name="response")
survey_trials = survey_trials[survey_trials.response.notna()]

# back to wide format: one row per participant, one column per question
survey_trials = survey_trials.pivot(index='psiturk_id', columns="question")
survey_trials.columns = survey_trials.columns.droplevel()
survey_trials = survey_trials.reset_index()
survey_trials.columns.name = None

# participants whose 'playedGame' answer is "Yes" or "No"
answered_played_game = survey_trials['playedGame'].isin(["Yes", "No"])
survey_trials = survey_trials[answered_played_game].reset_index(drop=True)
completed_task = survey_trials['psiturk_id']

In [15]:
# Per-trial flags for each exclusion criterion. Plain column comparisons
# replace the row-wise apply(axis=1): same values, no Python call per row, and
# still a Series when bc_trials is empty.
bc_trials["initial_rt_exclude"] = bc_trials['initial_rt'] > 5000
bc_trials["max_noninitial_rt_exclude"] = bc_trials['max_noninitial_rt'] > 2000
bc_trials["not_allCollected"] = ~bc_trials['allCollected']

In [16]:
# ex_trials = bc_trials.groupby("sessionId")[["initial_rt_exclude", "max_noninitial_rt_exclude", "not_allCollected"]].sum()
# ex_trials[ex_trials.apply(sum, axis=1) > .2*8]

In [17]:
# calculate exclusions
# Participant-level flag: the 'playedGame' answer contains "yes" (case-insensitive).
survey_trials['exclude_participant'] = survey_trials.apply(
    lambda r: "yes" in r['playedGame'].lower(),
    axis=1
)
# Trial-level flag: initial rt above 5000, any later rt above 2000, or not
# everything collected. NOTE(review): rt units are presumably milliseconds -- confirm.
bc_trials["exclude_trial"] = bc_trials.apply(
    lambda r: (r['initial_rt'] > 5000) or (r['max_noninitial_rt'] > 2000) or (not r['allCollected']),
    axis=1
)
# Copy the trial-level flag onto the probe rows of the same session and round
# (default inner merge, so probe rows without a matching breadcrumb trial are dropped).
attn_trials = attn_trials.merge(bc_trials[["sessionId", "round", "exclude_trial"]], on=['sessionId', 'round'])
mem_trials = mem_trials.merge(bc_trials[["sessionId", "round", "exclude_trial"]], on=['sessionId', 'round'])
# Per participant/session: flag when the share of missing (8 - len) plus
# excluded trials, out of 8, is above .2. NOTE(review): 8 is presumably the
# number of rounds per session -- confirm. The outer merge keeps participants
# that appear in only one of the two tables, which leaves missing values in
# the flag columns for those rows.
exclude_df = bc_trials.groupby(['psiturk_id', 'sessionId'])['exclude_trial'].apply(lambda et: (((8 - len(et)) + sum(et))/8 > .2)).reset_index(name="exclude_trials")\
    .merge(survey_trials[['sessionId', 'psiturk_id', 'exclude_participant', 'condition']], on=['psiturk_id','sessionId'], how='outer')
exclude_df['any_exclude'] = exclude_df['exclude_trials'] | exclude_df['exclude_participant']
sid_to_exclude = set(exclude_df[exclude_df['any_exclude']]['sessionId'].values)
pid_to_exclude = set(exclude_df[exclude_df['any_exclude']]['psiturk_id'].values)
survey_trials['exclude'] = survey_trials.sessionId.isin(sid_to_exclude)
# The temporary participant-level flag is folded into 'exclude' and removed.
del survey_trials['exclude_participant']
# Excluded session ids and excluded participant ids must be equal in number.
assert len(sid_to_exclude) == len(pid_to_exclude)
# Each conditiondata row is unpacked as (participant id, condition).
all_pid = set([pid for pid, cond in conditiondata])
# Participants listed in conditiondata that have neither survey nor breadcrumb rows.
pid_no_data = all_pid - (set(survey_trials.psiturk_id) | set(bc_trials.psiturk_id))
print(f"Total Participants with data: {len(survey_trials)}")
print(f"Participants with too many invalid trials: {exclude_df['exclude_trials'].sum()}")
print(f"Participants who previously played task: {exclude_df['exclude_participant'].sum()}")
print(f"Participants to Exclude: {exclude_df['any_exclude'].sum()}")
print(f"Participants with no data: {len(pid_no_data)}")

Total Participants with data: 200
Participants with too many invalid trials: 36
Participants who previously played task: 4
Participants to Exclude: 39
Participants with no data: 15


In [18]:
# Label every session with the dependent variable it contributed ('attn' or
# 'mem') and attach that label to the survey rows.
attn_sessions = pd.DataFrame(attn_trials.sessionId.unique(), columns=['sessionId']).assign(dv='attn')
mem_sessions = pd.DataFrame(mem_trials.sessionId.unique(), columns=['sessionId']).assign(dv='mem')
session_dv = pd.concat([attn_sessions, mem_sessions])
survey_trials = survey_trials.merge(session_dv, on='sessionId', how='outer')

In [19]:
# Per dv group: number of non-missing 'exclude' flags and how many are set.
exclusions_by_dv = survey_trials.groupby('dv')['exclude']
exclusions_by_dv.agg(['count', 'sum'])

Unnamed: 0_level_0,count,sum
dv,Unnamed: 1_level_1,Unnamed: 2_level_1
attn,99,18
mem,101,21


In [20]:
# updates psiturk database to recruit conditions appropriately
# it is idempotent
# Every excluded participant and every participant without data is set to
# this status on the server.
new_status = "ignore"
for i, pid in enumerate(pid_to_exclude | pid_no_data):
    print(i, pid)
    request = f"http://frozen-depths-23358.herokuapp.com/set_status?uniqueId={pid}&new_status={new_status}"
    # Close the HTTP response as soon as its body has been read.
    with urllib.request.urlopen(request) as response:
        res = json.loads(response.read())
    # Report any reply whose status does not mention success.
    if "success" not in res['status'].lower():
        print(res['status'], request)

0 60abec02335b7be9cb42c01c:60cbba634504ce7806c71b71
1 5ca650fc557aec0012e200d8:60cb9d327e5470a525650644
2 605c9274dc5f3a6bffa23b54:60cb9a5a75f66841dc11c03a
3 5fe8be5dad8fba7b047f7f7d:60cb9d3da33441958dfcb402
4 5e2dd0b7f420f03c325e5c9d:60cb9fd64cfedba43b22e9ba
5 60be818f4a5c49320cf90daf:60cbba768768e52b51a657cc
6 5c50efd2cc71f4000125ce0d:60cb9aa56604e42ed8a0ca3a
7 597519f8262c480001bbaf8b:60cbd2406f29affdee487794
8 5c28ef690091e40001ca5e99:60cb9a58959f6aa88e37f9b3
9 5d87296392bb88000182f43a:60cb9fd3c1f6cf29d610e987
10 593af1304f76be000186d4ff:60cb9fdbe6ef2dbf67e4bdf5
11 60c29201092d523e3b08a603:60cbdebc0594758bbde5f0fb
12 607d30af8440153a603528b3:60cba1c1e1ea76ffa6a513a9
13 603ad47e19f2636db910ab23:60cb9fd9d5916c627576649d
14 607d1a66b24445c7aeecfcd1:60cb9d352b43343e65bd6348
15 5f85f0cab3f4e20ebf578203:60cb9a5c9aac5e755ec3d967
16 5f0ab4523d61a638b1456878:60cbe21586e9964ed422be47
17 5c846aba6a4b9b0016c854f5:60cb9fd811b4d0ce45a597f7
18 572f897aad13160009008979:60cb9d3a4b2e3bfd3482ab01
19 

In [21]:
# save data before exclusions
for frame, out_path in (
    (survey_trials, "./data/all-survey-trials.json"),
    (attn_trials, "./data/all-attention-trials.json"),
    (mem_trials, "./data/all-memory-trials.json"),
    (bc_trials, "./data/all-nav-trials.json"),
):
    frame.to_json(out_path)


# exclusions
def _drop_excluded_rows(frame):
    """Keep rows not flagged 'exclude_trial' whose session is still in survey_trials."""
    keep = ~frame['exclude_trial'] & frame.sessionId.isin(survey_trials.sessionId)
    return frame[keep].reset_index(drop=True)


# survey_trials is filtered first: the trial tables are then restricted to
# the sessions that remain in it.
survey_trials = survey_trials[~survey_trials.exclude].reset_index(drop=True)
attn_trials = _drop_excluded_rows(attn_trials)
mem_trials = _drop_excluded_rows(mem_trials)
bc_trials = _drop_excluded_rows(bc_trials)

In [22]:
# final conditions counts
# First count participants per condition, then count how many conditions
# share each participant count.
participants_per_condition = survey_trials.condition.value_counts()
participants_per_condition.value_counts()

1    151
2      5
Name: condition, dtype: int64

In [23]:
# Number of distinct sessions remaining in each probe table.
n_attn_participants = len(attn_trials.sessionId.unique())
print(f"Attention participants: {n_attn_participants}")
n_mem_participants = len(mem_trials.sessionId.unique())
print(f"Memory participants: {n_mem_participants}")

Attention participants: 81
Memory participants: 80


In [24]:
# sanity check that bc trials and attention trial counts line up
# every remaining session must have more than 8*.8 breadcrumb rounds
rounds_per_session = bc_trials.groupby('sessionId')['round'].count()
assert all(rounds_per_session > (8*.8))
# every breadcrumb trial corresponds to exactly one probed (session, round) pair
attn_rounds = set(zip(attn_trials.sessionId, attn_trials['round']))
mem_rounds = set(zip(mem_trials.sessionId, mem_trials['round']))
assert len(bc_trials) == len(attn_rounds | mem_rounds)

# Bonusing

In [25]:
# Bonus per participant, restricted to participants who completed the task.
bonusdf = pd.DataFrame(bonusdata, columns=['psiturk_id', 'bonus'])
bonusdf['bonus'] = bonusdf['bonus'].apply(float)
completed_mask = bonusdf['psiturk_id'].isin(completed_task)
bonusdf = bonusdf[completed_mask]

# Print "<id part before ':'>,<bonus>" for every non-zero bonus (negative
# bonuses are clamped to 0 and skipped), followed by the running total.
tot_b = 0
for psiturk_id, bonus in zip(bonusdf['psiturk_id'], bonusdf['bonus']):
    b = max(bonus, 0)
    if b != 0.0:
        tot_b += b
        worker_id = psiturk_id.split(':')[0]
        print(f"{worker_id},{b:.2f}")
print(tot_b)

60b8f470036424e02185689b,1.20
6044ca22bc6235555362d5bb,1.20
604d0733919f70839ae9e1e7,1.20
5f310877a901af046cd5f569,1.20
5e474779c397b42123861f65,1.20
5bfd667f1fe1e50001406d4c,1.20
5823405287f6b90001f14290,1.20
5ee18255395f8655ae190e48,1.20
5effb47cf7b0240ac69994d4,1.20
5995b1d2f9db7d000189597e,1.20
60c67f7c98690351d580de6c,1.20
60c5384a49764a155deae81b,1.20
5f52c36a573b995ca8add5be,1.20
5af6e439672ea800010d4e92,1.20
5f985f3a4bc1cf120675f408,1.20
60bd350431ab0f24daafa597,1.20
5e3cb8b34a4f380aa4e72538,1.20
58556f5f71a3ef0001cc94f1,1.20
5bcddbacb51f5a000199ab61,1.20
60c5e4a15e894740e0ad0e90,1.20
608b47f65c8859645d0a0b23,1.20
5a69257731b87a0001c76293,1.20
5ea9c475ec2b531108f86a3f,1.20
5f8ac37961bf52177f0ec44e,1.20
5c28ef690091e40001ca5e99,0.30
5ce65aa96ad4e50001097566,1.20
603503a83b6ae7c69d2b45aa,1.05
6057863860c7fd45f3fbcd0d,1.20
60b78dc1eb46833a5c6b1235,1.20
5fa1b6e6e1083d319115d1eb,1.20
5ebe92a9912dc20b982ef31c,1.20
6064cfcfde0f09850a5c7fc8,1.20
59e245fac0d35a0001294e65,1.20
5f85f0cab3