# Exp 4b Hovering - Process Data

In [1]:
import csv, json
import datetime
import os
import sys
import urllib.error
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [1]:
# Load experiment parameters. Context managers make sure the config and
# credential files are closed as soon as they have been parsed.
with open("config.json", 'r') as config_file:
    expConfig = json.load(config_file)
EXPERIMENT_CODE_VERSION = expConfig['params']['EXPERIMENT_CODE_VERSION']
cond1 = expConfig['timelines'][0]

# Server credentials live outside this directory. Do not print CREDENTIALS:
# notebook outputs are saved with the file and would leak the password.
with open("../credentials.json", 'r') as credentials_file:
    CREDENTIALS = json.load(credentials_file)
EXPURL = CREDENTIALS["EXPURL"]
USERNAME = CREDENTIALS["USERNAME"]
PASSWORD = CREDENTIALS["PASSWORD"]

# Local directory for the raw downloads; exist_ok avoids the check-then-create race.
DESTDIR = "./data/"
os.makedirs(DESTDIR, exist_ok=True)

# (remote path relative to EXPURL, local destination file) for each raw table.
sourcedest = [
    (f"data/{EXPERIMENT_CODE_VERSION}/trialdata", DESTDIR+"rawtrialdata.csv"),
    (f"data/{EXPERIMENT_CODE_VERSION}/questiondata", DESTDIR+"rawquestiondata.csv"),
    (f"data/{EXPERIMENT_CODE_VERSION}/bonusdata", DESTDIR+"rawbonusdata.csv")
]

# HTTP basic-auth handler used by the download step for every URL under EXPURL.
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, EXPURL, USERNAME, PASSWORD)
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

In [3]:
# Download each raw table once, authenticating through `handler`.
# The opener is built and installed a single time; urlretrieve then uses it,
# so no separate (and never-closed) `opener.open(...)` request is needed.
opener = urllib.request.build_opener(handler)
urllib.request.install_opener(opener)
try:
    for SOURCE, DEST in sourcedest:
        urllib.request.urlretrieve(EXPURL+SOURCE, DEST)
except urllib.error.HTTPError as e:
    # Best effort: on an HTTP failure (e.g. 401) fall back to whatever files
    # are already present in DESTDIR from an earlier download.
    print("HTTPError raised: ", e)
    print("Continuing without downloading data")

HTTPError raised:  HTTP Error 401: UNAUTHORIZED
Continuing without downloading data


In [4]:
# Some fields hold very large JSON blobs, so raise the csv field-size limit as
# far as the platform allows (sys.maxsize overflows where C long is 32-bit).
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)


def _read_csv_rows(path):
    """Return every row of the CSV file at `path` as a list of string lists."""
    # newline='' is what the csv module expects; the file is closed on exit.
    with open(path, 'r', newline='') as csv_file:
        return list(csv.reader(csv_file))


trialdata = _read_csv_rows(sourcedest[0][1])
questiondata = _read_csv_rows(sourcedest[1][1])
bonusdata = _read_csv_rows(sourcedest[2][1])

In [5]:
# Parse the raw trial rows into one record per non-debug trial.
# Each raw row is (psiturk_id, idx, datetime_ms, entry), where `entry` is a JSON string.
TRIAL_COLUMNS = [
    "psiturk_id", "idx", "datetime_ms", "data",
    "trial_type", "time_elapsed", "internal_node_id",
]
trials = []
# Rows whose stored index disagrees with the index inside the JSON entry:
# (exception, psiturk_id, idx, datetime, entry). They are still kept in `trials`.
errors = []
for psiturk_id, idx, datetime_ms, entry in trialdata:
    if "debug" in psiturk_id:
        continue
    # Milliseconds since the epoch -> datetime (fromtimestamp uses local time).
    datetime_ms = datetime.datetime.fromtimestamp(float(datetime_ms)/1000.)
    idx = int(idx)
    entry = json.loads(entry)
    # Explicit comparison instead of `assert`, so the check also runs under `python -O`.
    if idx != entry['trial_index']:
        mismatch = AssertionError(
            f"row idx {idx} != entry trial_index {entry['trial_index']}"
        )
        errors.append((mismatch, psiturk_id, idx, datetime_ms, entry))
    trials.append({
        "psiturk_id": psiturk_id,
        "idx": idx,
        "datetime_ms": datetime_ms,
        "data": entry.get("data", None),
        "trial_type": entry['trial_type'],
        "time_elapsed": entry['time_elapsed'],
        # Second component of the dash-separated node id, as a float.
        "internal_node_id": float(entry["internal_node_id"].split("-")[1])
    })
# Debug sessions were already skipped in the loop, so no second filter is needed.
# Passing the columns keeps the frame well-formed even when no rows survive.
trials = pd.DataFrame(trials, columns=TRIAL_COLUMNS)

In [6]:
# Survey-level data: one row per participant, one column per survey question.
is_survey_trial = trials.trial_type.isin(["CustomSurvey", "SaveGlobalStore"])
survey_rows = trials[is_survey_trial]

# Participant ids next to the expanded `data` dicts (empty/missing data -> no columns).
survey_ids = survey_rows[["psiturk_id"]].reset_index(drop=True)
survey_answers = pd.DataFrame([d or {} for d in survey_rows["data"].values])
survey_wide = pd.concat([survey_ids, survey_answers], axis=1)

# Long format -> drop unanswered questions and exact repeats -> back to wide,
# now with a single row per participant.
survey_trials = (
    survey_wide
    .melt(id_vars='psiturk_id', var_name="question", value_name="response")
    .dropna(subset=["response"])
    .drop_duplicates()
    .pivot(index='psiturk_id', columns="question")
    .droplevel(0, axis=1)  # remove the outer "response" column level
    .reset_index()
)
survey_trials.columns.name = None
completed_task = survey_trials['psiturk_id']

In [7]:
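# Debugging aid (disabled): print each participant's free-text comments together
# with their session id and gender. Uncomment to inspect responses manually.
# for _, row in survey_trials.iterrows():
#     if row.generalComments:
#         print(row.sessionId)
#         print(row.generalComments)
#         print(row.gender)
#         print()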

In [8]:
# Main trials: flatten every GridNavigationHoverReveal record into one row.
def _flatten_nav_record(record):
    """Merge `trialparams` with the record's other top-level fields.

    The nested 'trialparams' and 'taskparams' entries themselves are left out;
    on a key clash the top-level field overrides the 'trialparams' value.
    """
    flat = dict(record['trialparams'])
    for key, value in record.items():
        if key not in ('trialparams', 'taskparams'):
            flat[key] = value
    return flat


is_nav_trial = trials['trial_type'] == 'GridNavigationHoverReveal'
nav_rows = trials[is_nav_trial]
nav_ids = nav_rows[['psiturk_id']].reset_index(drop=True)
nav_fields = pd.DataFrame([_flatten_nav_record(d) for d in nav_rows['data']])
nav_trials = pd.concat([nav_ids, nav_fields], axis=1)

# Keep only rounds marked 'main' and the columns used downstream.
kept_columns = [
    'psiturk_id', 'sessionId', 'round', 'grid', 'transformation',
    'navigationData', 'mouseMoveData', 'mouseHoverData',
]
is_main_round = nav_trials['roundtype'] == 'main'
nav_trials = nav_trials[is_main_round].reset_index(drop=True)[kept_columns]

In [9]:
from collections import defaultdict

# Minimum fraction of rounds with any hovering; sessions below it are excluded.
HOVER_TRIAL_EXCLUSION_CUTOFF = 0.5
# Rounds per session, used as the denominator of that fraction.
TOTAL_ROUNDS = 12
def hoverdurations(hoverdata, only_obs=True):
    """Sum the hover time per grid element for one trial.

    Parameters
    ----------
    hoverdata : iterable of dict
        Hover events. Each event must provide 'visible', 'obstacle',
        'enterTime' and 'exitTime'.
    only_obs : bool, default True
        If True, count only events whose 'obstacle' is exactly one digit
        character '0'-'9'. If False, count every event except those whose
        'obstacle' is '.'.

    Returns
    -------
    dict
        Each counted 'obstacle' label mapped to the summed
        ``exitTime - enterTime`` of its events, plus the key 'any_hovering',
        which is True when at least one event was counted.

    Raises
    ------
    AssertionError
        If an event has a falsy 'visible' value.
    """
    # Exact single-character membership: the previous substring test against
    # '0123456789' also accepted '' and multi-digit runs such as '12'.
    digit_labels = frozenset('0123456789')
    dur = defaultdict(int)
    for e in hoverdata:
        assert e['visible'], "hover event recorded while the element was not visible"
        label = e['obstacle']
        if only_obs:
            if label not in digit_labels:
                continue
        elif label == '.':
            continue
        dur[label] += e['exitTime'] - e['enterTime']
    result = dict(dur)
    # Flag is computed from the counted labels only, before it is added as a key.
    result['any_hovering'] = len(dur) > 0
    return result

# Build the per-trial, per-obstacle hover table and apply the exclusion rules.
OBSTACLE_IDS = list('01234')  # obstacle labels analysed per trial
TRIAL_ID_COLS = ['psiturk_id', 'sessionId', 'round', 'grid', 'transformation']

# One row per trial: identifying columns plus summed hover duration per obstacle.
hover_df = pd.concat([
    nav_trials[TRIAL_ID_COLS],
    nav_trials['mouseHoverData'].apply(lambda d: pd.Series(hoverdurations(d, only_obs=True)))
], axis=1)
# An obstacle nobody hovered over produces no column; add it as all-NaN so the
# melt below cannot fail with a KeyError.
for obstacle_id in OBSTACLE_IDS:
    if obstacle_id not in hover_df.columns:
        hover_df[obstacle_id] = np.nan
# Guarantee a real boolean column so `~` and boolean indexing behave as intended.
hover_df['any_hovering'] = hover_df['any_hovering'].astype(bool)

all_sid = hover_df.sessionId.unique()
n_no_hover_trials = (~hover_df['any_hovering']).sum()
print(f"Number of trials with no hovering: {n_no_hover_trials} of {len(hover_df)}")

# Exclude sessions that hovered on too small a fraction of their rounds.
hover_fraction = hover_df.groupby("sessionId")['any_hovering'].sum() / TOTAL_ROUNDS
sid_exclude = hover_fraction.index[hover_fraction < HOVER_TRIAL_EXCLUSION_CUTOFF]
print(f"Participants excluded: ({len(sid_exclude)} of {len(all_sid)}) {list(sid_exclude)}")

# Drop trials without hovering and all trials of excluded sessions, then go to
# long format: one row per (trial, obstacle).
hover_df = hover_df[hover_df['any_hovering']]
hover_df = hover_df[~hover_df['sessionId'].isin(sid_exclude)]
hover_df = hover_df.melt(
    value_vars=OBSTACLE_IDS,
    id_vars=TRIAL_ID_COLS,
    value_name="hoverduration",
    var_name="obstacle"
)

# log() of a zero duration is -inf, which would turn the mean, std and cutoff
# into -inf/NaN. Treat non-positive durations like "not hovered" (NaN) instead.
positive_duration = hover_df['hoverduration'].where(hover_df['hoverduration'] > 0)
hover_df['log_hoverduration'] = np.log(positive_duration)
mean_loghd = hover_df['log_hoverduration'].mean()
std_loghd = hover_df['log_hoverduration'].std()
log_cutoff = mean_loghd - 2*std_loghd
# An obstacle counts as hovered when its log duration is above mean - 2 SD.
# Comparisons with NaN are False, so missing durations are never "hovered".
hover_df['hovered'] = hover_df['log_hoverduration'] > log_cutoff
print(f"log hoverduration cutoff: {log_cutoff:.2f}")
print(f"hoverduration cutoff: {np.exp(log_cutoff):.2f} ms")

# Sanity check: every remaining session kept at least half of its rows.
rows_per_session = hover_df.groupby('sessionId')['round'].count()
assert (rows_per_session / (TOTAL_ROUNDS*len(OBSTACLE_IDS)) >= .5).all(), \
    "a retained session has fewer than half of its trial rows"

Number of trials with no hovering: 130 of 2256
Participants excluded: (9 of 188) ['4UT33mdRRD', 'LxjZeOqzBl', 'Ujc1OfHC30', 'XvVfo1HNAK', 'g1oVbEsZw6', 'hIDMKB74JZ', 'idxCIL3VZ7', 'pTqCRtp0Vc', 'zdfb1skG04']
log hoverduration cutoff: 3.82
hoverduration cutoff: 45.45 ms


In [10]:
# Keep survey rows only for sessions that remain in hover_df, then drop the
# psiturk_id column. errors="ignore" lets this cell be re-run after the column
# has already been removed instead of failing with a KeyError.
retained_sessions = hover_df.sessionId.unique()
survey_trials = survey_trials[survey_trials.sessionId.isin(retained_sessions)]
survey_trials = survey_trials.drop(columns="psiturk_id", errors="ignore")

In [11]:
# Write the processed tables to JSON, in the same order as before.
# NOTE(review): hover_df and nav_trials still carry the psiturk_id column that
# was dropped from survey_trials; confirm that exporting it here is intended.
for frame, out_path in (
    (hover_df, "./data/hovering-data-trials.json"),
    (nav_trials, "./data/all-navigation-trials.json"),
    (survey_trials, "./data/participantdata.json"),
):
    frame.to_json(out_path)