# Parse PCIbex results

This notebook parses the messy results file `results_dev.csv` and produces an event-level DataFrame with:

- participant_id, group, label (block), no, item, condition, cb, left, right
- Self-paced reading events for regions r1..r7 with per-region RTs (later summarised as mean and std per participant, condition and region), plus question RTs and the choice selection (side + RT)
- Eye-tracker events

In [336]:
# Imports and paths
import re
from pathlib import Path
import pandas as pd
import numpy as np

# Project root that holds the raw results file.
# A raw string already keeps backslashes literal, so they must NOT be doubled:
# r"c:\\Users" contains two backslashes per separator, which only works because
# Windows pathlib collapses repeated separators.
ROOT = Path(r"c:\Users\parti\Projects\pcibex-hun")
RAW_FILE = ROOT / "results_dev.csv"

# Show enough rows/columns to inspect the wide event tables produced below
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 50)

print("Will parse:", RAW_FILE)

Will parse: c:\Users\parti\Projects\pcibex-hun\results_dev.csv


In [337]:
# Load the raw results file and separate "#" comment lines from data rows
raw_lines = RAW_FILE.read_text(encoding="utf-8").splitlines()

# Lines starting with "#" are kept as header comments; every other line that
# is not blank/whitespace-only is treated as a data row.
header_comments = [ln for ln in raw_lines if ln.startswith("#")]
rows = [ln for ln in raw_lines if not ln.startswith("#") and ln.strip()]

print(f"Comment lines: {len(header_comments)} | Data rows: {len(rows)}")
rows[:3]

Comment lines: 48 | Data rows: 111


['1758538209,307ffaaf2defd7df9a4f72f267526c6d,PennController,0,0,welcome,NULL,PennController,0,_Trial_,Start,1758538019972,prolific_id,NULL',
 '1758538209,307ffaaf2defd7df9a4f72f267526c6d,PennController,0,0,welcome,NULL,PennController,0,_Header_,Start,1758538019972,prolific_id,NULL',
 '1758538209,307ffaaf2defd7df9a4f72f267526c6d,PennController,0,0,welcome,NULL,PennController,0,_Header_,End,1758538019972,prolific_id,NULL']

## Events

In [338]:
# ...existing code...
# Define the base schema described by comments just before each block
base_cols = [
    "ResultsTime", "MD5", "Controller", "Order", "Inner", "Label",
    "LatinSquare", "PennElementType", "PennElementName", "Parameter",
    "Value", "EventTime", "prolific_id",
]

# Do NOT create "extraN" columns. Instead, infer the appended (trial) column names
# from the header comments, then map the trailing fields directly to those names.
import re

def extract_trial_fields_from_comments(comments):
    """Infer the names of the per-trial columns appended to each results row.

    The comment line that mentions the most known field names is treated as
    the column listing. Names are returned in the order in which that line
    lists them, because the caller assigns them to the trailing CSV values
    positionally; re-sorting them would attach values to the wrong columns.

    Parameters
    ----------
    comments : iterable of str
        Header comment lines of the results file.

    Returns
    -------
    list of str
        Field names in file order. Falls back to the canonical ten-field
        ordering when no comment line lists ``participant_id`` together with
        at least five other known fields.
    """
    canonical = ["participant_id", "group", "no", "item", "exp", "condition", "cb", "left", "right", "target"]

    # Whole-word matches only: without the \b anchors short names such as
    # "no", "exp" or "item" also match inside "not", "experiment", "items",
    # which lets ordinary prose comments masquerade as the column listing.
    # Common id spellings (participant-id, participant id, participantid) are accepted.
    token_re = re.compile(
        r"\b(?:prolific[_\-\s]?id|participant[_\-\s]?id|participantid|group|no|item|exp|condition|cb|left|right|target)\b",
        re.I,
    )

    def norm(s):
        # Strip punctuation/whitespace and map id variants to their canonical spelling
        s2 = re.sub(r"[^A-Za-z0-9_]", "", s).lower()
        if s2 == "participantid":
            s2 = "participant_id"
        if s2 == "prolificid":
            s2 = "prolific_id"
        return s2

    def known_in_order(candidates):
        # Keep known field names, de-duplicated, in first-seen order
        found = []
        for cand in candidates:
            n = norm(cand)
            if n in canonical and n not in found:
                found.append(n)
        return found

    # Prefer the comment line that contains the most known tokens (first wins on ties)
    best_line, max_hits = None, 0
    for c in comments:
        hits = len(token_re.findall(c))
        if hits > max_hits:
            best_line, max_hits = c, hits

    if best_line and max_hits >= 6:
        # First try explicit comma/tab separated tokens after an optional "label:" prefix
        text = best_line.split(":", 1)[-1]
        names = known_in_order(t.strip() for t in re.split(r"[,\t]+", text) if t.strip())

        # Otherwise fall back to the order of regex matches within the line
        if not names:
            names = known_in_order(m.group(0) for m in token_re.finditer(best_line))

        # Accept only a plausible listing: participant_id plus several other fields
        if "participant_id" in names and len(names) >= 6:
            return names

    # Final fallback: canonical full trial ordering (includes exp and target)
    return list(canonical)

TRIAL_FIELDS = extract_trial_fields_from_comments(header_comments)
print("Using trial fields:", TRIAL_FIELDS)

# Parse each data row into a record using base_cols + TRIAL_FIELDS depending on Label

def parse_row_to_record(line: str):
    """Turn one comma-separated results line into a dict of named fields.

    The leading values are paired with ``base_cols``. Whatever follows is
    interpreted according to the row's ``Label``:

    * ``practice`` / ``experiment``: trailing values are paired positionally
      with ``TRIAL_FIELDS`` (surplus values or surplus names are ignored).
    * ``participant_data``: the first trailing value is stored as
      ``participant_id``.
    * any other label: trailing values are discarded.
    """
    fields = [piece.strip() for piece in line.split(",")]
    n_base = len(base_cols)
    rec = dict(zip(base_cols, fields[:n_base]))
    trailing = fields[n_base:]

    # Some rows carry a literal column name as the first trailing value;
    # drop that single token so the remaining values line up with the names.
    header_like = {name.lower() for name in TRIAL_FIELDS + ["prolific_id"]}
    if trailing and str(trailing[0]).lower() in header_like:
        trailing = trailing[1:]

    label = rec.get("Label")
    if label in ("practice", "experiment"):
        # zip stops at the shorter sequence, so short rows simply leave fields unset
        rec.update(zip(TRIAL_FIELDS, trailing))
    elif label in ("participant_data",) and trailing:
        rec["participant_id"] = trailing[0]

    return rec

# Parse every data row into a dict and assemble the raw (unfiltered) event table
records = [parse_row_to_record(l) for l in rows]
raw_df = pd.DataFrame.from_records(records)

# Cast the known numeric columns; values that cannot be parsed become NaN
for c in ["ResultsTime", "Order", "Inner", "EventTime", "no", "item"]:
    if c in raw_df.columns:
        raw_df[c] = pd.to_numeric(raw_df[c], errors="coerce")

# Human-readable reception time (ResultsTime is interpreted as epoch seconds, UTC)
raw_df["results_time"] = pd.to_datetime(raw_df["ResultsTime"], unit="s", utc=True)

# Keep EventTime both as numeric milliseconds and as a proper timestamp (UTC)
# - event_time_ms: numeric milliseconds for computations (diffs, means)
# - event_time: pandas Timestamp (UTC) for readability
raw_df["event_time_ms"] = raw_df["EventTime"]
raw_df["event_time"] = pd.to_datetime(raw_df["event_time_ms"], unit="ms", utc=True)

################################################

# The trailing per-trial values were already given explicit names while parsing;
# what remains is normalising their types and guaranteeing the columns exist.

EXPECTED_FIELDS = ["participant_id", "group", "no", "item", "exp", "condition", "cb", "left", "right", "target"]

# Make sure that item and no are integers
for col in ["no", "item"]:
    if col in raw_df.columns:
        # Use pandas nullable Int64 type to keep NaNs and force integer dtype
        raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce").astype('Int64')
        
# Ensure every expected column exists, even if no parsed row supplied it
for col in EXPECTED_FIELDS:
    if col not in raw_df.columns:
        raw_df[col] = None

# Work on a copy so raw_df stays available as the unfiltered source
df = raw_df.copy()

# participant_id is deliberately left as parsed (no filling across rows).
# This branch is a no-op placeholder.
if "participant_id" in df.columns:
    # nothing to do beyond keeping column present; participant ids should appear in rows where provided

    # If you need to forward-fill within contiguous blocks uncomment:
    # df['participant_id'] = df['participant_id'].ffill()

    pass

# Backward-fill group within each participant_id (never across participants)
if "group" in df.columns and "participant_id" in df.columns:
    # normalize empty/NULL then backfill per participant using transform to avoid groupby.apply deprecation
    df["group"] = df["group"].replace({"": None, "NULL": None})
    df["group"] = df.groupby("participant_id")["group"].transform(lambda s: s.bfill())

# Derive block-type flags
# df["is_practice"] = df["Label"].eq("practice")
# df["is_experiment"] = df["Label"].eq("experiment")

# For convenience: also include a local-time copy if desired (commented)
# df["results_timestamp_local"] = df["results_timestamp"].dt.tz_convert("Europe/Budapest")

# Drop raw bookkeeping columns that are no longer needed
# (results_time / event_time keep the timing information)
df.drop(columns=["ResultsTime", "Controller", "Inner", "LatinSquare", "EventTime", "prolific_id"], inplace=True)

# Elapsed time between consecutive rows: this row's event_time_ms minus the previous row's.
# NOTE: the diff runs over ALL rows in file order, before any filtering below and
# without grouping by participant, so a value can be relative to a row that is later
# dropped, and it is negative whenever rows are not in chronological order.
if 'elapsed_ms' in df.columns:
    df.drop(columns=['elapsed_ms'], inplace=True)
df['elapsed_ms'] = df['event_time_ms'].diff()
df.drop(columns=['event_time_ms'], inplace=True)

# Drop rows whose Parameter both starts and ends with '_' (e.g. _Trial_, _Header_)
df = df[~df["Parameter"].str.match(r"^_.*_$", na=False)]

# Keep only rows where Label is practice or experiment
df = df[df["Label"].isin(["practice", "experiment"])]

# In the Value column, map the practice canvas names onto the regular ones
df["Value"] = df["Value"].replace({"right_canvas_practice": "right_canvas"})
df["Value"] = df["Value"].replace({"left_canvas_practice": "left_canvas"})

# Rename Label to label and Order to trial
df.rename(columns={"Label": "label", "Order": "trial"}, inplace=True)
df
# ...existing code...

Using trial fields: ['participant_id', 'group', 'no', 'item', 'exp', 'condition', 'cb', 'left', 'right', 'target']


Unnamed: 0,MD5,trial,label,PennElementType,PennElementName,Parameter,Value,participant_id,group,no,item,exp,condition,cb,left,right,target,results_time,event_time,elapsed_ms
28,307ffaaf2defd7df9a4f72f267526c6d,6,practice,EyeTracker,tracker,calibration,64,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:06.741000+00:00,5022.0
29,307ffaaf2defd7df9a4f72f267526c6d,6,practice,EyeTracker,tracker,Filename,httpsfarmpcibexnetrDigcCS/05d1b997-cbb5-be10-8...,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:17.294000+00:00,10553.0
30,307ffaaf2defd7df9a4f72f267526c6d,6,practice,Canvas,right_canvas,Print,,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:15.105000+00:00,-2189.0
31,307ffaaf2defd7df9a4f72f267526c6d,6,practice,Key,r0,PressedKey,,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:10.310000+00:00,-4795.0
32,307ffaaf2defd7df9a4f72f267526c6d,6,practice,Key,r1,PressedKey,,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:10.581000+00:00,271.0
33,307ffaaf2defd7df9a4f72f267526c6d,6,practice,Key,r2,PressedKey,,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:10.809000+00:00,228.0
34,307ffaaf2defd7df9a4f72f267526c6d,6,practice,Key,r3,PressedKey,,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:11.044000+00:00,235.0
35,307ffaaf2defd7df9a4f72f267526c6d,6,practice,Key,r4,PressedKey,,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:11.291000+00:00,247.0
36,307ffaaf2defd7df9a4f72f267526c6d,6,practice,Key,r5,PressedKey,,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:11.542000+00:00,251.0
37,307ffaaf2defd7df9a4f72f267526c6d,6,practice,Key,r6,PressedKey,,Maris444,c,990,900,,practice,n,A,B,A,2025-09-22 10:50:09+00:00,2025-09-22 10:48:11.804000+00:00,262.0


## Participants

In [339]:
def extract_participant_info(events_df, fallback_df=None):
    """Summarise one participant as a single-row DataFrame.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Event rows belonging to one participant.
    fallback_df : pandas.DataFrame, optional
        Larger frame consulted for any value missing from ``events_df``.
        Once the participant id is known, only fallback rows carrying that
        same ``participant_id`` are used, so a missing value is never filled
        in from a different participant.

    Returns
    -------
    pandas.DataFrame
        One row with ``participant_id``, ``group``, ``eyetracker_filename``
        and ``results_time``; values are strings, or None when not found.
    """
    def pick_first(df_like, col):
        # First value of `col` that is neither empty, "NULL" nor missing, as a string
        if df_like is None or col not in df_like.columns:
            return None
        s = df_like[col].replace({"": None, "NULL": None}).dropna().astype(str)
        return s.iloc[0] if len(s) > 0 else None

    def find_et_filename(df_like):
        # Value of the first row with PennElementType == 'EyeTracker' and
        # Parameter == 'Filename' (both compared case-insensitively)
        if df_like is None or not {"PennElementType", "Parameter", "Value"}.issubset(df_like.columns):
            return None
        pet = df_like["PennElementType"].astype(str).str.lower()
        par = df_like["Parameter"].astype(str).str.lower()
        mask = pet.eq("eyetracker") & par.eq("filename")
        vals = df_like.loc[mask, "Value"].replace({"": None}).dropna().astype(str)
        return vals.iloc[0] if len(vals) > 0 else None

    pid = pick_first(events_df, "participant_id") or pick_first(fallback_df, "participant_id")

    # Restrict the fallback to this participant's own rows. Without this the
    # "first non-empty value" below would come from whichever participant
    # happens to appear first in the fallback frame.
    if fallback_df is not None and pid is not None and "participant_id" in fallback_df.columns:
        fb_ids = fallback_df["participant_id"]
        fallback_df = fallback_df[fb_ids.notna() & fb_ids.astype(str).eq(pid)]

    # Prefer events_df, fall back to the (participant-restricted) fallback frame
    group = pick_first(events_df, "group") or pick_first(fallback_df, "group")
    results_time = pick_first(events_df, "results_time") or pick_first(fallback_df, "results_time")
    et = find_et_filename(events_df) or find_et_filename(fallback_df)

    return pd.DataFrame([
        {
            "participant_id": pid,
            "group": group,
            "eyetracker_filename": et,
            "results_time": results_time,
        }
    ])

# One summary row per participant; raw_df serves as the fallback source for
# values (e.g. the eye-tracker filename) missing from the filtered events.
df_nonnull = df[df['participant_id'].notna()]
parts = [
    extract_participant_info(participant_events, fallback_df=raw_df)
    for _, participant_events in df_nonnull.groupby('participant_id', sort=False)
]
participants_df = pd.concat(parts, ignore_index=True)
participants_df

Unnamed: 0,participant_id,group,eyetracker_filename,results_time
0,Maris444,c,httpsfarmpcibexnetrDigcCS/05d1b997-cbb5-be10-8...,2025-09-22 10:50:09+00:00


In [340]:
# Exclude the listed participants from both the participant table and the events
list_of_participants_to_remove = ['Yun888', 'Test999'] ####
participants_df = participants_df.loc[~participants_df['participant_id'].isin(list_of_participants_to_remove)]
df = df.loc[~df['participant_id'].isin(list_of_participants_to_remove)]

participants_df

Unnamed: 0,participant_id,group,eyetracker_filename,results_time
0,Maris444,c,httpsfarmpcibexnetrDigcCS/05d1b997-cbb5-be10-8...,2025-09-22 10:50:09+00:00


## Longform events

In [341]:
# Build `longform`: one row per trial with region RTs (r1..r7), question RT,
# choice RT and the chosen side.

# Filter for experiment/practice trials only
df_trials = df[df['label'].isin(['experiment', 'practice'])].copy()

# Sort for deterministic grouping
df_trials = df_trials.sort_values(['participant_id', 'no', 'item'])

# Region RTs: key presses on elements named r1..r7 (names compared case-insensitively)
region_names = [f"r{i}" for i in range(1, 8)]
is_region = df_trials['PennElementName'].str.lower().isin(region_names) & df_trials['Parameter'].str.lower().eq('pressedkey')
regions = df_trials[is_region].copy()
# Region number taken from the first lowercase "r<digit>" in the element name
regions['region_idx'] = regions['PennElementName'].str.extract(r'r(\d)')[0].astype(int)

# Build a full trial index to ensure all trials are present
# NOTE: 'exp' and 'target' are part of the key, matching the appended trial columns
trial_index_cols = ['results_time', 'participant_id', 'group', 'trial', 'label', 'no', 'item', 'exp', 'condition', 'cb', 'left', 'right', 'target']
# Only key columns actually present in df_trials are used; the result is one sorted row per distinct trial key
trial_index = df_trials.drop_duplicates(subset=[c for c in trial_index_cols if c in df_trials.columns])[[c for c in trial_index_cols if c in df_trials.columns]].sort_values([c for c in trial_index_cols if c in df_trials.columns])

# Region RT per trial = mean elapsed_ms of that region's key-press rows,
# pivoted so each region number becomes a column
region_mean = regions.pivot_table(
    index=[c for c in trial_index_cols if c in regions.columns],
    columns='region_idx',
    values='elapsed_ms',
    aggfunc='mean',
    fill_value=np.nan
)

# Reindex to include any trials that had no region rows
region_mean = region_mean.reindex(trial_index.set_index([c for c in trial_index_cols if c in trial_index.columns]).index, fill_value=np.nan)

# Rename numeric region columns to r1..r7
region_mean = region_mean.rename(columns={i: f"r{int(i)}" for i in region_mean.columns})

# Question RT: mean elapsed_ms of key presses on the element named 'question'
is_question = df_trials['PennElementName'].str.lower().eq('question') & df_trials['Parameter'].str.lower().eq('pressedkey')
questions = df_trials[is_question].copy()
questions = questions.sort_values(['participant_id', 'no', 'item'])
questions['question_rt'] = questions['elapsed_ms']
question_rt = questions.groupby([c for c in trial_index_cols if c in questions.columns])['question_rt'].mean()

# Choice: Selector 'selection' rows; RT = mean elapsed_ms, value = first selected element
is_choice = df_trials['PennElementType'].str.lower().eq('selector') & df_trials['Parameter'].str.lower().eq('selection')
choices = df_trials[is_choice].copy()
choices = choices.sort_values(['MD5', 'participant_id', 'no', 'item'])
choices['choice_rt'] = choices['elapsed_ms']
choices_value = choices.groupby([c for c in trial_index_cols if c in choices.columns])['Value'].first()
choice_rt = choices.groupby([c for c in trial_index_cols if c in choices.columns])['choice_rt'].mean()

# Assemble longform DataFrame; the Series below align on the shared trial-key index,
# so trials without a question/choice row get NaN
longform = region_mean.copy()

longform['question_rt'] = question_rt
longform['choice_rt'] = choice_rt
longform['choice'] = choices_value

# Rename choice values from left_canvas/right_canvas to left/right
longform['choice'] = longform['choice'].replace({"left_canvas": "left", "right_canvas": "right"})

# Subtract fixed offsets (same unit as elapsed_ms, i.e. milliseconds)
# NOTE(review): 1000 and 3000 are presumably fixed delays built into the
# experiment before a response is possible; confirm against the experiment script.
longform['question_rt'] = longform['question_rt'] - 1000
longform['choice_rt'] = longform['choice_rt'] - 3000

# Sort by participant and trial for deterministic ordering
# If 'trial' missing in index columns fallback to sorting by ['participant_id', 'no', 'item']
sort_cols = ['participant_id', 'trial'] if 'trial' in longform.index.names else ['participant_id', 'no', 'item']
longform = longform.sort_values(by=list(sort_cols), ascending=[True]*len(sort_cols))

# Turn the trial-key index levels back into ordinary columns
longform = longform.reset_index()
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,left,right,target,r1,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice
0,2025-09-22 10:50:09+00:00,Maris444,c,6,practice,990,900,,practice,n,A,B,A,271.0,228.0,235.0,247.0,251.0,262.0,291.0,,2194.0,left
1,2025-09-22 10:50:09+00:00,Maris444,c,7,practice,991,901,,practice,y,D,C,C,278.0,219.0,212.0,288.0,246.0,303.0,379.0,,2967.0,right
2,2025-09-22 10:50:09+00:00,Maris444,c,8,practice,992,902,,practice,n,E,F,E,878.0,387.0,255.0,348.0,586.0,330.0,345.0,3074.0,4205.0,right
3,2025-09-22 10:50:09+00:00,Maris444,c,10,experiment,34,12,1.0,exclusive,y,12b,12a,12a,266.0,235.0,239.0,234.0,232.0,220.0,274.0,,854.0,right
4,2025-09-22 10:50:09+00:00,Maris444,c,11,experiment,66,21,2.0,other-directed-x,y,tűzoltó,orvos,orvos,513.0,319.0,216.0,207.0,204.0,,290.0,615.0,213.0,right


In [342]:
# Number of trials (rows of longform) per condition
print(longform['condition'].value_counts())

condition
practice            3
exclusive           1
other-directed-x    1
Name: count, dtype: int64


## Filtering experiments

In [343]:
# Split trials by experiment number (exp == 1 or exp == 2).
# exp is compared numerically so that "1", 1 and 1.0 all select experiment 1;
# a string comparison against '1' would silently select nothing once the
# column is numeric. Blank / non-numeric values become NaN and match neither.
exp_number = pd.to_numeric(longform['exp'], errors='coerce')
exp1 = longform[exp_number.eq(1)].copy()
exp2 = longform[exp_number.eq(2)].copy()

## Reading Times

In [344]:
# Replot: every item as an individual line, faceted by condition, color by participant
import plotly.express as px

# Reshape longform to one row per trial × region (trial identified by participant_id + no);
# regions without a reading time are dropped
region_cols = [c for c in longform.columns if re.match(r"r\d+$", c)]
plot_df = longform.melt(
    id_vars=['participant_id', 'no', 'condition'],
    value_vars=region_cols,
    var_name='region',
    value_name='reading_time'
).dropna(subset=['reading_time'])

# Ensure region order r1..r7 on the x axis
full_region_order = [f"r{i}" for i in range(1, 8)]
plot_df['region'] = pd.Categorical(plot_df['region'], categories=full_region_order, ordered=True)

# Unique line id per participant × `no`, so each trial is drawn as its own trace
# while the colour still encodes the participant
plot_df['pid_item'] = plot_df['participant_id'].astype(str) + ' | no ' + plot_df['no'].astype(str)

fig = px.line(
    plot_df,
    x='region',
    y='reading_time',
    color='participant_id',        # color by participant
    line_group='pid_item',         # each item -> separate connected line
    facet_col='condition',
    category_orders={'region': full_region_order},
    markers=True,
    title='Reading times per item (each item = one line), faceted by condition, colored by participant',
    labels={'reading_time': 'Reading time (ms)', 'region': 'Region'}
)

fig.update_traces(mode='lines+markers', marker={'size':4}, opacity=0.7, hovertemplate=None)
fig.update_layout(legend_title_text='Participant', height=400)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))  # keep only the value after "=" in each facet label
fig.show()

In [345]:
# Recompute STDs here (do not rely on precomputed r?_std columns)
import plotly.express as px

# Melt longform to one row per participant × item × region (missing RTs dropped)
region_cols = [c for c in longform.columns if re.match(r"r\d+$", c)]
melted = (
    longform
    .melt(id_vars=['participant_id', 'item', 'condition'],
          value_vars=region_cols,
          var_name='region',
          value_name='reading_time')
    .dropna(subset=['reading_time'])
)

# Per-participant (across items) mean and std for each region × condition
pp_region_stats = (
    melted
    .groupby(['participant_id', 'condition', 'region'], as_index=False)
    .reading_time.agg(reading_time_mean='mean', reading_time_std='std')
)

# Per-participant question/choice mean+std across items (if present).
# They are stored under the pseudo-regions 'question' and 'choice' so they
# share the same column layout as the reading-time regions.
extras_pp = []
if 'question_rt' in longform.columns:
    q_pp = (
        longform
        .dropna(subset=['question_rt'])
        .groupby(['participant_id', 'condition'], as_index=False)
        .question_rt.agg(reading_time_mean='mean', reading_time_std='std')
    )
    q_pp['region'] = 'question'
    extras_pp.append(q_pp[['participant_id','condition','region','reading_time_mean','reading_time_std']])

if 'choice_rt' in longform.columns:
    c_pp = (
        longform
        .dropna(subset=['choice_rt'])
        .groupby(['participant_id', 'condition'], as_index=False)
        .choice_rt.agg(reading_time_mean='mean', reading_time_std='std')
    )
    c_pp['region'] = 'choice'
    extras_pp.append(c_pp[['participant_id','condition','region','reading_time_mean','reading_time_std']])

if extras_pp:
    pp_region_stats = pd.concat([pp_region_stats] + extras_pp, ignore_index=True, sort=False)

# Now aggregate across participants: use participant-level means to compute group mean and across-participant std
present_regions = pp_region_stats['region'].unique().tolist()
full_region_order = [f"r{i}" for i in range(1, 8)]
# keep consistent ordering (r1..r7 first) and append 'question'/'choice' if present
present_regions = [r for r in full_region_order if r in present_regions] + [r for r in ['question','choice'] if r in pp_region_stats['region'].unique()]

# Mean and std of the participant-level means per region × condition
# (the std is NaN when only one participant contributes)
agg_plot = (
    pp_region_stats
    .groupby(['region', 'condition'], as_index=False)
    .reading_time_mean.agg(mean_reading_time='mean', std_reading_time='std')
)

# --- FALLBACK STD ---
# Where the across-participant std is NaN, substitute the std of the individual
# trial values for that condition × region. This std pools the rows of ALL
# participants (the groupby does not include participant_id).
std_items = melted.groupby(['condition', 'region'])['reading_time'].std().reset_index().rename(columns={'reading_time':'std_items'})

# Same pooled std for question RTs, stored under the pseudo-region 'question'
if 'question_rt' in longform.columns:
    std_q = longform.dropna(subset=['question_rt']).groupby('condition')['question_rt'].std().reset_index().rename(columns={'question_rt':'std_items'})
    std_q['region'] = 'question'
    std_items = pd.concat([std_items, std_q[['condition','region','std_items']]], ignore_index=True)

# Same pooled std for choice RTs, stored under the pseudo-region 'choice'
if 'choice_rt' in longform.columns:
    std_c = longform.dropna(subset=['choice_rt']).groupby('condition')['choice_rt'].std().reset_index().rename(columns={'choice_rt':'std_items'})
    std_c['region'] = 'choice'
    std_items = pd.concat([std_items, std_c[['condition','region','std_items']]], ignore_index=True)

# Fill only the missing across-participant stds, then discard the helper column
agg_plot = agg_plot.merge(std_items, on=['condition', 'region'], how='left')
agg_plot['std_reading_time'] = agg_plot['std_reading_time'].fillna(agg_plot['std_items'])
agg_plot.drop(columns=['std_items'], inplace=True)
# --- END FALLBACK STD ---

# Keep only present regions and set categorical ordering
agg_plot = agg_plot[agg_plot['region'].isin(present_regions)].copy()
agg_plot['region'] = pd.Categorical(agg_plot['region'], categories=present_regions, ordered=True)

# Plot 1: mean reading times for regions r1..r7, one facet per condition.
# Error bars show std_reading_time: the across-participant std of participant means,
# or the pooled trial-level std wherever that was NaN and got filled in earlier.
region_only = agg_plot[agg_plot['region'].isin(full_region_order)]
fig1 = px.line(
    region_only,
    x='region',
    y='mean_reading_time',
    error_y='std_reading_time',
    color='condition',
    facet_col='condition',
    category_orders={'region': full_region_order},
    title='Mean Reading Times per Region<br>(Error Bars = Across-Participant Std Dev, Faceted by Condition)',
    labels={'mean_reading_time': 'Mean Reading Time (ms)', 'region': 'Region'}
)
fig1.update_traces(connectgaps=True, mode='lines+markers', marker={'size': 6}, line={'width': 2}, opacity=0.85)
fig1.update_yaxes(title_text='Mean reading time (ms)')
fig1.update_xaxes(title_text='Region')
fig1.update_layout(legend_title_text='Condition')
fig1.show()

# Plot 2: mean question RT per condition (rows stored under the pseudo-region 'question')
question_only = agg_plot[agg_plot['region'] == 'question']
fig2 = px.bar(
    question_only,
    x='condition',
    y='mean_reading_time',
    error_y='std_reading_time',
    color='condition',
    title='Mean Question RT by Condition (Error Bars = Std Dev)',
    labels={'mean_reading_time': 'Mean Question RT (ms)', 'condition': 'Condition'}
)
fig2.update_layout(legend_title_text='Condition')
fig2.show()

# Plot 3: mean decision (choice) RT per condition (rows stored under the pseudo-region 'choice')
choice_only = agg_plot[agg_plot['region'] == 'choice']
fig3 = px.bar(
    choice_only,
    x='condition',
    y='mean_reading_time',
    error_y='std_reading_time',
    color='condition',
    title='Mean Decision RT by Condition (Error Bars = Std Dev)',
    labels={'mean_reading_time': 'Mean Decision RT (ms)', 'condition': 'Condition'}
)
fig3.update_layout(legend_title_text='Condition')
fig3.show()

## Eye-tracking

In [346]:
import pandas as pd
import requests
from io import StringIO

def download_and_save_eyetracking_data(out_dir=None):
    """Download each participant's eye-tracking CSV and save it as <participant_id>.csv.

    Iterates over ``participants_df``; rows without an eye-tracker filename or
    participant id are skipped. Each file is fetched from the eye-tracking
    server, parsed as CSV and written to ``out_dir``.

    Parameters
    ----------
    out_dir : path-like, optional
        Target directory. Defaults to ``ROOT / 'eyetracking_data'``, the
        directory the later cells read from, so the download no longer
        depends on the current working directory. The directory is created
        when missing; previously every save failed (and was only reported as
        "Failed for ...") if it did not exist yet.

    Notes
    -----
    Best effort: a failure for one participant is printed and the loop
    continues with the next one.
    """
    out_dir = ROOT / 'eyetracking_data' if out_dir is None else Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    eturl = "https://mondo1.dreamhosters.com/script.php?experiment="
    for _, row in participants_df.iterrows():
        et_filename = row['eyetracker_filename']
        participant_id = row['participant_id']
        if pd.notnull(et_filename) and pd.notnull(participant_id):
            et_file = eturl + et_filename
            try:
                r = requests.get(et_file, timeout=15)
                r.raise_for_status()
                df_et = pd.read_csv(StringIO(r.text))
                df_et.to_csv(out_dir / f"{participant_id}.csv", index=False)
            except Exception as e:
                # Deliberately broad: network, parsing and file errors are all
                # reported per participant without aborting the whole download.
                print(f"Failed for {participant_id}: {e}")
    print("Done!")
# Example usage:
download_and_save_eyetracking_data()

Done!


In [347]:
# Fix eyetracking csv trial numbering
import os
import pandas as pd
from glob import glob

# Renumber the trial column of every downloaded eye-tracking CSV and overwrite the file.
# NOTE(review): this cell is not idempotent. After one pass the rows following the
# last trial 8 are numbered 10, 11, ...; running it again on the same files would
# remap those 10 and 11 to 7 and 8. Re-download the files before re-running.
# NOTE(review): `df` is rebound here to an eye-tracking frame, replacing the events
# DataFrame that earlier cells stored under the same name.
et_dir = os.path.join(ROOT, 'eyetracking_data')

for csv_file in glob(os.path.join(et_dir, '*.csv')):
    df = pd.read_csv(csv_file)
    # Renumbering is done in place; row order is never changed
    # Use the 'trial' column, else the first column whose name contains 'trial'
    trial_col = 'trial' if 'trial' in df.columns else [c for c in df.columns if 'trial' in c][0]
    # Map 9,10,11 to 6,7,8
    mapping = {9: 6, 10: 7, 11: 8}
    df[trial_col] = df[trial_col].replace(mapping)
    # Find the index of the last occurrence of 8
    last_mapped_idx = df[df[trial_col] == 8].index.max()
    if last_mapped_idx is not None and last_mapped_idx + 1 < len(df):
        # Rows after the last 8 are renumbered consecutively starting at 10:
        # the counter advances each time the ORIGINAL trial value changes
        # between adjacent rows.
        next_trial = 10
        prev_trial_val = None
        for i in range(last_mapped_idx + 1, len(df)):
            current_trial_val = df.at[i, trial_col]
            if prev_trial_val is not None and current_trial_val != prev_trial_val:
                next_trial += 1
            df.at[i, trial_col] = next_trial
            prev_trial_val = current_trial_val
    df.to_csv(csv_file, index=False)
print("All eyetracking files fixed and overwritten.")

All eyetracking files fixed and overwritten.


In [348]:
# Eye tracking data with filtering out neutral gaze
import os
import pandas as pd
import numpy as np
import plotly.express as px
from glob import glob

# Directory containing eyetracking data
et_dir = os.path.join(ROOT, 'eyetracking_data')

# Helper to parse a single participant's eyetracking file
def parse_eyetracking_file(filepath):
    """Load one participant's eye-tracking CSV without the neutral-gaze samples.

    Column names are lower-cased, and rows where both ``_left_canvas`` and
    ``_right_canvas`` equal 0 are removed.

    Returns
    -------
    (pandas.DataFrame, str)
        The filtered frame and the name of its trial column: ``'trial'`` if
        present, otherwise the first column whose name contains ``'trial'``.
    """
    frame = pd.read_csv(filepath)
    frame.columns = [name.lower() for name in frame.columns]

    # Keep a sample when at least one canvas flag is non-zero
    on_some_canvas = (frame['_left_canvas'] != 0) | (frame['_right_canvas'] != 0)
    frame = frame[on_some_canvas].copy()

    trial_col = 'trial' if 'trial' in frame.columns else [name for name in frame.columns if 'trial' in name][0]
    return frame, trial_col

# Build one summary row per participant × trial from the eye-tracking files,
# skipping participants listed in list_of_participants_to_remove.
# NOTE(review): `df` is rebound here to each eye-tracking frame in turn.
et_results = []
for csv_file in glob(os.path.join(et_dir, '*.csv')):
    # The file name (without extension) is the participant id
    participant_id = os.path.splitext(os.path.basename(csv_file))[0]
    if participant_id in list_of_participants_to_remove:
        continue
    df, trial_col = parse_eyetracking_file(csv_file)
    for trial, g in df.groupby(trial_col):
        # "Dwell" per side = largest 'times' value among that side's samples, divided by 1000.
        # NOTE(review): this is a maximum, not a summed duration; confirm that 'times'
        # is a per-trial cumulative value in ms for which the maximum is the intended measure.
        dwell_left = g.loc[g['_left_canvas'] == 1, 'times'].max() / 1000
        dwell_right = g.loc[g['_right_canvas'] == 1, 'times'].max() / 1000
        total_dwell = dwell_left + dwell_right
        prop_left = dwell_left / total_dwell if total_dwell > 0 else np.nan
        prop_right = dwell_right / total_dwell if total_dwell > 0 else np.nan
        # Ties, and trials where a dwell value is NaN, are labelled 'right'
        dominant = 'left' if dwell_left > dwell_right else 'right'
        # Count runs of consecutive samples on each canvas: a run starts where
        # the flag differs from the previous row and is 1
        left_changes = (g['_left_canvas'] != g['_left_canvas'].shift()).astype(int)
        n_fix_left = ((g['_left_canvas'] == 1) & (left_changes == 1)).sum()
        right_changes = (g['_right_canvas'] != g['_right_canvas'].shift()).astype(int)
        n_fix_right = ((g['_right_canvas'] == 1) & (right_changes == 1)).sum()
        # Per-sample state L / R / N; transitions = number of state changes between adjacent samples
        canvas_state = np.where(g['_left_canvas'] == 1, 'L', np.where(g['_right_canvas'] == 1, 'R', 'N'))
        transitions = np.sum(canvas_state[1:] != canvas_state[:-1])
        et_results.append({
            'participant_id': participant_id,
            'trial': trial,
            'dominant': dominant,
            'prop_left': round(prop_left, 2) if not np.isnan(prop_left) else np.nan,
            'prop_right': round(prop_right, 2) if not np.isnan(prop_right) else np.nan,
            'total_dwell': round(total_dwell, 2),
            'dwell_left': dwell_left,
            'dwell_right': dwell_right,
            'n_fix_left': n_fix_left,
            'n_fix_right': n_fix_right,
            'transitions': transitions
        })

et_df = pd.DataFrame(et_results)

# Remove certain participants (already skipped in the loop; kept as a safeguard)
et_df = et_df[~et_df['participant_id'].isin(list_of_participants_to_remove)]
et_df

Unnamed: 0,participant_id,trial,dominant,prop_left,prop_right,total_dwell,dwell_left,dwell_right,n_fix_left,n_fix_right,transitions
0,Gabor111,6,left,0.73,0.27,4.03,2.951,1.081,2,1,2
1,Gabor111,7,right,0.45,0.55,14.29,6.433,7.853,13,14,26
2,Gabor111,8,left,0.61,0.39,6.38,3.893,2.486,15,14,28
3,Gabor222,6,left,0.53,0.47,8.84,4.725,4.117,2,2,3
4,Gabor222,7,right,0.22,0.78,2.32,0.515,1.805,4,5,8
5,Gabor222,8,left,0.59,0.41,11.7,6.901,4.8,12,12,23
6,Maris111,6,left,0.98,0.02,4.58,4.484,0.094,2,1,2
7,Maris111,7,right,0.43,0.57,9.75,4.154,5.601,7,6,12
8,Maris111,8,left,0.52,0.48,13.71,7.187,6.524,11,11,21
9,Maris111,10,left,0.79,0.21,3.8,2.987,0.816,3,2,4


## Merge ET and behavioral data

In [349]:
# Number of per-trial eye-tracking summary rows for each participant
print(et_df['participant_id'].value_counts())

participant_id
Maris444    5
Maris111    4
Maris222    4
Gabor111    3
Gabor222    3
Yun777      3
Yun999      3
Name: count, dtype: int64


In [350]:
# Overwrite 'trial' values in et_df per participant using the trial order of region_mean,
# then merge the eye-tracking summary onto the behavioural longform table.
# NOTE(review): trials are matched purely by position, so this assumes the et_df rows
# of a participant are in the same order as that participant's entries in region_mean.
# NOTE(review): et_df is replaced by the merged table at the end, so this cell
# cannot be re-run without first rebuilding et_df.
for participant in et_df['participant_id'].unique():
    # Get the ordered list of trial numbers for this participant from region_mean
    # region_mean index follows trial_index_cols:
    # (results_time, participant_id, group, trial, label, no, item, exp, condition, cb, left, right, target)
    # so position 1 is participant_id and position 3 is trial
    idx = region_mean.index
    participant_trials = [i[3] for i in idx if i[1] == participant]
    
    # Select this participant's rows in et_df
    mask = et_df['participant_id'] == participant
    n_trials = mask.sum()
    
    # Only overwrite if counts match
    if len(participant_trials) == n_trials:
        et_df.loc[mask, 'trial'] = participant_trials
    else:
        print(f"Warning: trial count mismatch for {participant} (region_mean: {len(participant_trials)}, et_df: {n_trials})")
        
# # Merge eyetracking data with longform data on participant_id
# longform = pd.merge(
#     longform,
#     et_df,
#     how='left',
#     left_on=['participant_id', 'trial'],
#     right_on=['participant_id', 'trial']
# )

# longform

# Left-merge on participant_id + trial: every longform trial is kept, and trials
# without eye-tracking data get NaN in the eye-tracking columns
et_df = pd.merge(
    longform,
    et_df,
    how='left',
    left_on=['participant_id', 'trial'],
    right_on=['participant_id', 'trial']
)

et_df



Unnamed: 0,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,left,right,target,r1,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,dominant,prop_left,prop_right,total_dwell,dwell_left,dwell_right,n_fix_left,n_fix_right,transitions
0,2025-09-22 10:50:09+00:00,Maris444,c,6,practice,990,900,,practice,n,A,B,A,271.0,228.0,235.0,247.0,251.0,262.0,291.0,,2194.0,left,left,0.7,0.3,3.08,2.167,0.918,2,1,2
1,2025-09-22 10:50:09+00:00,Maris444,c,7,practice,991,901,,practice,y,D,C,C,278.0,219.0,212.0,288.0,246.0,303.0,379.0,,2967.0,right,right,0.48,0.52,5.56,2.673,2.89,7,8,14
2,2025-09-22 10:50:09+00:00,Maris444,c,8,practice,992,902,,practice,n,E,F,E,878.0,387.0,255.0,348.0,586.0,330.0,345.0,3074.0,4205.0,right,right,0.46,0.54,7.73,3.584,4.149,8,9,16
3,2025-09-22 10:50:09+00:00,Maris444,c,10,experiment,34,12,1.0,exclusive,y,12b,12a,12a,266.0,235.0,239.0,234.0,232.0,220.0,274.0,,854.0,right,right,0.36,0.64,1.3,0.467,0.833,2,3,4
4,2025-09-22 10:50:09+00:00,Maris444,c,11,experiment,66,21,2.0,other-directed-x,y,tűzoltó,orvos,orvos,513.0,319.0,216.0,207.0,204.0,,290.0,615.0,213.0,right,right,0.34,0.66,1.76,0.601,1.158,2,3,4


## Plot ET

In [351]:
import plotly.graph_objects as go

transparent_color = 'hsva(0, 0%, 100%, 0)'

# Display name and bar colour for each canvas-state code produced by
# _canvas_segments below.
SEGMENT_STYLE = {
    'L': ('Left', 'steelblue'),
    'R': ('Right', 'orange'),
    'N': ('None', transparent_color),
    'B': ('Both', 'red'),
}


def _canvas_segments(samples):
    """Split one trial's gaze samples into runs of constant canvas state.

    Parameters
    ----------
    samples : pandas.DataFrame
        Rows of a single trial, in recording order, with the columns
        '_left_canvas', '_right_canvas' and 'times'.
        NOTE(review): presumably the canvas columns are 0/1 flags for "gaze
        is on that canvas" -- confirm against the eye-tracking export.

    Returns
    -------
    list of (str, float)
        One (state, seconds) pair per run, in order. State is 'L' (left
        only), 'R' (right only), 'N' (neither) or 'B' (anything else).
        Seconds is (last 'times' - first 'times') / 1000 within the run,
        i.e. 'times' is treated as milliseconds. Empty input gives [].
    """
    if len(samples) == 0:
        return []

    on_left = samples['_left_canvas'].to_numpy()
    on_right = samples['_right_canvas'].to_numpy()
    stamps = samples['times'].to_numpy()

    state = np.where(
        (on_left == 1) & (on_right == 0), 'L',
        np.where(
            (on_right == 1) & (on_left == 0), 'R',
            np.where((on_left == 0) & (on_right == 0), 'N', 'B')
        )
    )

    # A new run starts wherever the state differs from the previous sample.
    change_idx = np.where(state[1:] != state[:-1])[0] + 1
    run_starts = np.concatenate(([0], change_idx))
    run_ends = np.concatenate((change_idx, [len(state)]))

    return [
        (str(state[start]), (stamps[end - 1] - stamps[start]) / 1000)
        for start, end in zip(run_starts, run_ends)
    ]


fig = go.Figure()

# Collect all y-tick labels and which should be bold
yticks = []
yticks_bold = set()

for participant in [p for p in et_df['participant_id'].unique() if p not in list_of_participants_to_remove]:
    # Read the participant's sample file once, not once per trial.
    et_file = os.path.join(et_dir, f"{participant}.csv")
    if not os.path.exists(et_file):
        # A participant without a sample file is skipped with a warning
        # instead of aborting the whole figure.
        print(f"Warning: no eye-tracking file for {participant} ({et_file}); skipping")
        continue
    df_et = pd.read_csv(et_file)
    df_et.columns = [c.lower() for c in df_et.columns]
    trial_col = 'trial' if 'trial' in df_et.columns else [c for c in df_et.columns if 'trial' in c][0]

    df_part = et_df[et_df['participant_id'] == participant]
    # Numeric-looking trial ids are ordered numerically, anything else as-is.
    sorted_trials = sorted(df_part['trial'].unique(), key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x)
    for trial in sorted_trials:
        label = f"{participant} - Trial {trial}"
        yticks.append(label)

        # Bold the tick label when the dominant gaze side equals the choice.
        et_row = df_part[df_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        lf_row = longform[(longform['participant_id'] == participant) & (longform['trial'] == trial)]
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        if dominant is not None and choice is not None and dominant == choice:
            yticks_bold.add(label)

        trial_samples = df_et[df_et[trial_col] == trial]
        # A trial with no samples still gets one zero-width transparent
        # trace, rather than being drawn as a zero-width 'Left' bar.
        segments = _canvas_segments(trial_samples) or [('N', 0)]
        for state, seg_time in segments:
            side, color = SEGMENT_STYLE[state]
            fig.add_trace(go.Bar(
                x=[seg_time],
                y=[label],
                name=side,
                marker_color=color,
                orientation='h',
                showlegend=False
            ))

# Build tickvals and ticktext with bolding
tickvals = yticks
ticktext = [f"<b>{t}</b>" if t in yticks_bold else t for t in yticks]

fig.update_layout(
    template='plotly_white', #_dark
    barmode='stack',
    title='Eye-Tracking: Stacked Dwell Time Segments (Transitions) per Trial',
    xaxis_title='Seconds',
    yaxis_title='Participant - Trial',
    legend_title='Side',
    height=1200,
    yaxis=dict(
        tickvals=tickvals,
        ticktext=ticktext
    )
)
fig.show()