# Parse PCIbex results

This notebook parses the messy results file `results_dev.csv` and produces an event-level and longform DataFrame with:

- participant_id, group, label (block), no, item, condition, cb, left, right
- Self-paced reading region events (r1..r7) with per-region RTs, including mean and std per participant, item, condition, region question RTs and choice selection (side + RT)
- Eye-tracker events

In [128]:
# Imports and paths
import re
import os
from glob import glob
from pathlib import Path
import pandas as pd
import numpy as np
import requests
from io import StringIO
import plotly.express as px

ROOT = Path(r"c:\\Users\\parti\\Projects\\pcibex-hun")
raw_file = ROOT / "results_prod.csv"
plots = ROOT / "plots"
plots.mkdir(exist_ok=True)

print("Will parse:", raw_file)

Will parse: c:\Users\parti\Projects\pcibex-hun\results_prod.csv


In [129]:
# Nord-themed Plotly template (dark background, snow fonts, nord colorway)
import plotly.io as pio
import plotly.graph_objects as go

nord = ['#5e81ac', '#a3be8c', '#ebcb8b', '#d19a66', '#bf616a', "#b97ec9", '#4c566a', '#8fbcbb', '#88c0d0', '#81a1c1',]
dark = ['#2e3440', '#3b4252', '#434c5e', '#4c566a']
snow = ['#d8dee9', '#e5e9f0', '#eceff4']
transparent = 'rgba(0,0,0,0)'

nord_template = go.layout.Template()

# general layout
nord_template.layout.paper_bgcolor = dark[0]
nord_template.layout.plot_bgcolor = dark[1]
nord_template.layout.colorway = nord
nord_template.layout.font = dict(color=snow[0], family="Arial, sans-serif", size=12)
nord_template.layout.title = dict(font=dict(color=snow[1], size=16))
nord_template.layout.legend = dict(font=dict(color=snow[0]))
# nord_template.layout.margin = dict(l=40, r=20, t=60, b=40)

# axis defaults
axis_defaults = dict(
    title=dict(font=dict(color=snow[0])),
    tickfont=dict(color=snow[0]),
    gridcolor='rgba(255,255,255,0.06)',
    zerolinecolor='rgba(255,255,255,0.06)',
    linecolor='rgba(255,255,255,0.12)'
)
nord_template.layout.xaxis = axis_defaults
nord_template.layout.yaxis = axis_defaults

# annotation defaults (facets titles etc.)
nord_template.layout.annotationdefaults = dict(font=dict(color=snow[1]))

# register template
pio.templates['nord_dark'] = nord_template
# optional: make it default
# pio.templates.default = 'nord_dark'

In [130]:
# Read file, split comments vs data
raw_lines = raw_file.read_text(encoding="utf-8").splitlines()

# Replace "6108da57e362f96a3ee32a88" with "something" if line number is <4000
for i, line in enumerate(raw_lines):
    if i > 5000:
        raw_lines[i] = line.replace("6108da57e362f96a3ee32a88", "6108da57e362f96a3ee32a88_2")

for i, line in enumerate(raw_lines):
    if i > 5000:
        raw_lines[i] = line.replace("5dade76a4860f70017f70ec5", "5dade76a4860f70017f70ec5_2")

header_comments = []
rows = []
for line in raw_lines:
    if line.startswith("#"):
        header_comments.append(line)
    elif line.strip():
        rows.append(line)

print(f"Comment lines: {len(header_comments)} | Data rows: {len(rows)}")
rows[:3]

Comment lines: 1632 | Data rows: 25529


['1758558839,5ea96059c7f8c9bb71d4ecdb67ea496e,PennController,0,0,welcome,NULL,PennController,0,_Trial_,Start,1758556961557,prolific_id,NULL',
 '1758558839,5ea96059c7f8c9bb71d4ecdb67ea496e,PennController,0,0,welcome,NULL,PennController,0,_Header_,Start,1758556961557,prolific_id,NULL',
 '1758558839,5ea96059c7f8c9bb71d4ecdb67ea496e,PennController,0,0,welcome,NULL,PennController,0,_Header_,End,1758556961557,prolific_id,NULL']

## Events

In [131]:
# Define the base schema described by comments just before each block
base_cols = [
    "ResultsTime", "MD5", "Controller", "Order", "Inner", "Label",
    "LatinSquare", "PennElementType", "PennElementName", "Parameter",
    "Value", "EventTime", "prolific_id",
]

def extract_trial_fields_from_comments(comments):
    # We expect something like a line that mentions these names.
    allowed = ["participant_id", "group", "no", "item", "exp", "condition", "cb", "left", "right", "target"]
    # accept common variants (participant-id, participant id, participantid, prolific_id etc.)
    pattern = re.compile(r"(prolific[_\-\s]?id|participant[_\-\s]?id|participantid|group|no|item|exp|condition|cb|left|right|target)", re.I)

    best_line = None
    max_hits = 0
    # prefer the comment line that contains the most known tokens
    for c in comments:
        hits = len(pattern.findall(c))
        if hits > max_hits:
            best_line = c
            max_hits = hits

    # if we found a useful header line, try to parse explicit comma-separated tokens after a colon
    if best_line and max_hits >= 6:
        text = best_line.split(":", 1)[-1]
        tokens = [t.strip() for t in re.split(r"[,\t]+", text) if t.strip()]

        def norm(s):
            s2 = re.sub(r"[^A-Za-z0-9_]", "", s).lower()
            if s2 == "participantid":
                s2 = "participant_id"
            if s2 == "prolificid":
                s2 = "prolific_id"
            return s2

        names = []
        for t in tokens:
            n = norm(t)
            if n in allowed and n not in names:
                names.append(n)

        # fallback: if explicit tokens not present, use the order of regex matches in the line
        if not names:
            for m in pattern.finditer(best_line):
                n = norm(m.group(0))
                if n in allowed and n not in names:
                    names.append(n)

        # Preserve canonical order relative to allowed list
        ordered = [name for name in allowed if name in names]
        # accept extraction if we found at least participant_id + several others
        if "participant_id" in ordered and len(ordered) >= 6:
            return ordered

    # Final fallback: canonical full trial ordering (updated to include exp and target)
    return ["participant_id", "group", "no", "item", "exp", "condition", "cb", "left", "right", "target"]

TRIAL_FIELDS = extract_trial_fields_from_comments(header_comments)
print("Using trial fields:", TRIAL_FIELDS)

# Parse each data row into a record using base_cols + TRIAL_FIELDS depending on Label

def parse_row_to_record(line: str):
    parts = [p.strip() for p in line.split(",")]
    rec = {}
    base_vals = parts[:len(base_cols)]
    extra_vals = parts[len(base_cols):]

    for k, v in zip(base_cols, base_vals):
        rec[k] = v

    # If the extra values accidentally include a literal header token as the first element
    # (some CSV dumps include the header name), drop it so alignment works.
    if extra_vals and str(extra_vals[0]).lower() in {f.lower() for f in TRIAL_FIELDS + ["prolific_id"]}:
        # drop single leading header-like token
        extra_vals = extra_vals[1:]

    label = rec.get("Label")
    if label in ("practice", "experiment"):
        n = min(len(extra_vals), len(TRIAL_FIELDS))
        for k, v in zip(TRIAL_FIELDS[:n], extra_vals[:n]):
            rec[k] = v
    elif label in ("participant_data",):
        if extra_vals:
            # participant_data frequently stores id in first extra token (after possible header token)
            rec["participant_id"] = extra_vals[0]

    return rec

records = [parse_row_to_record(l) for l in rows]
raw_df = pd.DataFrame.from_records(records)

# Cast some known numeric columns where possible
for c in ["ResultsTime", "Order", "Inner", "EventTime", "no", "item"]:
    if c in raw_df.columns:
        raw_df[c] = pd.to_numeric(raw_df[c], errors="coerce")

# Human-readable timestamps
raw_df["results_time"] = pd.to_datetime(raw_df["ResultsTime"], unit="s", utc=True)

# IMPORTANT: Keep EventTime both as numeric milliseconds and as a proper timestamp (UTC)
# - EventTime_ms: numeric milliseconds for computations (diffs, means)
# - event_time: Pandas Timestamp (UTC) for readability
raw_df["event_time_ms"] = raw_df["EventTime"]
raw_df["event_time"] = pd.to_datetime(raw_df["event_time_ms"], unit="ms", utc=True)

################################################

# Helper: derive per-trial fields now that we named extras explicitly in raw_df
# We keep a light decoder only to ensure participant_id is present and to normalize types.

EXPECTED_FIELDS = ["participant_id", "group", "no", "item", "exp", "condition", "cb", "left", "right", "target"]

# Make sure that item and no are integers
for col in ["no", "item"]:
    if col in raw_df.columns:
        # Use pandas nullable Int64 type to keep NaNs and force integer dtype
        raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce").astype('Int64')
        
# Ensure all expected columns exist even if missing in some rows
for col in EXPECTED_FIELDS:
    if col not in raw_df.columns:
        raw_df[col] = None

# Build df and forward fill participant_id/group as before
df = raw_df.copy()

# Forward-fill participant_id only within blocks of identical participant_id (no cross-over)
# (keep as-is: do not fill across participants)
if "participant_id" in df.columns:
    # nothing to do beyond keeping column present; participant ids should appear in rows where provided

    # If you need to forward-fill within contiguous blocks uncomment:
    # df['participant_id'] = df['participant_id'].ffill()

    pass

# Backward-fill group only within blocks of identical participant_id (no cross-over)
if "group" in df.columns and "participant_id" in df.columns:
    # normalize empty/NULL then backfill per participant using transform to avoid groupby.apply deprecation
    df["group"] = df["group"].replace({"": None, "NULL": None})
    df["group"] = df.groupby("participant_id")["group"].transform(lambda s: s.bfill())

# Derive block-type flags
# df["is_practice"] = df["Label"].eq("practice")
# df["is_experiment"] = df["Label"].eq("experiment")

# For convenience: also include a local-time copy if desired (commented)
# df["results_timestamp_local"] = df["results_timestamp"].dt.tz_convert("Europe/Budapest")

# Drop Controller column
df.drop(columns=["ResultsTime", "Controller", "Inner", "LatinSquare", "EventTime", "prolific_id"], inplace=True)


# Drop rows where the Parameter column starts and ends with '_'
df = df[~df["Parameter"].str.match(r"^_.*_$", na=False)]

# Remove rows where PennElementType is "Canvas"
df = df[df["PennElementType"] != "Canvas"]

# Add simple elapsed time between events: current row EventTime_ms minus previous row EventTime_ms
if 'elapsed_ms' in df.columns:
    df.drop(columns=['elapsed_ms'], inplace=True)
df['elapsed_ms'] = df['event_time_ms'].diff()
df.drop(columns=['event_time_ms'], inplace=True)

# Keep only rows where label is practice or experiment
df = df[df["Label"].isin(["practice", "experiment"])]

# In Values column rename right_canvas_practice to right_canvas
df["Value"] = df["Value"].replace({"right_canvas_practice": "right_canvas"})
df["Value"] = df["Value"].replace({"left_canvas_practice": "left_canvas"})

# Rename Label to label and Order to order
df.rename(columns={"Label": "label", "Order": "trial"}, inplace=True)

# Fix mistakes in stimuli data ##########################
# Change condition of sentences no 74 to other-directed-x
df.loc[df['no'] == 74, 'condition'] = 'other-directed-x'

# Save df as events.csv
df.to_csv("events.csv", index=False)
df

Using trial fields: ['participant_id', 'group', 'no', 'item', 'exp', 'condition', 'cb', 'left', 'right', 'target']


Unnamed: 0,MD5,trial,label,PennElementType,PennElementName,Parameter,Value,participant_id,group,no,item,exp,condition,cb,left,right,target,results_time,event_time,elapsed_ms
28,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,EyeTracker,tracker,calibration,68,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:00.501000+00:00,40689.0
29,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,EyeTracker,tracker,Filename,httpsfarmpcibexnetpzCPVqO/e48687db-1afc-905a-9...,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:26.157000+00:00,25656.0
31,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,Key,r0,PressedKey,,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:08.205000+00:00,-17952.0
32,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,Key,r1,PressedKey,,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:10.936000+00:00,2731.0
33,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,Key,r2,PressedKey,,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:12.758000+00:00,1822.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25523,e8448cd14ced0c59e2e1bc1bf736f01c,50,experiment,Key,r4,PressedKey,,61242dd5be1a06b174975a1f,b,101,30,2,self-directed-x,n,diák,tanár,tanár,2025-10-18 13:04:05+00:00,2025-10-18 13:03:56.332000+00:00,496.0
25524,e8448cd14ced0c59e2e1bc1bf736f01c,50,experiment,Key,r5,PressedKey,,61242dd5be1a06b174975a1f,b,101,30,2,self-directed-x,n,diák,tanár,tanár,2025-10-18 13:04:05+00:00,2025-10-18 13:03:56.850000+00:00,518.0
25525,e8448cd14ced0c59e2e1bc1bf736f01c,50,experiment,Key,r7,PressedKey,,61242dd5be1a06b174975a1f,b,101,30,2,self-directed-x,n,diák,tanár,tanár,2025-10-18 13:04:05+00:00,2025-10-18 13:03:57.366000+00:00,516.0
25526,e8448cd14ced0c59e2e1bc1bf736f01c,50,experiment,Key,question,PressedKey,,61242dd5be1a06b174975a1f,b,101,30,2,self-directed-x,n,diák,tanár,tanár,2025-10-18 13:04:05+00:00,2025-10-18 13:03:59.605000+00:00,2239.0


## Participants

In [132]:
def extract_participant_info(events_df, fallback_df=None):
    # Helper: return first non-empty normalized value for a column
    def pick_first(df_like, col):
        if df_like is None or col not in df_like.columns:
            return None
        s = df_like[col].replace({"": None, "NULL": None}).dropna().astype(str)
        return s.iloc[0] if len(s) > 0 else None

    # Extract participant_id, group, results_time (prefer events_df, fallback to fallback_df)
    pid = pick_first(events_df, "participant_id") or pick_first(fallback_df, "participant_id")
    group = pick_first(events_df, "group") or pick_first(fallback_df, "group")
    results_time = pick_first(events_df, "results_time") or pick_first(fallback_df, "results_time")

    # Exact requirement: Value where PennElementType == 'EyeTracker' and Parameter == 'Filename'
    def find_et_filename(df_like):
        if df_like is None or not {"PennElementType", "Parameter", "Value"}.issubset(df_like.columns):
            return None
        pet = df_like["PennElementType"].astype(str).str.lower()
        par = df_like["Parameter"].astype(str).str.lower()
        mask = pet.eq("eyetracker") & par.eq("filename")
        vals = df_like.loc[mask, "Value"].replace({"": None}).dropna().astype(str)
        return vals.iloc[0] if len(vals) > 0 else None

    et = find_et_filename(events_df) or find_et_filename(fallback_df)

    return pd.DataFrame([
        {
            "participant_id": pid,
            "group": group,
            "eyetracker_filename": et,
            "results_time": results_time,
        }
    ])

# Build participants_df for all participants (uses raw_df as fallback to find filename)
parts = []
df_nonnull = df[df['participant_id'].notna()]
for pid, g in df_nonnull.groupby('participant_id', sort=False):
    parts.append(extract_participant_info(g, fallback_df=raw_df))
participants_df = pd.concat(parts, ignore_index=True)
participants_df

Unnamed: 0,participant_id,group,eyetracker_filename,results_time
0,60d26e7cd9f0761e4d12b9f8,a,httpsfarmpcibexnetpzCPVqO/e48687db-1afc-905a-9...,2025-09-22 16:33:59+00:00
1,6108da57e362f96a3ee32a88,b,httpsfarmpcibexnetpzCPVqO/9f995578-7c21-5dd7-3...,2025-09-22 17:12:07+00:00
2,5dade76a4860f70017f70ec5,c,httpsfarmpcibexnetpzCPVqO/1a69c7d4-6bc1-013f-8...,2025-09-22 18:23:58+00:00
3,6151b07ac0d164fdd7e53100,a,httpsfarmpcibexnetpzCPVqO/385ec604-d53c-5512-e...,2025-09-24 17:17:21+00:00
4,5f4c042383588080d02e61a3,b,httpsfarmpcibexnetpzCPVqO/7477329b-3787-b3ba-8...,2025-09-24 19:43:41+00:00
5,5f338ba6ea047119dbd6e49e,c,httpsfarmpcibexnetpzCPVqO/329133cc-4271-bc2c-5...,2025-09-25 06:25:58+00:00
6,5e3b29dc87243b34bde5abfa,a,httpsfarmpcibexnetpzCPVqO/c82d96bc-20aa-db20-9...,2025-09-25 08:49:56+00:00
7,6154d933a58bf7bcd9e81fed,b,httpsfarmpcibexnetpzCPVqO/7aa9138e-dc0b-fb8e-b...,2025-09-25 09:33:24+00:00
8,5d4fe6a2ffbcf800019d5e54,c,httpsfarmpcibexnetpzCPVqO/b0598184-0810-4d35-b...,2025-09-25 17:42:12+00:00
9,60cb4cd6477b2ff7c1adaea4,a,httpsfarmpcibexnetpzCPVqO/445006f5-4422-6c8b-6...,2025-09-25 21:45:59+00:00


In [133]:
# Remove certain participants
# No ET data: 641379405684937e6fad9f1b
# Too long times: 642b35c70771761602e9c3ae
# Duplicate test: 5dade76a4860f70017f70ec5_2
list_of_participants_to_remove = ['642b35c70771761602e9c3ae', '641379405684937e6fad9f1b', '5dade76a4860f70017f70ec5_2', 'parti_test_01'] ####################################################################
participants_df = participants_df[~participants_df['participant_id'].isin(list_of_participants_to_remove)]
df = df[~df['participant_id'].isin(list_of_participants_to_remove)]

participants_df

Unnamed: 0,participant_id,group,eyetracker_filename,results_time
0,60d26e7cd9f0761e4d12b9f8,a,httpsfarmpcibexnetpzCPVqO/e48687db-1afc-905a-9...,2025-09-22 16:33:59+00:00
1,6108da57e362f96a3ee32a88,b,httpsfarmpcibexnetpzCPVqO/9f995578-7c21-5dd7-3...,2025-09-22 17:12:07+00:00
2,5dade76a4860f70017f70ec5,c,httpsfarmpcibexnetpzCPVqO/1a69c7d4-6bc1-013f-8...,2025-09-22 18:23:58+00:00
3,6151b07ac0d164fdd7e53100,a,httpsfarmpcibexnetpzCPVqO/385ec604-d53c-5512-e...,2025-09-24 17:17:21+00:00
4,5f4c042383588080d02e61a3,b,httpsfarmpcibexnetpzCPVqO/7477329b-3787-b3ba-8...,2025-09-24 19:43:41+00:00
5,5f338ba6ea047119dbd6e49e,c,httpsfarmpcibexnetpzCPVqO/329133cc-4271-bc2c-5...,2025-09-25 06:25:58+00:00
6,5e3b29dc87243b34bde5abfa,a,httpsfarmpcibexnetpzCPVqO/c82d96bc-20aa-db20-9...,2025-09-25 08:49:56+00:00
7,6154d933a58bf7bcd9e81fed,b,httpsfarmpcibexnetpzCPVqO/7aa9138e-dc0b-fb8e-b...,2025-09-25 09:33:24+00:00
8,5d4fe6a2ffbcf800019d5e54,c,httpsfarmpcibexnetpzCPVqO/b0598184-0810-4d35-b...,2025-09-25 17:42:12+00:00
9,60cb4cd6477b2ff7c1adaea4,a,httpsfarmpcibexnetpzCPVqO/445006f5-4422-6c8b-6...,2025-09-25 21:45:59+00:00


In [134]:
# Value counts for group
participants_df['group'].value_counts(dropna=False)

group
a    10
b    10
c    10
Name: count, dtype: int64

## Longform events

In [135]:
# Filter for experiment/practice trials only
df_trials = df[df['label'].isin(['experiment', 'practice'])].copy()

# Sort for deterministic grouping
df_trials = df_trials.sort_values(['participant_id', 'no', 'item'])

# Region RTs: r1..r7
region_names = [f"r{i}" for i in range(1, 8)]
is_region = df_trials['PennElementName'].str.lower().isin(region_names) & df_trials['Parameter'].str.lower().eq('pressedkey')
regions = df_trials[is_region].copy()
regions['region_idx'] = regions['PennElementName'].str.extract(r'r(\d)')[0].astype(int)

# Build a full trial index to ensure all trials are present
# NOTE: added 'exp' and 'target' to match new appended columns ordering
trial_index_cols = ['results_time', 'participant_id', 'group', 'trial', 'label', 'no', 'item', 'exp', 'condition', 'cb', 'left', 'right', 'target']
# Ensure trial_index_cols exist in df_trials before selecting (EXPECTED_FIELDS earlier guarantees presence)
trial_index = df_trials.drop_duplicates(subset=[c for c in trial_index_cols if c in df_trials.columns])[[c for c in trial_index_cols if c in df_trials.columns]].sort_values([c for c in trial_index_cols if c in df_trials.columns])

# Compute region RTs per trial (item) using average of elapsed_ms
region_mean = regions.pivot_table(
    index=[c for c in trial_index_cols if c in regions.columns],
    columns='region_idx',
    values='elapsed_ms',
    aggfunc='mean',
    fill_value=np.nan
)

# Reindex to include any trials that had no region rows
region_mean = region_mean.reindex(trial_index.set_index([c for c in trial_index_cols if c in trial_index.columns]).index, fill_value=np.nan)

# Rename numeric region columns to r1..r7
region_mean = region_mean.rename(columns={i: f"r{int(i)}" for i in region_mean.columns})

# Question RT: average elapsed_ms for question keypress
is_question = df_trials['PennElementName'].str.lower().eq('question') & df_trials['Parameter'].str.lower().eq('pressedkey')
questions = df_trials[is_question].copy()
questions = questions.sort_values(['participant_id', 'no', 'item'])
questions['question_rt'] = questions['elapsed_ms']
question_rt = questions.groupby([c for c in trial_index_cols if c in questions.columns])['question_rt'].mean()

# Choice RT and value: average elapsed_ms for choice selection
is_choice = df_trials['PennElementType'].str.lower().eq('selector') & df_trials['Parameter'].str.lower().eq('selection')
choices = df_trials[is_choice].copy()
choices = choices.sort_values(['MD5', 'participant_id', 'no', 'item'])
choices['choice_rt'] = choices['elapsed_ms']
choices_value = choices.groupby([c for c in trial_index_cols if c in choices.columns])['Value'].first()
choice_rt = choices.groupby([c for c in trial_index_cols if c in choices.columns])['choice_rt'].mean()

# Assemble longform DataFrame
longform = region_mean.copy()

longform['question_rt'] = question_rt
longform['choice_rt'] = choice_rt
longform['choice'] = choices_value

# Rename choice values from left_canvas/right_canvas to left/right
longform['choice'] = longform['choice'].replace({"left_canvas": "left", "right_canvas": "right"})

################################################################
# Subtract fixed offsets (keep units consistent with elapsed_ms)
longform['question_rt'] = longform['question_rt'] - 1000
longform['choice_rt'] = longform['choice_rt'] - 3000

# Sort by participant and trial for deterministic ordering
# If 'trial' missing in index columns fallback to sorting by ['participant_id', 'no', 'item']
sort_cols = ['participant_id', 'trial'] if 'trial' in longform.index.names else ['participant_id', 'no', 'item']
longform = longform.sort_values(by=list(sort_cols), ascending=[True]*len(sort_cols))

longform = longform.reset_index()
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r1,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice
0,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,6,practice,990,1,,practice,n,...,5692.0,2251.0,1551.0,1366.0,1200.0,1379.0,1429.0,,2343.0,left
1,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,7,practice,991,2,,practice,y,...,1349.0,1314.0,1077.0,901.0,901.0,922.0,900.0,,1227.0,right
2,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,8,practice,992,3,,practice,n,...,1017.0,972.0,1032.0,870.0,815.0,1278.0,1240.0,2116.0,1256.0,left
3,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,10,experiment,15,5,1,contrastive,n,...,1239.0,947.0,1051.0,762.0,845.0,943.0,1159.0,,2306.0,left
4,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,11,experiment,94,28,2,other-directed-x,y,...,836.0,902.0,843.0,895.0,1114.0,,1316.0,3357.0,2077.0,right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,45,experiment,63,21,2,self-directed,n,...,499.0,518.0,608.0,648.0,813.0,,1254.0,1335.0,2216.0,right
1306,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,46,experiment,96,29,2,other-directed,y,...,710.0,908.0,806.0,812.0,1006.0,,1622.0,797.0,1985.0,left
1307,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,467.0,521.0,573.0,521.0,512.0,810.0,1227.0,,16275.0,left
1308,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,48,experiment,75,24,2,self-directed,n,...,808.0,780.0,606.0,750.0,887.0,,1484.0,791.0,1782.0,right


## Attention checks

In [136]:
# # Filter longform for attention-check condition
# attention_check_rows = longform[longform['condition'] == 'attention-check']
# # Show rows where choice does not match target
# failed_attention_checks = attention_check_rows[attention_check_rows['choice'] != attention_check_rows['target']]
# failed_attention_checks

# Refined attention check: match choice to the column (left or right) that matches target
def check_attention(row):
    if row['choice'] == 'left':
        return row['target'] == row['left']
    elif row['choice'] == 'right':
        return row['target'] == row['right']
    return False

attention_check_rows = longform[longform['condition'] == 'attention-check'].copy()
attention_check_rows['attention_pass'] = attention_check_rows.apply(check_attention, axis=1)

# Show rows where attention check failed
failed_attention_checks = attention_check_rows[~attention_check_rows['attention_pass']]
failed_attention_checks

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,attention_pass
604,2025-09-25 21:45:59+00:00,60cb4cd6477b2ff7c1adaea4,a,44,experiment,111,33,,attention-check,n,...,244.0,206.0,201.0,346.0,,371.0,,1958.0,right,False


## Calculate expectations/mismatch

In [137]:
# Find rows where target is available and does not match the chosen side's value
def choice_matches_target(row):
    if pd.isna(row['choice']) or pd.isna(row['target']):
        return True  # skip if missing
    if row['choice'] == 'left':
        return row['target'] == row['left']
    elif row['choice'] == 'right':
        return row['target'] == row['right']
    return False

mismatch_rows = longform[
    longform['target'].notna() &
    longform['choice'].notna() &
    (~longform.apply(choice_matches_target, axis=1))
].copy()

# Filter mismatch_rows to exclude any rows where 'target' is missing or empty string
mismatch_rows = mismatch_rows[mismatch_rows['target'].notna() & (mismatch_rows['target'] != '')].copy()
mismatch_rows

# Add a new column 'mismatch' to the longform to indicate these items where the choice does not match the target
longform['mismatch'] = False
longform.loc[mismatch_rows.index, 'mismatch'] = True
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,mismatch
0,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,6,practice,990,1,,practice,n,...,2251.0,1551.0,1366.0,1200.0,1379.0,1429.0,,2343.0,left,False
1,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,7,practice,991,2,,practice,y,...,1314.0,1077.0,901.0,901.0,922.0,900.0,,1227.0,right,False
2,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,8,practice,992,3,,practice,n,...,972.0,1032.0,870.0,815.0,1278.0,1240.0,2116.0,1256.0,left,False
3,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,10,experiment,15,5,1,contrastive,n,...,947.0,1051.0,762.0,845.0,943.0,1159.0,,2306.0,left,True
4,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,11,experiment,94,28,2,other-directed-x,y,...,902.0,843.0,895.0,1114.0,,1316.0,3357.0,2077.0,right,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,45,experiment,63,21,2,self-directed,n,...,518.0,608.0,648.0,813.0,,1254.0,1335.0,2216.0,right,False
1306,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,46,experiment,96,29,2,other-directed,y,...,908.0,806.0,812.0,1006.0,,1622.0,797.0,1985.0,left,True
1307,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,521.0,573.0,521.0,512.0,810.0,1227.0,,16275.0,left,False
1308,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,48,experiment,75,24,2,self-directed,n,...,780.0,606.0,750.0,887.0,,1484.0,791.0,1782.0,right,False


In [138]:
# Print how many items I have for each condition
print(longform['mismatch'].value_counts())

mismatch
False    1110
True      200
Name: count, dtype: int64


## Choosing experiment

In [139]:
# Remove all rows where exp is 2 from trial_index
longform = longform[longform['exp'] != '2'].copy()
longform

# Reorder by custom condition order: practice, attention-check, exclusive, focus, contrastive
custom_order = ['practice', 'attention-check', 'exclusive', 'focus', 'contrastive']
existing = longform['condition'].dropna().astype(str).unique().tolist()
remaining = [c for c in existing if c not in custom_order]
full_order = custom_order + remaining

# Make ordered categorical and then sort by it (plus any secondary keys).
longform['condition'] = pd.Categorical(longform['condition'].astype(str), categories=full_order, ordered=True)
longform = longform.sort_values(by=['condition', 'participant_id', 'no'], na_position='last').reset_index(drop=True)
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,mismatch
0,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,6,practice,990,1,,practice,n,...,2251.0,1551.0,1366.0,1200.0,1379.0,1429.0,,2343.0,left,False
1,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,7,practice,991,2,,practice,y,...,1314.0,1077.0,901.0,901.0,922.0,900.0,,1227.0,right,False
2,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,8,practice,992,3,,practice,n,...,972.0,1032.0,870.0,815.0,1278.0,1240.0,2116.0,1256.0,left,False
3,2025-10-06 21:02:00+00:00,5b93d1913dca6000012c5fdc,c,6,practice,990,1,,practice,n,...,2455.0,1778.0,1791.0,1468.0,1839.0,1928.0,,2828.0,left,False
4,2025-10-06 21:02:00+00:00,5b93d1913dca6000012c5fdc,c,7,practice,991,2,,practice,y,...,1116.0,788.0,516.0,488.0,808.0,540.0,,2081.0,right,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,521.0,573.0,521.0,512.0,810.0,1227.0,,16275.0,left,False
746,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,49,experiment,21,7,1,contrastive,n,...,495.0,456.0,539.0,636.0,1055.0,884.0,,5009.0,right,False
747,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,43,experiment,30,10,1,contrastive,y,...,791.0,721.0,543.0,517.0,717.0,870.0,,6092.0,left,False
748,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,20,experiment,39,13,1,contrastive,n,...,693.0,803.0,1143.0,620.0,1227.0,909.0,,3445.0,left,True


## Expectations

In [140]:
import plotly.express as px

# Calculate mismatch ratio per condition
mismatch_counts = longform.groupby('condition', observed=True)['mismatch'].agg(['sum', 'count'])
mismatch_counts['mismatch_ratio'] = mismatch_counts['sum'] / mismatch_counts['count']

# Prepare data for donut chart
donut_data = []
for cond, row in mismatch_counts.iterrows():
    donut_data.append({'condition': cond, 'type': 'Mismatch', 'count': row['sum']})
    donut_data.append({'condition': cond, 'type': 'Match', 'count': row['count'] - row['sum']})

donut_df = pd.DataFrame(donut_data)

fig = px.pie(
    donut_df,
    names='type',
    values='count',
    color='type',
    facet_col='condition',
    hole=0.5,
    title='Expectation Mismatch Ratio per Condition',
    color_discrete_map={'Mismatch': '#bf616a', 'Match': '#4c566a'},
    width=1200, height=400,
)

# set template (or set pio.templates.default earlier)
fig.update_layout(template='nord_dark')
# fig.update_traces(textinfo='percent')

fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.show()

# Save as png and html
fig.write_image(plots / "mismatch_donut_per_condition.png", scale=3)
fig.write_html(plots / "mismatch_donut_per_condition.html", include_plotlyjs='cdn')

## Reading Times

In [141]:
# Replot: every item as an individual line, faceted by condition, color by participant
import plotly.express as px

# Rebuild melted if needed (one row per participant × item × region)
region_cols = [c for c in longform.columns if re.match(r"r\d+$", c)]
plot_df = longform.melt(
    id_vars=['participant_id', 'no', 'condition'],
    value_vars=region_cols,
    var_name='region',
    value_name='reading_time'
).dropna(subset=['reading_time'])

# Ensure region order
full_region_order = [f"r{i}" for i in range(1, 8)]
plot_df['region'] = pd.Categorical(plot_df['region'], categories=full_region_order, ordered=True)

# Create a unique line id per item (so each item is its own trace) and keep color = participant
plot_df['pid_item'] = plot_df['participant_id'].astype(str) + ' | no ' + plot_df['no'].astype(str)

fig = px.line(
    plot_df,
    x='region',
    y='reading_time',
    color='participant_id',        # color by participant
    line_group='pid_item',         # each item -> separate connected line
    facet_col='condition',
    # facet_col_wrap=2,              # <- two columns of facets
    category_orders={'region': full_region_order},
    markers=True,
    template='nord_dark',       
    title='Reading times per item (each item = one line), faceted by condition, colored by participant',
    labels={'reading_time': 'Reading time (ms)', 'region': 'Region'}
)

fig.update_traces(mode='lines+markers', marker={'size':4}, opacity=0.7, hovertemplate=None)
fig.update_layout(legend_title_text='Participant', width=1200, height=400)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))  # simplify facet labels
fig.show()

# Save image as png and html
fig.write_image(plots / "reading_times_raw.png", scale=3)
fig.write_html(plots / "reading_times_raw.html", include_plotlyjs='cdn')

In [142]:
# Remove practice and attention checks
longform = longform[longform['exp'].astype(str).eq('1')].copy()
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,mismatch
210,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,36,experiment,1,1,1,exclusive,n,...,532.0,556.0,696.0,584.0,756.0,906.0,,745.0,left,False
211,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,43,experiment,10,4,1,exclusive,y,...,697.0,574.0,559.0,585.0,620.0,641.0,,1028.0,right,False
212,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,49,experiment,19,7,1,exclusive,n,...,604.0,524.0,521.0,575.0,678.0,510.0,,831.0,left,False
213,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,45,experiment,28,10,1,exclusive,y,...,578.0,593.0,603.0,508.0,715.0,677.0,,807.0,right,False
214,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,38,experiment,37,13,1,exclusive,n,...,587.0,542.0,606.0,658.0,648.0,864.0,,902.0,left,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,521.0,573.0,521.0,512.0,810.0,1227.0,,16275.0,left,False
746,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,49,experiment,21,7,1,contrastive,n,...,495.0,456.0,539.0,636.0,1055.0,884.0,,5009.0,right,False
747,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,43,experiment,30,10,1,contrastive,y,...,791.0,721.0,543.0,517.0,717.0,870.0,,6092.0,left,False
748,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,20,experiment,39,13,1,contrastive,n,...,693.0,803.0,1143.0,620.0,1227.0,909.0,,3445.0,left,True


In [143]:
import plotly.express as px

# Melt longform to one row per participant × item × region 
region_cols = [c for c in longform.columns if re.match(r"r\d+$", c)]
melted = (
    longform
    .melt(id_vars=['participant_id', 'item', 'condition'],
          value_vars=region_cols,
          var_name='region',
          value_name='reading_time')
    .dropna(subset=['reading_time'])
)

# Per-participant (across items) mean and std for each region × condition
pp_region_stats = (
    melted
    .groupby(['participant_id', 'condition', 'region'], as_index=False, observed=True)
    .reading_time.agg(reading_time_mean='mean', reading_time_std='std')
)

# Per-participant question/choice mean+std across items (if present)
extras_pp = []
if 'question_rt' in longform.columns:
    q_pp = (
        longform
        .dropna(subset=['question_rt'])
        .groupby(['participant_id', 'condition'], as_index=False, observed=True)
        .question_rt.agg(reading_time_mean='mean', reading_time_std='std')
    )
    q_pp['region'] = 'question'
    extras_pp.append(q_pp[['participant_id','condition','region','reading_time_mean','reading_time_std']])

if 'choice_rt' in longform.columns:
    c_pp = (
        longform
        .dropna(subset=['choice_rt'])
        .groupby(['participant_id', 'condition'], as_index=False, observed=True)
        .choice_rt.agg(reading_time_mean='mean', reading_time_std='std')
    )
    c_pp['region'] = 'choice'
    extras_pp.append(c_pp[['participant_id','condition','region','reading_time_mean','reading_time_std']])

if extras_pp:
    pp_region_stats = pd.concat([pp_region_stats] + extras_pp, ignore_index=True, sort=False)

# Only show present conditions
present_conditions = pp_region_stats['condition'].dropna().unique().tolist()

# Now aggregate across participants: use participant-level means to compute group mean and across-participant std
present_regions = pp_region_stats['region'].unique().tolist()
full_region_order = [f"r{i}" for i in range(1, 8)]
present_regions = [r for r in full_region_order if r in present_regions] + [r for r in ['question','choice'] if r in pp_region_stats['region'].unique()]

agg_plot = (
    pp_region_stats
    .groupby(['region', 'condition'], as_index=False, observed=True)
    .reading_time_mean.agg(mean_reading_time='mean', std_reading_time='std')
)

# --- INSERT FALLBACK STD COMPUTATION HERE ---
std_items = melted.groupby(['condition', 'region'], observed=True)['reading_time'].std().reset_index().rename(columns={'reading_time':'std_items'})

if 'question_rt' in longform.columns:
    std_q = longform.dropna(subset=['question_rt']).groupby('condition', observed=True)['question_rt'].std().reset_index().rename(columns={'question_rt':'std_items'})
    std_q['region'] = 'question'
    std_items = pd.concat([std_items, std_q[['condition','region','std_items']]], ignore_index=True)

if 'choice_rt' in longform.columns:
    std_c = longform.dropna(subset=['choice_rt']).groupby('condition', observed=True)['choice_rt'].std().reset_index().rename(columns={'choice_rt':'std_items'})
    std_c['region'] = 'choice'
    std_items = pd.concat([std_items, std_c[['condition','region','std_items']]], ignore_index=True)

agg_plot = agg_plot.merge(std_items, on=['condition', 'region'], how='left')
agg_plot['std_reading_time'] = agg_plot['std_reading_time'].fillna(agg_plot['std_items'])
agg_plot.drop(columns=['std_items'], inplace=True)
# --- END INSERT ---

# Keep only present regions and set categorical ordering
agg_plot = agg_plot[agg_plot['region'].isin(present_regions) & agg_plot['condition'].isin(present_conditions)].copy()
agg_plot['region'] = pd.Categorical(agg_plot['region'], categories=present_regions, ordered=True)
agg_plot['condition'] = pd.Categorical(agg_plot['condition'], categories=present_conditions, ordered=True)

# Plot 1: Reading times for regions (r1..r7) — error bars are across-participant std of participant means
region_only = agg_plot[agg_plot['region'].isin(full_region_order)]
fig1 = px.line(
    region_only,
    x='region',
    y='mean_reading_time',
    error_y='std_reading_time',
    color='condition',
    facet_col='condition',
    category_orders={'region': full_region_order, 'condition': present_conditions},
    title='Mean Reading Times per Region<br>(Error Bars = Across-Participant Std Dev, Faceted by Condition)',
    labels={'mean_reading_time': 'Mean Reading Time (ms)', 'region': 'Region'}
)
fig1.update_traces(connectgaps=True, mode='lines+markers', marker={'size': 6}, line={'width': 2}, opacity=0.85)
fig1.update_yaxes(title_text='Mean reading time (ms)')
fig1.update_xaxes(title_text='Region')
fig1.update_layout(legend_title_text='Condition')
fig1.show()

# # Plot 2: Question RTs
# question_only = agg_plot[agg_plot['region'] == 'question']
# fig2 = px.bar(
#     question_only,
#     x='condition',
#     y='mean_reading_time',
#     error_y='std_reading_time',
#     color='condition',
#     category_orders={'condition': present_conditions},
#     title='Mean Question RT by Condition (Error Bars = Std Dev)',
#     labels={'mean_reading_time': 'Mean Question RT (ms)', 'condition': 'Condition'}
# )
# fig2.update_layout(legend_title_text='Condition')
# fig2.show()

# Plot 3: Decision RTs
choice_only = agg_plot[agg_plot['region'] == 'choice']
fig3 = px.bar(
    choice_only,
    x='condition',
    y='mean_reading_time',
    error_y='std_reading_time',
    color='condition',
    category_orders={'condition': present_conditions},
    title='Mean Decision RT by Condition (Error Bars = Std Dev)',
    labels={'mean_reading_time': 'Mean Decision RT (ms)', 'condition': 'Condition'}
)
fig3.update_layout(legend_title_text='Condition')
fig3.show()

In [144]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# Determine three_conditions (prefer canonical)
preferred = ['exclusive', 'focus', 'contrastive']
present_conditions = agg_plot['condition'].dropna().astype(str).unique().tolist()
three_conditions = [c for c in preferred if c in present_conditions]
if len(three_conditions) < 3:
    three_conditions = present_conditions[:3]

# Regions (exclude 'choice' from region facets)
full_region_order = [f"r{i}" for i in range(1, 8)]
region_order = full_region_order[:]  # r1..r7 only

# Data frames
region_df = agg_plot[agg_plot['region'].isin(region_order) & agg_plot['condition'].isin(three_conditions)].copy()
region_df['region'] = pd.Categorical(region_df['region'], categories=region_order, ordered=True)

choice_df = agg_plot[agg_plot['region'].astype(str) == 'choice']
choice_df = choice_df[choice_df['condition'].isin(three_conditions)].copy()

# Build color map for conditions so region traces and choice bars share colors
palette = nord
color_map = {cond: palette[i % len(palette)] for i, cond in enumerate(three_conditions)}

# --- Calculate global min/max for y axis from all region_df and choice_df mean_reading_time + std_reading_time values ---
all_y = pd.concat([
    region_df['mean_reading_time'] + region_df['std_reading_time'].fillna(0),
    choice_df['mean_reading_time'] + choice_df['std_reading_time'].fillna(0)
]).dropna()
ymin = float(pd.concat([
    region_df['mean_reading_time'] - region_df['std_reading_time'].fillna(0),
    choice_df['mean_reading_time'] - choice_df['std_reading_time'].fillna(0)
]).min()) if not all_y.isnull().all() else 0
ymax = float(all_y.max()) if not all_y.isnull().all() else 1
ypad = (ymax - ymin) * 0.15 if (ymax - ymin) > 0 else 10  # Increase padding for error bars
ymin = max(0, ymin - ypad)
ymax = ymax + ypad

# Build subplots: one column per condition + one final column for 'choice'
n_cols = len(three_conditions) + 1
subplot_titles = [f"{c}" for c in three_conditions] + ['choice']
fig = make_subplots(rows=1, cols=n_cols, shared_yaxes=True, subplot_titles=subplot_titles)

# Add region lines to each condition column (first len(three_conditions) columns)
for ci, cond in enumerate(three_conditions, start=1):
    d = region_df[region_df['condition'] == cond].sort_values('region')
    if d.empty:
        continue
    cond_color = color_map.get(cond)
    fig.add_trace(
        go.Scatter(
            x=d['region'].astype(str),
            y=d['mean_reading_time'],
            mode='lines+markers',
            name=cond,
            error_y=dict(type='data', array=d['std_reading_time'].values if 'std_reading_time' in d else None),
            marker=dict(size=6, color=cond_color),
            line=dict(width=2, color=cond_color)
        ),
        row=1, col=ci
    )
    # tidy axes
    fig.update_xaxes(categoryorder='array', categoryarray=region_order, row=1, col=ci)
    fig.update_yaxes(title_text='Mean reading time (ms)', row=1, col=ci, range=[ymin, ymax])

# Add 'choice' bars in the final column (one bar per condition) using same colors
choice_col = n_cols
if not choice_df.empty:
    # ensure order of conditions
    choice_df['condition'] = pd.Categorical(choice_df['condition'].astype(str), categories=three_conditions, ordered=True)
    for cond in three_conditions:
        d = choice_df[choice_df['condition'] == cond]
        if d.empty:
            continue
        cond_color = color_map.get(cond)
        fig.add_trace(
            go.Bar(
                x=[cond],
                y=[d['mean_reading_time'].iloc[0]],
                name=cond,
                marker_color=cond_color,
                error_y=dict(type='data', array=[d['std_reading_time'].iloc[0]] if 'std_reading_time' in d else None, color=snow[2]),
            ),
            row=1, col=choice_col
        )
    fig.update_xaxes(title_text='Condition', row=1, col=choice_col)
    fig.update_yaxes(title_text='Mean reading time (ms)', row=1, col=choice_col, range=[ymin, ymax])

# Layout
fig.update_layout(
    template='nord_dark',
    showlegend=False,
    title='Mean RTs: word regions and choices',
    width=1200, height=400,
)

# Ensure y-axis labels/ticks have enough room to avoid overlap
# - automargin lets Plotly expand left margin when ticklabels are long
# - title_standoff moves the y-axis title away from tick labels
fig.update_yaxes(automargin=True, title_standoff=20)

# Set a reasonable base left margin (automargin will expand further if needed)
fig.update_layout(margin=dict(l=110, r=20, t=80, b=60))

fig.show()

# Save as png and html
fig.write_image(plots / "reading_times.png", scale=3)
fig.write_html(plots / "reading_times.html", include_plotlyjs='cdn')

## Choices

In [145]:
# Calculate, for each condition, the ratio of choices for 'a' type and 'b' type sides
def get_side_type(value):
    # Returns 'a' if value ends with 'a', 'b' if ends with 'b', else None
    if isinstance(value, str):
        if value.endswith('a'):
            return 'a'
        elif value.endswith('b'):
            return 'b'
    return None

# For each row, determine the type ('a' or 'b') of the chosen side
def chosen_side_type(row):
    if row['choice'] == 'left':
        return get_side_type(row['left'])
    elif row['choice'] == 'right':
        return get_side_type(row['right'])
    return None

longform['chosen_type'] = longform.apply(chosen_side_type, axis=1)

# Group by condition and count choices for each type
type_counts = (
    longform
    .groupby(['condition', 'chosen_type'])
    .size()
    .unstack(fill_value=0)
)

# Calculate ratio for each type per condition
type_ratios = type_counts.div(type_counts.sum(axis=1), axis=0)

print("Counts per condition:")
print(type_counts)
print("\nRatios per condition:")
print(type_ratios)

Counts per condition:
chosen_type        a    b
condition                
practice           0    0
attention-check    0    0
exclusive        169   11
focus            170   10
contrastive       75  105

Ratios per condition:
chosen_type             a         b
condition                          
practice              NaN       NaN
attention-check       NaN       NaN
exclusive        0.938889  0.061111
focus            0.944444  0.055556
contrastive      0.416667  0.583333






In [146]:
import plotly.express as px

# Prepare data for donut chart: one donut per condition, split by chosen_type
donut_data = []
for cond in type_counts.index:
    for chosen_type in type_counts.columns:
        count = type_counts.loc[cond, chosen_type]
        donut_data.append({
            'condition': cond,
            'chosen_type': chosen_type if pd.notnull(chosen_type) else 'None',
            'count': count
        })

donut_df = pd.DataFrame(donut_data)

# Filter out 'practice' and 'attention-check' conditions before plotting
exclude_conditions = ['practice', 'attention-check']
plot_conditions = [c for c in type_counts.index if c not in exclude_conditions]
filtered_donut_df = donut_df[donut_df['condition'].isin(plot_conditions)]

fig = px.pie(
    filtered_donut_df,
    names='chosen_type',
    values='count',
    color='chosen_type',
    facet_col='condition',
    hole=0.5,
    title='Choice Ratios of Exclusive (a) or Contrastive (b) types of images per Condition',
    color_discrete_map={'a': '#5e81ac', 'b': '#a3be8c', 'None': '#4c566a'}
)

fig.update_layout(template='nord_dark', width=1200, height=400)
fig.update_traces(textinfo='percent+label')
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.show()

# Save as png and html
fig.write_image(plots / "choice_type_donut_per_condition.png", width=1200, height=400)
fig.write_html(plots / "choice_type_donut_per_condition.html", include_plotlyjs='cdn')

## Eye-tracking

In [147]:
# # Download and save eyetracking data files per participant
# def download_and_save_eyetracking_data():
#     out_dir = 'eyetracking_data'
#     eturl = "https://mondo1.dreamhosters.com/script.php?experiment="
#     for _, row in participants_df.iterrows():
#         et_filename = row['eyetracker_filename']
#         participant_id = row['participant_id']
#         if pd.notnull(et_filename) and pd.notnull(participant_id):
#             et_file = eturl + et_filename
#             try:
#                 r = requests.get(et_file, timeout=15)
#                 r.raise_for_status()
#                 df_et = pd.read_csv(StringIO(r.text))
#                 df_et.to_csv(f"{out_dir}/{participant_id}.csv", index=False)
#             except Exception as e:
#                 print(f"Failed for {participant_id}: {e}")
#     print("Done!")
# # Example usage:
# download_and_save_eyetracking_data()

In [148]:
# # Fix eyetracking csv trial numbering
# et_dir = os.path.join(ROOT, 'eyetracking_data')

# for csv_file in glob(os.path.join(et_dir, '*.csv')):
#     df = pd.read_csv(csv_file)
#     # Apply fix_trial_numbering WITHOUT changing row order
#     trial_col = 'trial' if 'trial' in df.columns else [c for c in df.columns if 'trial' in c][0]
#     # Map 9,10,11 to 6,7,8
#     mapping = {9: 6, 10: 7, 11: 8}
#     df[trial_col] = df[trial_col].replace(mapping)
#     # Find the index of the last occurrence of 8
#     last_mapped_idx = df[df[trial_col] == 8].index.max()
#     if last_mapped_idx is not None and last_mapped_idx + 1 < len(df):
#         next_trial = 10
#         prev_trial_val = None
#         for i in range(last_mapped_idx + 1, len(df)):
#             current_trial_val = df.at[i, trial_col]
#             if prev_trial_val is not None and current_trial_val != prev_trial_val:
#                 next_trial += 1
#             df.at[i, trial_col] = next_trial
#             prev_trial_val = current_trial_val
#     df.to_csv(csv_file, index=False)
# print("All eyetracking files fixed and overwritten.")

## Parse ET

In [149]:
# # Eye tracking data with filtering out neutral gaze

# # Directory containing eyetracking data
# et_dir = os.path.join(ROOT, 'eyetracking_data')

# # Helper to parse a single participant's eyetracking file
# def parse_eyetracking_file(filepath):
#     df = pd.read_csv(filepath)
#     # df = fix_trial_numbering(df).copy()
#     df.columns = [c.lower() for c in df.columns]
#     # Drop rows where both left and right canvas are 0 (neutral gaze) 
#     # df = df[~((df['_left_canvas'] == 0) & (df['_right_canvas'] == 0))].copy()
#     # Identify trial number
#     if 'trial' in df.columns:
#         trial_col = 'trial'
#     else:
#         trial_col = [c for c in df.columns if 'trial' in c][0]
#     return df, trial_col

# # Aggregate results for all participants except those to remove
# et_results = []
# for csv_file in glob(os.path.join(et_dir, '*.csv')):
#     participant_id = os.path.splitext(os.path.basename(csv_file))[0]
#     if participant_id in list_of_participants_to_remove:
#         continue
#     df, trial_col = parse_eyetracking_file(csv_file)
#     for trial, g in df.groupby(trial_col):
#         dwell_left = g.loc[g['_left_canvas'] == 1, 'times'].max() / 1000
#         dwell_right = g.loc[g['_right_canvas'] == 1, 'times'].max() / 1000
#         total_dwell = dwell_left + dwell_right
#         prop_left = dwell_left / total_dwell if total_dwell > 0 else np.nan
#         prop_right = dwell_right / total_dwell if total_dwell > 0 else np.nan
#         dominant = 'left' if dwell_left > dwell_right else 'right'
#         left_changes = (g['_left_canvas'] != g['_left_canvas'].shift()).astype(int)
#         n_fix_left = ((g['_left_canvas'] == 1) & (left_changes == 1)).sum()
#         right_changes = (g['_right_canvas'] != g['_right_canvas'].shift()).astype(int)
#         n_fix_right = ((g['_right_canvas'] == 1) & (right_changes == 1)).sum()
#         canvas_state = np.where(g['_left_canvas'] == 1, 'L', np.where(g['_right_canvas'] == 1, 'R', 'N'))
#         transitions = np.sum(canvas_state[1:] != canvas_state[:-1])

#         # Fixation counts per AOI and revisits (returns to AOI after leaving) ---
#         states = canvas_state
#         # indices where state changes => segment boundaries
#         change_idx = np.where(states[1:] != states[:-1])[0] + 1
#         segment_bounds = np.concatenate(([0], change_idx, [len(states)]))
#         fixations_left = 0
#         fixations_right = 0
#         first_fix_idx = None
#         for s_start, s_end in zip(segment_bounds[:-1], segment_bounds[1:]):
#             seg_state = states[s_start]
#             if seg_state == 'L':
#                 fixations_left += 1
#                 if first_fix_idx is None:
#                     first_fix_idx = s_start
#             elif seg_state == 'R':
#                 fixations_right += 1
#                 if first_fix_idx is None:
#                     first_fix_idx = s_start
#         # time to first fixation (seconds) relative to trial start (first timestamp in g)
#         if first_fix_idx is not None and len(g) > 0:
#             try:
#                 first_fix_time = g['times'].values[first_fix_idx]
#                 trial_start_time = g['times'].values[0]
#                 tff = (first_fix_time - trial_start_time) / 1000.0
#             except Exception:
#                 tff = np.nan
#             ffl = 'left' if states[first_fix_idx] == 'L' else 'right' if states[first_fix_idx] == 'R' else None
#         else:
#             tff = np.nan
#             ffl = None

#         # revisits = number of returns to an AOI after leaving it = max(0, fixations - 1)
#         revisits_left = max(0, fixations_left - 1)
#         revisits_right = max(0, fixations_right - 1)

#         et_results.append({
#             'participant_id': participant_id,
#             'trial': trial,
#             'dominant': dominant,
#             'prop_left': round(prop_left, 2) if not np.isnan(prop_left) else np.nan,
#             'prop_right': round(prop_right, 2) if not np.isnan(prop_right) else np.nan,
#             'total_dwell': round(total_dwell, 2),
#             'dwell_left': dwell_left,
#             'dwell_right': dwell_right,
#             'n_fix_left': n_fix_left,
#             'n_fix_right': n_fix_right,
#             'transitions': transitions,
#             # added metrics
#             'tff': tff,                       # time to first fixation (s)
#             'ffl': ffl,                       # first fixation location: 'left' or 'right' (or None)
#             'fixations_left': fixations_left, # number of discrete left-AOI fixations
#             'fixations_right': fixations_right,# number of discrete right-AOI fixations
#             'fixation_count_total': fixations_left + fixations_right,
#             'revisits_left': revisits_left,   # returns to left after leaving
#             'revisits_right': revisits_right, # returns to right after leaving
#             'revisits_total': revisits_left + revisits_right
#         })
# et_df = pd.DataFrame(et_results)

# # Remove certain participants
# et_df = et_df[~et_df['participant_id'].isin(list_of_participants_to_remove)]
# et_df

In [150]:
# Eye tracking data
et_dir = os.path.join(ROOT, 'eyetracking_data')

def parse_eyetracking_file(filepath):
    df = pd.read_csv(filepath)  # read one participant CSV
    df.columns = [c.lower() for c in df.columns]  # normalize column names
    trial_col = 'trial' if 'trial' in df.columns else [c for c in df.columns if 'trial' in c][0]  # find trial column
    if trial_col != 'trial':
        df = df.rename(columns={trial_col: 'trial'})  # ensure consistent column name
    return df

def load_eyetracking_directory(et_directory, skip_ids=None):
    records = []  # collected per-participant frames
    skip_ids = set(skip_ids or [])  # convert skip list to set
    for csv_file in glob(os.path.join(et_directory, '*.csv')):
        participant_id = os.path.splitext(os.path.basename(csv_file))[0]  # derive participant id from filename
        if participant_id in skip_ids:
            continue  # skip flagged participants
        df = parse_eyetracking_file(csv_file)  # load participant samples
        df['participant_id'] = participant_id  # keep participant id in rows
        records.append(df)
    if not records:
        return pd.DataFrame()  # handle empty directory case
    return pd.concat(records, ignore_index=True)  # combine all samples

raw_et = load_eyetracking_directory(et_dir, list_of_participants_to_remove)  # load all usable samples

if 'unnamed: 4' in raw_et.columns:
    raw_et = raw_et.drop(columns=['unnamed: 4'])  # drop stray export column
raw_et

Unnamed: 0,trial,times,_left_canvas,_right_canvas,participant_id
0,6,0,0,1,59679c319febf80001d53655
1,6,107,0,0,59679c319febf80001d53655
2,6,176,0,0,59679c319febf80001d53655
3,6,226,0,1,59679c319febf80001d53655
4,6,272,0,0,59679c319febf80001d53655
...,...,...,...,...,...
57748,49,4815,0,1,67698e94c727a4a942390c57
57749,49,4853,0,1,67698e94c727a4a942390c57
57750,49,4893,0,1,67698e94c727a4a942390c57
57751,49,4927,0,1,67698e94c727a4a942390c57


In [151]:
# Print raw_et head (10)
print(raw_et.head(10))

   trial  times  _left_canvas  _right_canvas            participant_id
0      6      0             0              1  59679c319febf80001d53655
1      6    107             0              0  59679c319febf80001d53655
2      6    176             0              0  59679c319febf80001d53655
3      6    226             0              1  59679c319febf80001d53655
4      6    272             0              0  59679c319febf80001d53655
5      6    344             0              0  59679c319febf80001d53655
6      6    397             0              1  59679c319febf80001d53655
7      6    444             0              0  59679c319febf80001d53655
8      6    490             1              0  59679c319febf80001d53655
9      6    538             1              0  59679c319febf80001d53655


## IDT Classifier

In [152]:
# Compute per-sample durations from timestamp series
def compute_sample_durations(times: pd.Series) -> np.ndarray:
    """
    Given a pandas Series of timestamps (in milliseconds), compute the duration
    of each sample as the difference between consecutive timestamps, with the first
    sample's duration set to the same as the second sample's duration. Negative
    durations (due to timestamp irregularities) are clipped to zero.
    Args:
        times (pd.Series): Series of timestamps in milliseconds.
    Returns:
        np.ndarray: Array of sample durations in milliseconds.
    """
    # Convert to numpy array of floats
    arr = times.to_numpy(dtype=float)
    # Handle empty series
    if arr.size == 0:
        return np.array([], dtype=float)
    # Handle single-sample series
    if arr.size == 1:
        return np.array([0.0], dtype=float)
    # Compute differences with first value prepended
    diffs = np.diff(arr, prepend=arr[0])
    # Copy second diff into first slot
    diffs[0] = diffs[1]
    # Clip negatives to zero
    return np.clip(diffs, a_min=0.0, a_max=None)

# Detect fixations via I-DT algorithm and label samples
def detect_fixations_idt(et_samples: pd.DataFrame,
                         min_duration_ms: float,
                         dispersion_threshold: float = 0.5) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Detect fixations in eyetracking samples using the I-DT algorithm.
    Args:
        et_samples (pd.DataFrame): DataFrame containing eyetracking samples with columns:
            - 'participant_id': Identifier for participant
            - 'trial': Trial number
            - 'times': Timestamp in milliseconds
            - '_left_canvas': Binary indicator for left AOI
            - '_right_canvas': Binary indicator for right AOI
        min_duration_ms (float): Minimum duration (ms) for a fixation.
        dispersion_threshold (float): Maximum dispersion threshold for a fixation.
    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: 
            - DataFrame of samples with 'clean_state' column labeled as 'left', 'right', or 'saccade'.
            - DataFrame summarizing detected fixations with columns:
                'participant_id', 'trial', 'AOI', 'start_time', 'end_time', 'duration_ms'.
    """
    # Copy input to avoid mutation
    samples = et_samples.copy()
    # Initialize clean_state column
    samples['clean_state'] = 'saccade'
    # Collect fixation summaries
    fixations = []
    # Process data per participant and trial
    for (pid, trial), trial_df in samples.groupby(['participant_id', 'trial']):
        # Sort by time and keep original indices
        g = trial_df.sort_values('times').copy()
        g['orig_index'] = g.index
        # Map AOIs to numeric positions
        g['x'] = np.where(g['_left_canvas'] == 1, 0.0,
                          np.where(g['_right_canvas'] == 1, 1.0, np.nan))
        # Drop rows without AOI and reset index
        g = g.dropna(subset=['x']).reset_index(drop=True)
        # Skip trials without usable samples
        if g.empty:
            continue
        # Precompute sample durations
        g['duration_ms'] = compute_sample_durations(g['times'])
        # Initialize window start pointer
        start = 0
        # Slide window across samples
        while start < len(g):
            # Initialize window end pointer
            end = start
            # Track cumulative window duration
            window_dur = 0.0
            # Grow window until minimum duration reached
            while end < len(g) and window_dur < min_duration_ms:
                window_dur += g.loc[end, 'duration_ms']
                end += 1
            # Break if window extends past data
            if end > len(g):
                break
            # Slice current window
            window = g.iloc[start:end]
            # Measure positional dispersion
            dispersion = window['x'].max() - window['x'].min()
            # Check dispersion threshold
            if dispersion <= dispersion_threshold:
                # Try to extend window while dispersion stays low
                while end < len(g):
                    candidate = g.iloc[start:end+1]
                    if (candidate['x'].max() - candidate['x'].min()) > dispersion_threshold:
                        break
                    window_dur += g.loc[end, 'duration_ms']
                    end += 1
                # Final fixation slice
                fixation = g.iloc[start:end]
                # Determine dominant AOI
                aoi = 'left' if fixation['_left_canvas'].sum() > fixation['_right_canvas'].sum() else 'right'
                # Mark samples belonging to fixation
                samples.loc[fixation['orig_index'], 'clean_state'] = aoi
                # Store fixation summary
                fixations.append({
                    'participant_id': pid,
                    'trial': trial,
                    'AOI': aoi,
                    'start_time': fixation['times'].iloc[0],
                    'end_time': fixation['times'].iloc[-1],
                    'duration_ms': fixation['duration_ms'].sum()
                })
                # Move start pointer to end of fixation
                start = end
            else:
                # Advance start pointer to search next window
                start += 1
    # Return labeled samples and fixation dataframe
    return samples, pd.DataFrame(fixations)

# Run fixation detection on raw samples
clean_et_samples, idt_fixations = detect_fixations_idt(raw_et, min_duration_ms=100)

In [153]:
clean_et_samples

Unnamed: 0,trial,times,_left_canvas,_right_canvas,participant_id,clean_state
0,6,0,0,1,59679c319febf80001d53655,right
1,6,107,0,0,59679c319febf80001d53655,saccade
2,6,176,0,0,59679c319febf80001d53655,saccade
3,6,226,0,1,59679c319febf80001d53655,right
4,6,272,0,0,59679c319febf80001d53655,saccade
...,...,...,...,...,...,...
57748,49,4815,0,1,67698e94c727a4a942390c57,right
57749,49,4853,0,1,67698e94c727a4a942390c57,right
57750,49,4893,0,1,67698e94c727a4a942390c57,right
57751,49,4927,0,1,67698e94c727a4a942390c57,right


In [154]:
idt_fixations

Unnamed: 0,participant_id,trial,AOI,start_time,end_time,duration_ms
0,59679c319febf80001d53655,6,right,0,397,623.0
1,59679c319febf80001d53655,6,left,490,1607,1210.0
2,59679c319febf80001d53655,6,right,1707,1707,100.0
3,59679c319febf80001d53655,6,left,1755,2231,524.0
4,59679c319febf80001d53655,7,right,187,1071,931.0
...,...,...,...,...,...,...
3999,67698e94c727a4a942390c57,48,right,1797,2731,978.0
4000,67698e94c727a4a942390c57,49,right,176,176,107.0
4001,67698e94c727a4a942390c57,49,right,263,604,392.0
4002,67698e94c727a4a942390c57,49,left,655,1822,1218.0


In [155]:
print(idt_fixations.head(10))

             participant_id  trial    AOI  start_time  end_time  duration_ms
0  59679c319febf80001d53655      6  right           0       397        623.0
1  59679c319febf80001d53655      6   left         490      1607       1210.0
2  59679c319febf80001d53655      6  right        1707      1707        100.0
3  59679c319febf80001d53655      6   left        1755      2231        524.0
4  59679c319febf80001d53655      7  right         187      1071        931.0
5  59679c319febf80001d53655      8  right           0       438        534.0
6  59679c319febf80001d53655      8   left         490      1187        749.0
7  59679c319febf80001d53655     10  right           0       620        664.0
8  59679c319febf80001d53655     10   left         745      1072        452.0
9  59679c319febf80001d53655     10  right        1121      1886        814.0


In [156]:
# df = raw_et.copy()

# # Create AOI label
# def get_aoi(row):
#     if row['_left_canvas'] == 1:
#         return 'left'
#     elif row['_right_canvas'] == 1:
#         return 'right'
#     else:
#         return 'none'

# df['AOI'] = df.apply(get_aoi, axis=1)

# # Set minimum fixation duration (e.g., 100 ms)
# MIN_DURATION = 100  # milliseconds

# # Group into AOI episodes
# fixations = []
# start_idx = 0

# for i in range(1, len(df)):
#     if df.loc[i, 'AOI'] != df.loc[start_idx, 'AOI']:
#         duration = df.loc[i - 1, 'times'] - df.loc[start_idx, 'times']
#         if duration >= MIN_DURATION:
#             fixations.append({
#                 'start_time': df.loc[start_idx, 'times'],
#                 'end_time': df.loc[i - 1, 'times'],
#                 'AOI': df.loc[start_idx, 'AOI']
#             })
#         start_idx = i

# # Handle last fixation
# duration = df.loc[len(df) - 1, 'times'] - df.loc[start_idx, 'times']
# if duration >= MIN_DURATION:
#     fixations.append({
#         'start_time': df.loc[start_idx, 'times'],
#         'end_time': df.loc[len(df) - 1, 'times'],
#         'AOI': df.loc[start_idx, 'AOI']
#     })

# # Convert to DataFrame
# fixation_df = pd.DataFrame(fixations)

# # Count transitions
# transitions = (fixation_df['AOI'] != fixation_df['AOI'].shift()).sum() - 1
# print("Number of transitions:", transitions)

## Analysis

In [157]:
# # Eye tracking metric aggregation
# def summarize_eyetracking(et_samples: pd.DataFrame) -> pd.DataFrame:
#     if et_samples.empty:
#         return pd.DataFrame(columns=[
#             'participant_id', 'trial', 'dominant', 'prop_left', 'prop_right', 'total_dwell',
#             'dwell_left', 'dwell_right', 'n_fix_left', 'n_fix_right', 'transitions',
#             'tff', 'ffl', 'fixations_left', 'fixations_right', 'fixation_count_total',
#             'revisits_left', 'revisits_right', 'revisits_total'
#         ])
#     et_results = []
#     for (participant_id, trial), g in et_samples.groupby(['participant_id', 'trial']):
#         g = g.sort_values('times')
#         dwell_left = g.loc[g['_left_canvas'] == 1, 'times'].max() / 1000 if (g['_left_canvas'] == 1).any() else 0
#         dwell_right = g.loc[g['_right_canvas'] == 1, 'times'].max() / 1000 if (g['_right_canvas'] == 1).any() else 0
#         total_dwell = dwell_left + dwell_right
#         prop_left = dwell_left / total_dwell if total_dwell > 0 else np.nan
#         prop_right = dwell_right / total_dwell if total_dwell > 0 else np.nan
#         dominant = 'left' if dwell_left > dwell_right else 'right'
#         left_changes = (g['_left_canvas'] != g['_left_canvas'].shift()).astype(int)
#         n_fix_left = ((g['_left_canvas'] == 1) & (left_changes == 1)).sum()
#         right_changes = (g['_right_canvas'] != g['_right_canvas'].shift()).astype(int)
#         n_fix_right = ((g['_right_canvas'] == 1) & (right_changes == 1)).sum()
#         canvas_state = np.where(g['_left_canvas'] == 1, 'L', np.where(g['_right_canvas'] == 1, 'R', 'N'))
#         transitions = np.sum(canvas_state[1:] != canvas_state[:-1])

#         change_idx = np.where(canvas_state[1:] != canvas_state[:-1])[0] + 1
#         segment_bounds = np.concatenate(([0], change_idx, [len(canvas_state)]))
#         fixations_left = 0
#         fixations_right = 0
#         first_fix_idx = None
#         for s_start, s_end in zip(segment_bounds[:-1], segment_bounds[1:]):
#             seg_state = canvas_state[s_start]
#             if seg_state == 'L':
#                 fixations_left += 1
#                 if first_fix_idx is None:
#                     first_fix_idx = s_start
#             elif seg_state == 'R':
#                 fixations_right += 1
#                 if first_fix_idx is None:
#                     first_fix_idx = s_start

#         if first_fix_idx is not None and len(g) > 0:
#             try:
#                 first_fix_time = g['times'].values[first_fix_idx]
#                 trial_start_time = g['times'].values[0]
#                 tff = (first_fix_time - trial_start_time) / 1000.0
#             except Exception:
#                 tff = np.nan
#             ffl = 'left' if canvas_state[first_fix_idx] == 'L' else 'right' if canvas_state[first_fix_idx] == 'R' else None
#         else:
#             tff = np.nan
#             ffl = None

#         revisits_left = max(0, fixations_left - 1)
#         revisits_right = max(0, fixations_right - 1)

#         et_results.append({
#             'participant_id': participant_id,
#             'trial': trial,
#             'dominant': dominant,
#             'prop_left': round(prop_left, 2) if not np.isnan(prop_left) else np.nan,
#             'prop_right': round(prop_right, 2) if not np.isnan(prop_right) else np.nan,
#             'total_dwell': round(total_dwell, 2),
#             'dwell_left': dwell_left,
#             'dwell_right': dwell_right,
#             'n_fix_left': n_fix_left,
#             'n_fix_right': n_fix_right,
#             'transitions': transitions,
#             'tff': tff,
#             'ffl': ffl,
#             'fixations_left': fixations_left,
#             'fixations_right': fixations_right,
#             'fixation_count_total': fixations_left + fixations_right,
#             'revisits_left': revisits_left,
#             'revisits_right': revisits_right,
#             'revisits_total': revisits_left + revisits_right
#         })
#     return pd.DataFrame(et_results)

# et_df = summarize_eyetracking(clean_et_samples) # raw_et
# et_df = et_df[~et_df['participant_id'].isin(list_of_participants_to_remove)]
# et_df

In [158]:
# Eye tracking metric aggregation based on fixation summary
def summarize_eyetracking(fixations: pd.DataFrame) -> pd.DataFrame:
    required_cols = {'participant_id', 'trial', 'AOI', 'start_time', 'end_time', 'duration_ms'}
    if fixations.empty:
        return pd.DataFrame([
            {
                'participant_id': pd.NA, 'trial': pd.NA, 'dominant': pd.NA,
                'prop_left': np.nan, 'prop_right': np.nan, 'total_dwell': np.nan,
                'dwell_left': np.nan, 'dwell_right': np.nan, 'n_fix_left': 0,
                'n_fix_right': 0, 'transitions': 0, 'tff': np.nan, 'ffl': pd.NA,
                'fixations_left': 0, 'fixations_right': 0, 'fixation_count_total': 0,
                'revisits_left': 0, 'revisits_right': 0, 'revisits_total': 0
            }
        ]).iloc[0:0]
    missing = required_cols - set(fixations.columns)
    if missing:
        raise ValueError(f"Missing required fixation columns: {sorted(missing)}")

    fix = (
        fixations
        .dropna(subset=['participant_id', 'trial'])
        .copy()
    )
    fix['duration_ms'] = pd.to_numeric(fix['duration_ms'], errors='coerce').fillna(0.0)
    fix['start_time'] = pd.to_numeric(fix['start_time'], errors='coerce')
    fix['end_time'] = pd.to_numeric(fix['end_time'], errors='coerce')
    fix = fix.dropna(subset=['start_time', 'end_time'])
    fix = fix.sort_values(['participant_id', 'trial', 'start_time'])

    metrics = []
    for (participant_id, trial), grp in fix.groupby(['participant_id', 'trial'], sort=True):
        durations = grp['duration_ms']
        dwell_left_sec = float(durations[grp['AOI'] == 'left'].sum()) / 1000.0
        dwell_right_sec = float(durations[grp['AOI'] == 'right'].sum()) / 1000.0
        total_dwell_sec = dwell_left_sec + dwell_right_sec

        if total_dwell_sec > 0:
            prop_left = dwell_left_sec / total_dwell_sec
            prop_right = dwell_right_sec / total_dwell_sec
            if dwell_left_sec > dwell_right_sec:
                dominant = 'left'
            elif dwell_right_sec > dwell_left_sec:
                dominant = 'right'
            else:
                dominant = 'left'
        else:
            prop_left = np.nan
            prop_right = np.nan
            dominant = None

        aoi_series = grp['AOI'].astype(str)
        transitions = int(max((aoi_series != aoi_series.shift()).sum() - 1, 0))
        fixations_left = int((aoi_series == 'left').sum())
        fixations_right = int((aoi_series == 'right').sum())

        first_fix_start_ms = float(grp['start_time'].iloc[0]) if len(grp) else np.nan
        trial_baseline_ms = float(grp['start_time'].min()) if len(grp) else np.nan
        if np.isnan(first_fix_start_ms):
            tff_sec = np.nan
        elif np.isnan(trial_baseline_ms):
            tff_sec = first_fix_start_ms / 1000.0
        else:
            tff_sec = (first_fix_start_ms - trial_baseline_ms) / 1000.0

        first_fix_location = aoi_series.iloc[0] if len(aoi_series) else None
        revisits_left = max(0, fixations_left - 1)
        revisits_right = max(0, fixations_right - 1)

        metrics.append({
            'participant_id': participant_id,
            'trial': trial,
            'dominant': dominant,
            'prop_left': round(prop_left, 2) if not np.isnan(prop_left) else np.nan,
            'prop_right': round(prop_right, 2) if not np.isnan(prop_right) else np.nan,
            'total_dwell': round(total_dwell_sec, 2),
            'dwell_left': round(dwell_left_sec, 2),
            'dwell_right': round(dwell_right_sec, 2),
            'n_fix_left': fixations_left,
            'n_fix_right': fixations_right,
            'transitions': transitions,
            'tff': tff_sec,
            'ffl': first_fix_location,
            'fixations_left': fixations_left,
            'fixations_right': fixations_right,
            'fixation_count_total': fixations_left + fixations_right,
            'revisits_left': revisits_left,
            'revisits_right': revisits_right,
            'revisits_total': revisits_left + revisits_right
        })

    return pd.DataFrame(metrics)

et_df = summarize_eyetracking(idt_fixations) # raw_et or idt_fixations
et_df = et_df[~et_df['participant_id'].isin(list_of_participants_to_remove)]
et_df

Unnamed: 0,participant_id,trial,dominant,prop_left,prop_right,total_dwell,dwell_left,dwell_right,n_fix_left,n_fix_right,transitions,tff,ffl,fixations_left,fixations_right,fixation_count_total,revisits_left,revisits_right,revisits_total
0,59679c319febf80001d53655,6,left,0.71,0.29,2.46,1.73,0.72,2,2,3,0.0,right,2,2,4,1,1,2
1,59679c319febf80001d53655,7,right,0.00,1.00,0.93,0.00,0.93,0,1,0,0.0,right,0,1,1,0,0,0
2,59679c319febf80001d53655,8,left,0.58,0.42,1.28,0.75,0.53,1,1,1,0.0,right,1,1,2,0,0,0
3,59679c319febf80001d53655,10,right,0.35,0.65,2.27,0.79,1.48,2,2,3,0.0,right,2,2,4,1,1,2
4,59679c319febf80001d53655,11,right,0.35,0.65,2.99,1.04,1.95,2,4,4,0.0,right,2,4,6,1,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301,67698e94c727a4a942390c57,45,right,0.37,0.63,3.14,1.17,1.97,4,3,4,0.0,left,4,3,7,3,2,5
1302,67698e94c727a4a942390c57,46,left,0.79,0.21,2.85,2.25,0.60,2,1,2,0.0,left,2,1,3,1,0,1
1303,67698e94c727a4a942390c57,47,left,0.70,0.30,16.25,11.36,4.89,5,4,8,0.0,left,5,4,9,4,3,7
1304,67698e94c727a4a942390c57,48,right,0.11,0.89,2.67,0.29,2.38,1,4,2,0.0,right,1,4,5,0,3,3


In [159]:
# clean_threshold_ms = 100  # minimum fixation duration (ms) to keep after cleaning short gaze episodes

# def compute_sample_durations(times: pd.Series) -> np.ndarray:
#     """Return per-sample dwell durations (ms) derived from time stamps."""
#     arr = times.to_numpy(dtype=float)
#     if arr.size == 0:
#         return np.array([], dtype=float)
#     if arr.size == 1:
#         return np.array([0.0], dtype=float)
#     diffs = np.diff(arr)
#     diffs = np.clip(diffs, a_min=0.0, a_max=None)
#     positive = diffs[diffs > 0]
#     fallback = float(np.median(positive)) if positive.size else 0.0
#     sample_durations = np.concatenate([diffs, [fallback]])
#     return sample_durations.astype(float)

# def merge_short_episodes(states: np.ndarray, sample_durations: np.ndarray, threshold_ms: float) -> np.ndarray:
#     """Merge or reassign gaze episodes shorter than threshold_ms following the cleaning recommendations."""
#     cleaned = states.astype(object).copy()
#     if cleaned.size == 0:
#         return cleaned
#     threshold_ms = float(threshold_ms)
#     while True:
#         change_idx = np.where(cleaned[1:] != cleaned[:-1])[0] + 1
#         bounds = np.concatenate(([0], change_idx, [len(cleaned)]))
#         durations = [sample_durations[start:end].sum() for start, end in zip(bounds[:-1], bounds[1:])]
#         changed = False
#         for idx, duration in enumerate(durations):
#             if duration >= threshold_ms:
#                 continue
#             start, end = bounds[idx], bounds[idx + 1]
#             prev_idx = idx - 1 if idx > 0 else None
#             next_idx = idx + 1 if idx + 1 < len(durations) else None
#             merge_state = None
#             if prev_idx is not None and next_idx is not None and cleaned[bounds[prev_idx]] == cleaned[bounds[next_idx]]:
#                 merge_state = cleaned[bounds[prev_idx]]
#             elif prev_idx is not None and next_idx is not None:
#                 merge_state = cleaned[bounds[prev_idx]] if durations[prev_idx] >= durations[next_idx] else cleaned[bounds[next_idx]]
#             elif prev_idx is not None:
#                 merge_state = cleaned[bounds[prev_idx]]
#             elif next_idx is not None:
#                 merge_state = cleaned[bounds[next_idx]]
#             if merge_state is None:
#                 continue
#             cleaned[start:end] = merge_state
#             changed = True
#             break
#         if not changed:
#             break
#     return cleaned

# def clean_gaze_segments(gaze_df: pd.DataFrame, threshold_ms: float) -> pd.DataFrame:
#     """Return gaze_df sorted with cleaned AOI assignments and sample durations."""
#     if gaze_df.empty:
#         return gaze_df.copy()
#     g_sorted = gaze_df.sort_values('times').reset_index(drop=True)
#     left = g_sorted['_left_canvas'].to_numpy()
#     right = g_sorted['_right_canvas'].to_numpy()
#     raw_states = np.where(
#         (left == 1) & (right == 0), 'L',
#         np.where(
#             (right == 1) & (left == 0), 'R',
#             np.where((left == 0) & (right == 0), 'N', 'B')
#         )
#     )
#     sample_durations = compute_sample_durations(g_sorted['times'])
#     if sample_durations.size < len(g_sorted):
#         pad_value = sample_durations[-1] if sample_durations.size else 0.0
#         sample_durations = np.pad(sample_durations, (0, len(g_sorted) - sample_durations.size), constant_values=pad_value)
#     cleaned_states = merge_short_episodes(raw_states, sample_durations, threshold_ms)
#     g_sorted['clean_state'] = cleaned_states
#     g_sorted['sample_duration_ms'] = sample_durations
#     return g_sorted

# # Aggregate results for all participants except those to remove
# et_results = []
# for csv_file in glob(os.path.join(et_dir, '*.csv')):
#     participant_id = os.path.splitext(os.path.basename(csv_file))[0]
#     if participant_id in list_of_participants_to_remove:
#         continue
#     df, trial_col = parse_eyetracking_file(csv_file)
#     for trial, g in df.groupby(trial_col):
#         g_clean = clean_gaze_segments(g, clean_threshold_ms)
#         if g_clean.empty:
#             continue

#         states_array = g_clean['clean_state'].to_numpy(dtype=object)
#         durations_ms = g_clean['sample_duration_ms'].to_numpy(dtype=float)
#         times = g_clean['times'].to_numpy(dtype=float)

#         raw_dwell_left = float(durations_ms[states_array == 'L'].sum()) / 1000.0
#         raw_dwell_right = float(durations_ms[states_array == 'R'].sum()) / 1000.0
#         total_dwell = raw_dwell_left + raw_dwell_right
#         prop_left = raw_dwell_left / total_dwell if total_dwell > 0 else np.nan
#         prop_right = raw_dwell_right / total_dwell if total_dwell > 0 else np.nan
#         dominant = 'left' if raw_dwell_left > raw_dwell_right else 'right'

#         transitions = 0
#         prev_state = None
#         for state in states_array:
#             if state not in ('L', 'R'):
#                 continue
#             if prev_state is not None and state != prev_state:
#                 transitions += 1
#             prev_state = state

#         if states_array.size > 0:
#             change_idx = np.where(states_array[1:] != states_array[:-1])[0] + 1
#             bounds = np.concatenate(([0], change_idx, [len(states_array)]))
#         else:
#             bounds = np.array([0, 0])

#         fixations_left = 0
#         fixations_right = 0
#         for start, end in zip(bounds[:-1], bounds[1:]):
#             state = states_array[start]
#             if state == 'L':
#                 fixations_left += 1
#             elif state == 'R':
#                 fixations_right += 1

#         revisits_left = max(0, fixations_left - 1)
#         revisits_right = max(0, fixations_right - 1)

#         fix_indices = np.where(np.isin(states_array, ['L', 'R']))[0]
#         if fix_indices.size > 0:
#             first_idx = int(fix_indices[0])
#             time0 = float(times[0])
#             tff = (float(times[first_idx]) - time0) / 1000.0
#             ffl = 'left' if states_array[first_idx] == 'L' else 'right'
#         else:
#             tff = np.nan
#             ffl = None

#         et_results.append({
#             'participant_id': participant_id,
#             'trial': trial,
#             'dominant': dominant,
#             'prop_left': float(round(prop_left, 2)) if not np.isnan(prop_left) else np.nan,
#             'prop_right': float(round(prop_right, 2)) if not np.isnan(prop_right) else np.nan,
#             'total_dwell': float(round(total_dwell, 2)),
#             'dwell_left': float(round(raw_dwell_left, 2)),
#             'dwell_right': float(round(raw_dwell_right, 2)),
#             'n_fix_left': int(fixations_left),
#             'n_fix_right': int(fixations_right),
#             'transitions': int(transitions),
#             'tff': float(round(tff, 3)) if not np.isnan(tff) else np.nan,
#             'ffl': ffl,
#             'fixations_left': int(fixations_left),
#             'fixations_right': int(fixations_right),
#             'fixation_count_total': int(fixations_left + fixations_right),
#             'revisits_left': int(revisits_left),
#             'revisits_right': int(revisits_right),
#             'revisits_total': int(revisits_left + revisits_right)
#         })

# et_df = pd.DataFrame(et_results)

# # Remove certain participants
# et_df = et_df[~et_df['participant_id'].isin(list_of_participants_to_remove)]
# et_df

## Merge ET and behavioral data

In [160]:
# # Overwrite 'trial' values in et_df per participant using the order from region_mean/trial_index
# for participant in et_df['participant_id'].unique():
#     # Get the ordered list of trial numbers for this participant from region_mean
#     # region_mean index: (results_time, participant_id, group, trial, label, no, item, condition, cb, left, right)
#     # Extract trial numbers for this participant
#     idx = region_mean.index
#     participant_trials = [i[3] for i in idx if i[1] == participant]
    
#     # Get indices in et_df for this participant
#     mask = et_df['participant_id'] == participant
#     n_trials = mask.sum()
    
#     # Only overwrite if counts match
#     if len(participant_trials) == n_trials:
#         et_df.loc[mask, 'trial'] = participant_trials
#     else:
#         print(f"Warning: trial count mismatch for {participant} (region_mean: {len(participant_trials)}, et_df: {n_trials})")

In [161]:
# Merge eyetracking data with longform data on participant_id & trial
et_df = pd.merge(
    longform,
    et_df,
    how='left',
    left_on=['participant_id', 'trial'],
    right_on=['participant_id', 'trial']
)


# Ensure any timezone-aware datetimes are made timezone-naive before writing to Excel
from pandas import DatetimeTZDtype
import datetime

def _series_is_tz_aware(s, sample_n=20):
    """Return True if Series `s` appears to be timezone-aware.
    First checks dtype (preferred), then falls back to inspecting a small sample of Python datetimes.
    """
    if isinstance(s.dtype, DatetimeTZDtype):
        return True
    # fallback: examine a small sample for tzinfo on python datetimes
    if s.dtype == object:
        sample = s.dropna().head(sample_n)
        for v in sample:
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                return True
    return False

for c in et_df.columns:
    try:
        if _series_is_tz_aware(et_df[c]):
            # If dtype is pandas tz-aware datetime, convert to UTC then drop tz
            if isinstance(et_df[c].dtype, DatetimeTZDtype):
                et_df[c] = et_df[c].dt.tz_convert('UTC').dt.tz_localize(None)
            else:
                # fallback: coerce via to_datetime(utc=True) then drop tz
                et_df[c] = pd.to_datetime(et_df[c], utc=True).dt.tz_localize(None)
    except Exception:
        # If conversion fails for any column, leave it as-is but warn for debugging
        print(f"Warning: failed to convert timezone-aware column '{c}' — leaving as-is")


# Sort et_df by participant_id, then no
et_df = et_df.sort_values(by=['participant_id', 'no']).reset_index(drop=True)

# Save et_df as results.xlsx and results.csv
et_df.to_excel("results.xlsx", index=False)
et_df.to_csv("results.csv", index=False)

et_df

Unnamed: 0,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,n_fix_right,transitions,tff,ffl,fixations_left,fixations_right,fixation_count_total,revisits_left,revisits_right,revisits_total
0,2025-10-08 11:45:02,59679c319febf80001d53655,a,36,experiment,1,1,1,exclusive,n,...,2.0,3.0,0.0,right,2.0,2.0,4.0,1.0,1.0,2.0
1,2025-10-08 11:45:02,59679c319febf80001d53655,a,50,experiment,6,2,1,contrastive,y,...,1.0,1.0,0.0,right,1.0,1.0,2.0,0.0,0.0,0.0
2,2025-10-08 11:45:02,59679c319febf80001d53655,a,18,experiment,8,3,1,focus,y,...,1.0,1.0,0.0,right,1.0,1.0,2.0,0.0,0.0,0.0
3,2025-10-08 11:45:02,59679c319febf80001d53655,a,43,experiment,10,4,1,exclusive,y,...,1.0,0.0,0.0,right,0.0,1.0,1.0,0.0,0.0,0.0
4,2025-10-08 11:45:02,59679c319febf80001d53655,a,10,experiment,15,5,1,contrastive,n,...,2.0,3.0,0.0,right,2.0,2.0,4.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,2025-09-29 13:59:02,67698e94c727a4a942390c57,c,29,experiment,41,14,1,focus,n,...,6.0,9.0,0.0,right,5.0,6.0,11.0,4.0,5.0,9.0
536,2025-09-29 13:59:02,67698e94c727a4a942390c57,c,15,experiment,43,15,1,exclusive,n,...,1.0,1.0,0.0,right,5.0,1.0,6.0,4.0,0.0,4.0
537,2025-09-29 13:59:02,67698e94c727a4a942390c57,c,25,experiment,48,16,1,contrastive,y,...,3.0,3.0,0.0,right,2.0,3.0,5.0,1.0,2.0,3.0
538,2025-09-29 13:59:02,67698e94c727a4a942390c57,c,24,experiment,50,17,1,focus,y,...,2.0,2.0,0.0,right,1.0,2.0,3.0,0.0,1.0,1.0


In [162]:
# Show how many rows of et_df have "59679c319febf80001d53655" as participant
et_df[et_df['participant_id'] == '59679c319febf80001d53655']

Unnamed: 0,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,n_fix_right,transitions,tff,ffl,fixations_left,fixations_right,fixation_count_total,revisits_left,revisits_right,revisits_total
0,2025-10-08 11:45:02,59679c319febf80001d53655,a,36,experiment,1,1,1,exclusive,n,...,2.0,3.0,0.0,right,2.0,2.0,4.0,1.0,1.0,2.0
1,2025-10-08 11:45:02,59679c319febf80001d53655,a,50,experiment,6,2,1,contrastive,y,...,1.0,1.0,0.0,right,1.0,1.0,2.0,0.0,0.0,0.0
2,2025-10-08 11:45:02,59679c319febf80001d53655,a,18,experiment,8,3,1,focus,y,...,1.0,1.0,0.0,right,1.0,1.0,2.0,0.0,0.0,0.0
3,2025-10-08 11:45:02,59679c319febf80001d53655,a,43,experiment,10,4,1,exclusive,y,...,1.0,0.0,0.0,right,0.0,1.0,1.0,0.0,0.0,0.0
4,2025-10-08 11:45:02,59679c319febf80001d53655,a,10,experiment,15,5,1,contrastive,n,...,2.0,3.0,0.0,right,2.0,2.0,4.0,1.0,1.0,2.0
5,2025-10-08 11:45:02,59679c319febf80001d53655,a,13,experiment,17,6,1,focus,n,...,1.0,2.0,0.0,left,2.0,1.0,3.0,1.0,0.0,1.0
6,2025-10-08 11:45:02,59679c319febf80001d53655,a,49,experiment,19,7,1,exclusive,n,...,1.0,1.0,0.0,right,1.0,1.0,2.0,0.0,0.0,0.0
7,2025-10-08 11:45:02,59679c319febf80001d53655,a,47,experiment,24,8,1,contrastive,y,...,1.0,0.0,0.0,right,0.0,1.0,1.0,0.0,0.0,0.0
8,2025-10-08 11:45:02,59679c319febf80001d53655,a,35,experiment,26,9,1,focus,y,...,1.0,1.0,0.0,left,1.0,1.0,2.0,0.0,0.0,0.0
9,2025-10-08 11:45:02,59679c319febf80001d53655,a,45,experiment,28,10,1,exclusive,y,...,1.0,0.0,0.0,right,0.0,1.0,1.0,0.0,0.0,0.0


In [163]:
# Show how many rows of et_df have "1" as item
et_df[et_df['item'] == 1]

Unnamed: 0,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,n_fix_right,transitions,tff,ffl,fixations_left,fixations_right,fixation_count_total,revisits_left,revisits_right,revisits_total
0,2025-10-08 11:45:02,59679c319febf80001d53655,a,36,experiment,1,1,1,exclusive,n,...,2.0,3.0,0.0,right,2.0,2.0,4.0,1.0,1.0,2.0
18,2025-10-06 21:02:00,5b93d1913dca6000012c5fdc,c,33,experiment,3,1,1,contrastive,n,...,1.0,0.0,0.0,right,0.0,1.0,1.0,0.0,0.0,0.0
36,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,36,experiment,1,1,1,exclusive,n,...,4.0,1.0,0.0,right,2.0,4.0,6.0,1.0,3.0,4.0
54,2025-09-25 17:42:12,5d4fe6a2ffbcf800019d5e54,c,33,experiment,3,1,1,contrastive,n,...,3.0,0.0,0.0,right,0.0,3.0,3.0,0.0,2.0,2.0
72,2025-09-22 18:23:58,5dade76a4860f70017f70ec5,c,33,experiment,3,1,1,contrastive,n,...,2.0,4.0,0.0,left,3.0,2.0,5.0,2.0,1.0,3.0
90,2025-09-25 08:49:56,5e3b29dc87243b34bde5abfa,a,36,experiment,1,1,1,exclusive,n,...,1.0,1.0,0.0,left,1.0,1.0,2.0,0.0,0.0,0.0
108,2025-09-29 12:25:35,5ee75f3d1a88450293a38aeb,b,34,experiment,2,1,1,focus,y,...,1.0,0.0,0.0,right,0.0,1.0,1.0,0.0,0.0,0.0
126,2025-09-29 11:58:01,5f3013e31c8a690aacb02c31,a,36,experiment,1,1,1,exclusive,n,...,1.0,2.0,0.0,left,5.0,1.0,6.0,4.0,0.0,4.0
144,2025-09-25 06:25:58,5f338ba6ea047119dbd6e49e,c,33,experiment,3,1,1,contrastive,n,...,2.0,0.0,0.0,right,0.0,2.0,2.0,0.0,1.0,1.0
162,2025-10-10 10:19:08,5f3ce934a12769b771503625,c,33,experiment,3,1,1,contrastive,n,...,2.0,2.0,0.0,right,1.0,2.0,3.0,0.0,1.0,1.0


## Plot ET results

### Classified samples

#### All trials

In [164]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import os

transparent_color = 'hsva(0, 0%, 100%, 0)'

# Facet by condition into rows (one condition per row)
conditions = list(longform['condition'].dropna().unique())
n_rows = max(1, len(conditions))

fig = make_subplots(
    rows=n_rows, cols=1,
    shared_xaxes=True,
    subplot_titles=conditions,
    vertical_spacing=0.02
)

# Track per-condition ordering and metadata for categorical y-axes
facet_yticks = {cond: [] for cond in conditions}
facet_tick_meta = {cond: {} for cond in conditions}

fixations_for_plot = (
    idt_fixations
    .copy()
    .loc[~idt_fixations['participant_id'].isin(list_of_participants_to_remove)]
)
fixations_for_plot['AOI'] = fixations_for_plot['AOI'].astype(str).str.lower()
fixations_for_plot['start_time'] = pd.to_numeric(fixations_for_plot['start_time'], errors='coerce')
fixations_for_plot['end_time'] = pd.to_numeric(fixations_for_plot['end_time'], errors='coerce')
fixations_for_plot['duration_ms'] = pd.to_numeric(fixations_for_plot['duration_ms'], errors='coerce')
fixations_for_plot = fixations_for_plot.dropna(subset=['participant_id', 'trial', 'start_time', 'end_time'])
fixations_for_plot['duration_ms'] = fixations_for_plot['duration_ms'].fillna(
    fixations_for_plot['end_time'] - fixations_for_plot['start_time']
)
fixations_for_plot = fixations_for_plot[fixations_for_plot['duration_ms'] > 0]
fixations_for_plot['start_s'] = fixations_for_plot['start_time'] / 1000.0
fixations_for_plot['duration_s'] = fixations_for_plot['duration_ms'] / 1000.0
fixations_for_plot['trial_key'] = fixations_for_plot['trial'].astype(str)

color_map = {
    'left': '#5e81ac',
    'right': '#a3be8c',
    'none': dark[2],
    'neutral': dark[3]
}

side_label_map = {
    'left': 'Left',
    'right': 'Right',
    'none': 'None',
    'neutral': 'None'
}

for participant in [p for p in et_df['participant_id'].unique() if p not in list_of_participants_to_remove]:
    df_part = et_df[et_df['participant_id'] == participant]
    sorted_trials = sorted(
        df_part['trial'].unique(),
        key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x
    )
    for trial in sorted_trials:
        lf_row = longform[(longform['participant_id'] == participant) & (longform['trial'] == trial)]
        if lf_row.empty:
            continue
        cond = lf_row['condition'].iloc[0]
        if pd.isna(cond) or cond not in conditions:
            continue

        if 'item' in lf_row.columns and pd.notna(lf_row['item'].iloc[0]):
            item_label = lf_row['item'].iloc[0]
        else:
            item_label = trial
        label = f"{participant} - Item {item_label}"
        if label not in facet_yticks[cond]:
            facet_yticks[cond].append(label)

        et_row = df_part[df_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        mismatch_flag = False
        if 'mismatch' in lf_row.columns:
            try:
                mismatch_flag = bool(lf_row['mismatch'].iloc[0])
            except Exception:
                mismatch_flag = False

        facet_tick_meta[cond][label] = {
            'pid_short': str(participant)[:5],
            'choice': choice if pd.notna(choice) else 'NA',
            'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
            'mismatch': mismatch_flag
        }

        trial_fix = (
            fixations_for_plot[
                (fixations_for_plot['participant_id'] == participant) &
                (fixations_for_plot['trial_key'] == str(trial))
            ]
            .sort_values(['start_time', 'end_time'])
        )

        if trial_fix.empty:
            continue

        row_idx = conditions.index(cond) + 1
        for _, fix in trial_fix.iterrows():
            aoi = fix['AOI']
            color = color_map.get(aoi, "red")
            side = side_label_map.get(aoi, aoi.title())
            seg_time = float(fix['duration_s'])
            seg_start_s = float(fix['start_s'])

            customdata = [[
                participant,
                item_label,
                trial,
                cond,
                side,
                seg_start_s,
                seg_time,
                dominant or 'N/A',
                choice or 'N/A',
                'Yes' if mismatch_flag else 'No'
            ]]
            fig.add_trace(
                go.Bar(
                    x=[seg_time],
                    y=[label],
                    name=side,
                    marker_color=color,
                    orientation='h',
                    showlegend=False,
                    customdata=customdata,
                    hovertemplate=(
                        "<b>%{customdata[0]}</b><br>"
                        "Condition: %{customdata[3]}<br>"
                        "Item: %{customdata[1]}<br>"
                        "Trial: %{customdata[2]}<br>"
                        "Side: %{customdata[4]}<br>"
                        "Segment start: %{customdata[5]:.2f}s<br>"
                        "Segment duration: %{x:.2f}s<br>"
                        "Dominant: %{customdata[7]}<br>"
                        "Choice: %{customdata[8]}<br>"
                        "Mismatch: %{customdata[9]}<extra></extra>"
                    )
                ),
                row=row_idx, col=1
            )

# Format y-axis ticks per condition
for i, cond in enumerate(conditions, start=1):
    labels = facet_yticks[cond]

    def format_label(lbl):
        meta = facet_tick_meta[cond].get(lbl, {})
        pid_short = meta.get('pid_short', 'NA')
        choice_val = meta.get('choice', 'NA')
        base = f"{pid_short} · {choice_val}"
        bold = meta.get('dominant_matches', False)
        mismatch = meta.get('mismatch', False)

        if bold:
            base = f"<span style='color:#88c0d0'>{base}</span>"
        if mismatch:
            inner = f"<s>{base}</s>"
        else:
            inner = base
        return inner

    ticktext = [format_label(t) for t in labels]
    fig.update_yaxes(
        categoryorder='array',
        categoryarray=labels,
        tickvals=labels,
        ticktext=ticktext,
        showgrid=False,
        ticks='',
        row=i, col=1
    )

fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)

fig.update_layout(
    template='nord_dark',
    barmode='stack',
    bargap=0.1,
    bargroupgap=0.0,
    title='Eye-Tracking: Stacked Dwell Time Segments (Transitions)',
    height=max(4800, 400 * n_rows),
    # width=1000,
    showlegend=True
)

fig.show()

fig.write_html(plots / "dwell_times.html", include_plotlyjs='cdn')
fig.write_image(plots / "dwell_times.png", scale=3)

#### Per Item

In [165]:
def plot_item_dwell(item_value,
                    longform_df=None,
                    et_summary_df=None,
                    fixations_df=None,
                    skip_ids=None,
                    template='nord_dark',
                    save=True,
                    height_per_row=200):
    
    lf = longform_df if longform_df is not None else globals().get('longform')
    et_summary = et_summary_df if et_summary_df is not None else globals().get('et_df')
    fixations = fixations_df if fixations_df is not None else globals().get('idt_fixations')
    plots_dir = globals().get('plots')
    skip_ids = set(skip_ids if skip_ids is not None else globals().get('list_of_participants_to_remove', []))

    if lf is None or 'item' not in lf.columns:
        raise ValueError("longform DataFrame with an 'item' column is required.")
    if et_summary is None or fixations is None:
        raise ValueError("et_summary_df and fixations_df (or globals et_df / idt_fixations) are required.")

    lf_item = lf.loc[lf['item'].astype(str) == str(item_value)].copy()
    lf_item = lf_item[~lf_item['participant_id'].isin(skip_ids)]
    if lf_item.empty:
        raise ValueError(f"No trials found for item {item_value!r}.")

    conditions = list(lf_item['condition'].dropna().unique())
    if not conditions:
        raise ValueError(f"No conditions available for item {item_value!r}.")
    n_rows = max(1, len(conditions))

    fig = make_subplots(
        rows=n_rows,
        cols=1,
        shared_xaxes=True,
        subplot_titles=conditions,
        vertical_spacing=0.08
    )

    facet_yticks = {cond: [] for cond in conditions}
    facet_tick_meta = {cond: {} for cond in conditions}

    fix_plot = (
        fixations.copy()
        .loc[~fixations['participant_id'].isin(skip_ids)]
    )
    fix_plot['AOI'] = fix_plot['AOI'].astype(str).str.lower()
    fix_plot['start_time'] = pd.to_numeric(fix_plot['start_time'], errors='coerce')
    fix_plot['end_time'] = pd.to_numeric(fix_plot['end_time'], errors='coerce')
    fix_plot['duration_ms'] = pd.to_numeric(fix_plot['duration_ms'], errors='coerce')
    fix_plot = fix_plot.dropna(subset=['participant_id', 'trial', 'start_time', 'end_time'])
    fix_plot['duration_ms'] = fix_plot['duration_ms'].fillna(fix_plot['end_time'] - fix_plot['start_time'])
    fix_plot = fix_plot[fix_plot['duration_ms'] > 0]
    fix_plot['start_s'] = fix_plot['start_time'] / 1000.0
    fix_plot['duration_s'] = fix_plot['duration_ms'] / 1000.0
    fix_plot['trial_key'] = fix_plot['trial'].astype(str)
    fix_plot['pair_key'] = fix_plot['participant_id'].astype(str) + '||' + fix_plot['trial_key']

    selected_pairs = set(
        (lf_item['participant_id'].astype(str) + '||' + lf_item['trial'].astype(str)).dropna()
    )
    fix_plot = fix_plot[fix_plot['pair_key'].isin(selected_pairs)].copy()
    fix_plot.drop(columns=['pair_key'], inplace=True)

    item_participants = lf_item['participant_id'].unique()
    color_map = {
        'left': '#5e81ac',
        'right': '#a3be8c',
        'none': dark[2],
        'neutral': dark[3]
    }
    side_label_map = {
        'left': 'Left',
        'right': 'Right',
        'none': 'None',
        'neutral': 'None'
    }

    for participant in item_participants:
        lf_part = lf_item[lf_item['participant_id'] == participant]
        trials = sorted(
            lf_part['trial'].dropna().unique(),
            key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x
        )
        df_part = et_summary[et_summary['participant_id'] == participant]

        for trial in trials:
            lf_row = lf_part[lf_part['trial'] == trial]
            if lf_row.empty:
                continue
            cond = lf_row['condition'].iloc[0]
            if pd.isna(cond) or cond not in conditions:
                continue

            item_label = lf_row['item'].iloc[0]
            label = f"{participant} - Item {item_label}"
            if label not in facet_yticks[cond]:
                facet_yticks[cond].append(label)

            et_row = df_part[df_part['trial'] == trial]
            dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
            choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

            mismatch_flag = False
            if 'mismatch' in lf_row.columns:
                try:
                    mismatch_flag = bool(lf_row['mismatch'].iloc[0])
                except Exception:
                    mismatch_flag = False

            facet_tick_meta[cond][label] = {
                'pid_short': str(participant)[:5],
                'choice': choice if pd.notna(choice) else 'NA',
                'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
                'mismatch': mismatch_flag
            }

            trial_fix = (
                fix_plot[
                    (fix_plot['participant_id'] == participant) &
                    (fix_plot['trial_key'] == str(trial))
                ]
                .sort_values(['start_time', 'end_time'])
            )
            if trial_fix.empty:
                continue

            row_idx = conditions.index(cond) + 1
            for _, fix in trial_fix.iterrows():
                aoi = fix['AOI']
                color = color_map.get(aoi, "red")
                side = side_label_map.get(aoi, aoi.title())
                seg_time = float(fix['duration_s'])
                seg_start_s = float(fix['start_s'])

                customdata = [[
                    participant,
                    item_label,
                    trial,
                    cond,
                    side,
                    seg_start_s,
                    seg_time,
                    dominant or 'N/A',
                    choice or 'N/A',
                    'Yes' if mismatch_flag else 'No'
                ]]
                fig.add_trace(
                    go.Bar(
                        x=[seg_time],
                        y=[label],
                        name=side,
                        marker_color=color,
                        orientation='h',
                        showlegend=False,
                        customdata=customdata,
                        hovertemplate=(
                            "<b>%{customdata[0]}</b><br>"
                            "Condition: %{customdata[3]}<br>"
                            "Item: %{customdata[1]}<br>"
                            "Trial: %{customdata[2]}<br>"
                            "Side: %{customdata[4]}<br>"
                            "Segment start: %{customdata[5]:.2f}s<br>"
                            "Segment duration: %{x:.2f}s<br>"
                            "Dominant: %{customdata[7]}<br>"
                            "Choice: %{customdata[8]}<br>"
                            "Mismatch: %{customdata[9]}<extra></extra>"
                        )
                    ),
                    row=row_idx,
                    col=1
                )

    for i, cond in enumerate(conditions, start=1):
        labels = facet_yticks[cond]

        def format_label(lbl):
            meta = facet_tick_meta[cond].get(lbl, {})
            pid_short = meta.get('pid_short', 'NA')
            choice_val = meta.get('choice', 'NA')
            base = f"{pid_short} · {choice_val}"
            if meta.get('dominant_matches', False):
                base = f"<span style='color:#88c0d0'>{base}</span>"
            if meta.get('mismatch', False):
                return f"<s>{base}</s>"
            return base

        ticktext = [format_label(t) for t in labels]
        fig.update_yaxes(
            categoryorder='array',
            categoryarray=labels,
            tickvals=labels,
            ticktext=ticktext,
            showgrid=False,
            ticks='',
            row=i,
            col=1
        )

    fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)
    fig.update_layout(
        template=template,
        barmode='stack',
        bargap=0.1,
        bargroupgap=0.0,
        title=f"Eye-Tracking: Dwell Time Segments · Item {item_value}",
        height=max(600, height_per_row * n_rows),
        showlegend=True
    )

    if save and plots_dir is not None:
        fig.write_html(plots_dir / f"dwell_times_item_{item_value}.html", include_plotlyjs='cdn')
        try:
            fig.write_image(plots_dir / f"dwell_times_item_{item_value}.png", scale=3)
        except Exception:
            pass

    # fig.show()
    return fig

# Usage
plot_item_dwell(
    item_value='1',
    longform_df=longform,
    et_summary_df=et_df,
    fixations_df=idt_fixations,
    skip_ids=list_of_participants_to_remove,
    template='nord_dark',
    save=True,
    height_per_row=200
)

#### Per participant

In [166]:
def plot_single_participant_dwell(participant_id,
                                  longform_df=None,
                                  et_summary_df=None,
                                  fixations_df=None,
                                  skip_ids=None,
                                  template='nord_dark',
                                  save=True,
                                  height_per_row=220):
    lf = longform_df if longform_df is not None else globals().get('longform')
    et_summary = et_summary_df if et_summary_df is not None else globals().get('et_df')
    fixations = fixations_df if fixations_df is not None else globals().get('idt_fixations')
    plots_dir = globals().get('plots')
    skip_ids = set(skip_ids if skip_ids is not None else globals().get('list_of_participants_to_remove', []))

    if lf is None or et_summary is None or fixations is None:
        raise ValueError("Provide longform_df, et_summary_df and fixations_df or ensure globals exist.")
    if participant_id in skip_ids:
        raise ValueError(f"Participant {participant_id!r} is flagged for removal.")

    lf_participant = lf[lf['participant_id'] == participant_id].copy()
    if lf_participant.empty:
        raise ValueError(f"No trials for participant {participant_id!r}.")

    conditions = list(lf_participant['condition'].dropna().unique())
    if not conditions:
        raise ValueError(f"No conditions available for participant {participant_id!r}.")
    n_rows = max(1, len(conditions))

    fig = make_subplots(
        rows=n_rows,
        cols=1,
        shared_xaxes=True,
        subplot_titles=conditions,
        vertical_spacing=0.08
    )

    facet_yticks = {cond: [] for cond in conditions}
    facet_tick_meta = {cond: {} for cond in conditions}

    fix_plot = (
        fixations
        .copy()
        .loc[fixations['participant_id'] == participant_id]
    )
    fix_plot['AOI'] = fix_plot['AOI'].astype(str).str.lower()
    fix_plot['start_time'] = pd.to_numeric(fix_plot['start_time'], errors='coerce')
    fix_plot['end_time'] = pd.to_numeric(fix_plot['end_time'], errors='coerce')
    fix_plot['duration_ms'] = pd.to_numeric(fix_plot['duration_ms'], errors='coerce')
    fix_plot = fix_plot.dropna(subset=['start_time', 'end_time', 'trial'])
    fix_plot['duration_ms'] = fix_plot['duration_ms'].fillna(fix_plot['end_time'] - fix_plot['start_time'])
    fix_plot = fix_plot[fix_plot['duration_ms'] > 0]
    fix_plot['start_s'] = fix_plot['start_time'] / 1000.0
    fix_plot['duration_s'] = fix_plot['duration_ms'] / 1000.0
    fix_plot['trial_key'] = fix_plot['trial'].astype(str)

    color_map = {
        'left': '#5e81ac',
        'right': '#a3be8c',
        'none': dark[2],
        'neutral': dark[3]
    }
    side_label_map = {
        'left': 'Left',
        'right': 'Right',
        'none': 'None',
        'neutral': 'None'
    }

    sorted_trials = sorted(
        lf_participant['trial'].dropna().unique(),
        key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x
    )
    et_part = et_summary[et_summary['participant_id'] == participant_id]

    for trial in sorted_trials:
        lf_row = lf_participant[lf_participant['trial'] == trial]
        if lf_row.empty:
            continue
        cond = lf_row['condition'].iloc[0]
        if pd.isna(cond) or cond not in conditions:
            continue

        item_label = lf_row['item'].iloc[0] if 'item' in lf_row.columns else trial
        # label on y-axis remains Item + Trial (unchanged)
        label = f"Item {item_label} · Trial {trial}"
        if label not in facet_yticks[cond]:
            facet_yticks[cond].append(label)

        et_row = et_part[et_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        # get the 'no' value from longform if available (fall back to 'NA')
        if 'no' in lf_row.columns and pd.notna(lf_row['no'].iloc[0]):
            no_val = lf_row['no'].iloc[0]
        else:
            no_val = 'NA'

        mismatch_flag = False
        if 'mismatch' in lf_row.columns:
            try:
                mismatch_flag = bool(lf_row['mismatch'].iloc[0])
            except Exception:
                mismatch_flag = False

        facet_tick_meta[cond][label] = {
            'choice': choice if pd.notna(choice) else 'NA',
            'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
            'mismatch': mismatch_flag,
            'no': no_val
        }

        trial_fix = (
            fix_plot[fix_plot['trial_key'] == str(trial)]
            .sort_values(['start_time', 'end_time'])
        )
        if trial_fix.empty:
            continue

        row_idx = conditions.index(cond) + 1
        for _, fix in trial_fix.iterrows():
            aoi = fix['AOI']
            color = color_map.get(aoi, "red")
            side = side_label_map.get(aoi, aoi.title())  # still used for color/logic if needed
            seg_time = float(fix['duration_s'])
            seg_start_s = float(fix['start_s'])

            # customdata now contains 'no' at index 4 instead of side
            customdata = [[
                participant_id,   # 0
                item_label,       # 1
                trial,            # 2
                cond,             # 3
                no_val,           # 4  <- 'no' replaces 'side' here
                seg_start_s,      # 5
                seg_time,         # 6
                dominant or 'N/A',# 7
                choice or 'N/A',  # 8
                'Yes' if mismatch_flag else 'No'  # 9
            ]]
            fig.add_trace(
                go.Bar(
                    x=[seg_time],
                    y=[label],
                    name=side,
                    marker_color=color,
                    orientation='h',
                    showlegend=False,
                    customdata=customdata,
                    hovertemplate=(
                        "<b>%{customdata[0]}</b><br>"
                        "Condition: %{customdata[3]}<br>"
                        "Item: %{customdata[1]}<br>"
                        "Trial: %{customdata[2]}<br>"
                        "No: %{customdata[4]}<br>"                      # replaced Side with No
                        "Segment start: %{customdata[5]:.2f}s<br>"
                        "Segment duration: %{x:.2f}s<br>"
                        "Dominant: %{customdata[7]}<br>"
                        "Choice: %{customdata[8]}<br>"
                        "Mismatch: %{customdata[9]}<extra></extra>"
                    )
                ),
                row=row_idx,
                col=1
            )

    for i, cond in enumerate(conditions, start=1):
        labels = facet_yticks[cond]

        def format_label(lbl):
            meta = facet_tick_meta[cond].get(lbl, {})
            base = f"{meta.get('choice', 'NA')}"
            if meta.get('dominant_matches', False):
                base = f"<span style='color:#88c0d0'>{base}</span>"
            if meta.get('mismatch', False):
                return f"<s>{base}</s>"
            return base

        ticktext = [format_label(t) for t in labels]
        fig.update_yaxes(
            categoryorder='array',
            categoryarray=labels,
            tickvals=labels,
            ticktext=ticktext,
            showgrid=False,
            ticks='',
            row=i,
            col=1
        )

    fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)
    fig.update_layout(
        template=template,
        barmode='stack',
        bargap=0.1,
        bargroupgap=0.0,
        title=f"Eye-Tracking: Dwell Time Segments · Participant {participant_id}",
        height=max(600, height_per_row * n_rows),
        showlegend=True
    )

    if save and plots_dir is not None:
        fig.write_html(plots_dir / f"dwell_times_{participant_id}.html", include_plotlyjs='cdn')
        try:
            fig.write_image(plots_dir / f"dwell_times_{participant_id}.png", scale=3)
        except Exception:
            pass

    return fig

# Usage
plot_single_participant_dwell(
    participant_id='59679c319febf80001d53655',
    longform_df=longform,
    et_summary_df=et_df,
    fixations_df=idt_fixations,
    skip_ids=list_of_participants_to_remove,
    template='nord_dark',
    save=True,
    height_per_row=200
)

### Raw samples

#### All trials

In [167]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import os

transparent_color = 'hsva(0, 0%, 100%, 0)'

# Facet by condition into rows (one condition per row)
conditions = list(longform['condition'].dropna().unique())
n_rows = max(1, len(conditions))

fig = make_subplots(
    rows=n_rows, cols=1,
    shared_xaxes=True,
    subplot_titles=conditions,
    vertical_spacing=0.02
)

# Track per-condition ordering and metadata for categorical y-axes
facet_yticks = {cond: [] for cond in conditions}
facet_tick_meta = {cond: {} for cond in conditions}

for participant in [p for p in et_df['participant_id'].unique() if p not in list_of_participants_to_remove]:
    df_part = et_df[et_df['participant_id'] == participant]
    sorted_trials = sorted(
        df_part['trial'].unique(),
        key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x
    )
    for trial in sorted_trials:
        lf_row = longform[(longform['participant_id'] == participant) & (longform['trial'] == trial)]
        if lf_row.empty:
            continue
        cond = lf_row['condition'].iloc[0]
        if pd.isna(cond) or cond not in conditions:
            continue

        if 'item' in lf_row.columns and pd.notna(lf_row['item'].iloc[0]):
            item_label = lf_row['item'].iloc[0]
        else:
            item_label = trial
        label = f"{participant} - Item {item_label}"
        if label not in facet_yticks[cond]:
            facet_yticks[cond].append(label)

        et_row = df_part[df_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        mismatch_flag = False
        if 'mismatch' in lf_row.columns:
            try:
                mismatch_flag = bool(lf_row['mismatch'].iloc[0])
            except Exception:
                mismatch_flag = False

        facet_tick_meta[cond][label] = {
            'pid_short': str(participant)[:5],
            'choice': choice if pd.notna(choice) else 'NA',
            'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
            'mismatch': mismatch_flag
        }

        et_file = os.path.join(et_dir, f"{participant}.csv")
        if not os.path.exists(et_file):
            continue
        df_et = pd.read_csv(et_file)
        df_et.columns = [c.lower() for c in df_et.columns]
        trial_col = 'trial' if 'trial' in df_et.columns else [c for c in df_et.columns if 'trial' in c][0]
        g = df_et[df_et[trial_col] == trial].copy()
        if g.empty:
            continue

        canvas_state = np.where(
            (g['_left_canvas'] == 1) & (g['_right_canvas'] == 0), 'L',
            np.where(
                (g['_right_canvas'] == 1) & (g['_left_canvas'] == 0), 'R',
                np.where(
                    (g['_left_canvas'] == 0) & (g['_right_canvas'] == 0), 'N', 'B'
                )
            )
        )
        transitions_idx = np.where(canvas_state[1:] != canvas_state[:-1])[0] + 1
        segment_starts = np.concatenate(([0], transitions_idx))
        segment_ends = np.concatenate((transitions_idx, [len(canvas_state)]))
        for seg_start, seg_end in zip(segment_starts, segment_ends):
            seg = g.iloc[seg_start:seg_end]
            if len(seg) > 0:
                start_time = seg['times'].iloc[0]
                end_time = seg['times'].iloc[-1]
                seg_time = (end_time - start_time) / 1000
                seg_start_s = start_time / 1000
            else:
                seg_time = 0
                seg_start_s = np.nan

            if ((seg['_left_canvas'] == 1) & (seg['_right_canvas'] == 0)).all():
                color = '#5e81ac'
                side = 'Left'
            elif ((seg['_right_canvas'] == 1) & (seg['_left_canvas'] == 0)).all():
                color = '#a3be8c'
                side = 'Right'
            elif ((seg['_left_canvas'] == 0) & (seg['_right_canvas'] == 0)).all():
                color = dark[3]
                side = 'None'
            else:
                color = "red"
                side = 'Both'

            row_idx = conditions.index(cond) + 1
            customdata = [[
                participant,
                item_label,
                trial,
                cond,
                side,
                seg_start_s,
                seg_time,
                dominant or 'N/A',
                choice or 'N/A',
                'Yes' if mismatch_flag else 'No'
            ]]
            fig.add_trace(
                go.Bar(
                    x=[seg_time],
                    y=[label],
                    name=side,
                    marker_color=color,
                    orientation='h',
                    showlegend=False,
                    customdata=customdata,
                    hovertemplate=(
                        "<b>%{customdata[0]}</b><br>"
                        "Condition: %{customdata[3]}<br>"
                        "Item: %{customdata[1]}<br>"
                        "Trial: %{customdata[2]}<br>"
                        "Side: %{customdata[4]}<br>"
                        "Segment start: %{customdata[5]:.2f}s<br>"
                        "Segment duration: %{x:.2f}s<br>"
                        "Dominant: %{customdata[7]}<br>"
                        "Choice: %{customdata[8]}<br>"
                        "Mismatch: %{customdata[9]}<extra></extra>"
                    )
                ),
                row=row_idx, col=1
            )

# Format y-axis ticks per condition
for i, cond in enumerate(conditions, start=1):
    labels = facet_yticks[cond]

    def format_label(lbl):
        meta = facet_tick_meta[cond].get(lbl, {})
        pid_short = meta.get('pid_short', 'NA')
        choice_val = meta.get('choice', 'NA')
        base = f"{pid_short} · {choice_val}"
        bold = meta.get('dominant_matches', False)
        mismatch = meta.get('mismatch', False)

        if bold:
            base = f"<span style='color:#88c0d0'>{base}</span>"
        if mismatch:
            inner = f"<s>{base}</s>"
        else:
            inner = base
        return inner

    ticktext = [format_label(t) for t in labels]
    fig.update_yaxes(
        categoryorder='array',
        categoryarray=labels,
        tickvals=labels,
        ticktext=ticktext,
        showgrid=False,
        ticks='',
        row=i, col=1
    )

fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)

fig.update_layout(
    template='nord_dark',
    barmode='stack',
    bargap=0.1,
    bargroupgap=0.0,
    title='Eye-Tracking: Stacked Dwell Time Segments (Transitions)',
    height=max(4400, 400 * n_rows),
    # width=1000,
    showlegend=True
)

fig.show()

fig.write_html(plots / "raw_dwell_times.html", include_plotlyjs='cdn')
fig.write_image(plots / "raw_dwell_times.png", scale=3)

#### Per item

In [168]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import os

transparent_color = 'hsva(0, 0%, 100%, 0)'

# Facet by condition into rows (one condition per row)
conditions = list(longform['condition'].dropna().unique())
n_rows = max(1, len(conditions))

fig = make_subplots(
    rows=n_rows, cols=1,
    shared_xaxes=True,
    subplot_titles=conditions,
    vertical_spacing=0.1
)

# Track per-condition ordering and metadata for categorical y-axes
facet_yticks = {cond: [] for cond in conditions}
facet_tick_meta = {cond: {} for cond in conditions}

for participant in [p for p in et_df['participant_id'].unique() if p not in list_of_participants_to_remove]:
    df_part = et_df[et_df['participant_id'] == participant]
    sorted_trials = sorted(
        df_part['trial'].unique(),
        key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x
    )
    for trial in sorted_trials:
        lf_row = longform[(longform['participant_id'] == participant) & (longform['trial'] == trial)]
        if lf_row.empty:
            continue
        cond = lf_row['condition'].iloc[0]
        if pd.isna(cond) or cond not in conditions:
            continue

        item_value = lf_row['item'].iloc[0] if 'item' in lf_row.columns else None
        try:
            item_numeric = float(item_value)
        except (TypeError, ValueError):
            item_numeric = None
        if item_numeric != 1:
            continue
        item_label = item_value

        label = f"{participant} - Item {item_label}"
        if label not in facet_yticks[cond]:
            facet_yticks[cond].append(label)

        et_row = df_part[df_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        mismatch_flag = False
        if 'mismatch' in lf_row.columns:
            try:
                mismatch_flag = bool(lf_row['mismatch'].iloc[0])
            except Exception:
                mismatch_flag = False

        facet_tick_meta[cond][label] = {
            'pid_short': str(participant)[:5],
            'choice': choice if pd.notna(choice) else 'NA',
            'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
            'mismatch': mismatch_flag
        }

        et_file = os.path.join(et_dir, f"{participant}.csv")
        if not os.path.exists(et_file):
            continue
        df_et = pd.read_csv(et_file)
        df_et.columns = [c.lower() for c in df_et.columns]
        trial_col = 'trial' if 'trial' in df_et.columns else [c for c in df_et.columns if 'trial' in c][0]
        g = df_et[df_et[trial_col] == trial].copy()
        if g.empty:
            continue

        canvas_state = np.where(
            (g['_left_canvas'] == 1) & (g['_right_canvas'] == 0), 'L',
            np.where(
                (g['_right_canvas'] == 1) & (g['_left_canvas'] == 0), 'R',
                np.where(
                    (g['_left_canvas'] == 0) & (g['_right_canvas'] == 0), 'N', 'B'
                )
            )
        )
        transitions_idx = np.where(canvas_state[1:] != canvas_state[:-1])[0] + 1
        segment_starts = np.concatenate(([0], transitions_idx))
        segment_ends = np.concatenate((transitions_idx, [len(canvas_state)]))
        for seg_start, seg_end in zip(segment_starts, segment_ends):
            seg = g.iloc[seg_start:seg_end]
            if len(seg) > 0:
                start_time = seg['times'].iloc[0]
                end_time = seg['times'].iloc[-1]
                seg_time = (end_time - start_time) / 1000
                seg_start_s = start_time / 1000
            else:
                seg_time = 0
                seg_start_s = np.nan

            if ((seg['_left_canvas'] == 1) & (seg['_right_canvas'] == 0)).all():
                color = '#5e81ac'
                side = 'Left'
            elif ((seg['_right_canvas'] == 1) & (seg['_left_canvas'] == 0)).all():
                color = '#a3be8c'
                side = 'Right'
            elif ((seg['_left_canvas'] == 0) & (seg['_right_canvas'] == 0)).all():
                color = dark[3]
                side = 'None'
            else:
                color = "red"
                side = 'Both'

            row_idx = conditions.index(cond) + 1
            customdata = [[
                participant,
                item_label,
                trial,
                cond,
                side,
                seg_start_s,
                seg_time,
                dominant or 'N/A',
                choice or 'N/A',
                'Yes' if mismatch_flag else 'No'
            ]]
            fig.add_trace(
                go.Bar(
                    x=[seg_time],
                    y=[label],
                    name=side,
                    marker_color=color,
                    orientation='h',
                    showlegend=False,
                    customdata=customdata,
                    hovertemplate=(
                        "<b>%{customdata[0]}</b><br>"
                        "Condition: %{customdata[3]}<br>"
                        "Item: %{customdata[1]}<br>"
                        "Trial: %{customdata[2]}<br>"
                        "Side: %{customdata[4]}<br>"
                        "Segment start: %{customdata[5]:.2f}s<br>"
                        "Segment duration: %{x:.2f}s<br>"
                        "Dominant: %{customdata[7]}<br>"
                        "Choice: %{customdata[8]}<br>"
                        "Mismatch: %{customdata[9]}<extra></extra>"
                    )
                ),
                row=row_idx, col=1
            )

# Format y-axis ticks per condition
for i, cond in enumerate(conditions, start=1):
    labels = facet_yticks[cond]

    def format_label(lbl):
        meta = facet_tick_meta[cond].get(lbl, {})
        pid_short = meta.get('pid_short', 'NA')
        choice_val = meta.get('choice', 'NA')
        base = f"{pid_short} · {choice_val}"
        bold = meta.get('dominant_matches', False)
        mismatch = meta.get('mismatch', False)

        if bold:
            base = f"<span style='color:#88c0d0'>{base}</span>"
        if mismatch:
            inner = f"<s>{base}</s>"
        else:
            inner = base
        return inner

    ticktext = [format_label(t) for t in labels]
    fig.update_yaxes(
        categoryorder='array',
        categoryarray=labels,
        tickvals=labels,
        ticktext=ticktext,
        showgrid=False,
        ticks='',
        row=i, col=1
    )

fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)

fig.update_layout(
    template='nord_dark',
    barmode='stack',
    bargap=0.1,
    bargroupgap=0.0,
    title='Eye-Tracking: Stacked Dwell Time Segments (Transitions) per Item',
    # height=max(400, 100 * n_rows),
    width=600,
    height=600,
    showlegend=True
)

fig.show()

fig.write_html(plots / "raw_dwell_times_per_item.html", include_plotlyjs='cdn')
fig.write_image(plots / "raw_dwell_times_per_item.png", scale=3)

#### Per participant

In [169]:
def plot_participant_dwell(participant_id,
                           longform_df=None,
                           et_df_df=None,
                           et_dir_path=None,
                           plots_path=None,
                           transparent_color='hsva(0, 0%, 100%, 0)'):
    """
    Plot stacked dwell-time segments for a single participant and save results.
    Returns the Plotly Figure.
    """
    # fall back to globals if not provided
    lf = longform_df if longform_df is not None else globals().get('longform')
    et = et_df_df if et_df_df is not None else globals().get('et_df')
    et_dir_local = et_dir_path if et_dir_path is not None else globals().get('et_dir')
    plots_dir = plots_path if plots_path is not None else globals().get('plots')

    if lf is None or et is None or et_dir_local is None or plots_dir is None:
        raise ValueError("Provide longform_df, et_df_df, et_dir_path and plots_path or ensure globals exist.")

    conditions = list(lf['condition'].dropna().unique())
    n_rows = max(1, len(conditions))

    fig = make_subplots(rows=n_rows, cols=1, shared_xaxes=True, subplot_titles=conditions, vertical_spacing=0.1)

    facet_yticks = {cond: [] for cond in conditions}
    facet_tick_meta = {cond: {} for cond in conditions}

    participants_available = [p for p in et['participant_id'].unique() if p not in globals().get('list_of_participants_to_remove', [])]
    if participant_id not in participants_available:
        raise ValueError(f"Participant id {participant_id!r} not available. Available ids: {participants_available[:10]}")

    df_part = et[et['participant_id'] == participant_id]
    sorted_trials = sorted(df_part['trial'].unique(), key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x)

    for trial in sorted_trials:
        lf_row = lf[(lf['participant_id'] == participant_id) & (lf['trial'] == trial)]
        if lf_row.empty:
            continue
        cond = lf_row['condition'].iloc[0]
        if pd.isna(cond) or cond not in conditions:
            continue

        item_label = lf_row['item'].iloc[0] if ('item' in lf_row.columns and pd.notna(lf_row['item'].iloc[0])) else trial
        label = f"{participant_id} - Item {item_label}"
        if label not in facet_yticks[cond]:
            facet_yticks[cond].append(label)

        et_row = df_part[df_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        mismatch_flag = False
        if 'mismatch' in lf_row.columns:
            try:
                mismatch_flag = bool(lf_row['mismatch'].iloc[0])
            except Exception:
                mismatch_flag = False

        facet_tick_meta[cond][label] = {
            'pid_short': str(participant_id)[:5],
            'choice': choice if pd.notna(choice) else 'NA',
            'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
            'mismatch': mismatch_flag
        }

        et_file = os.path.join(et_dir_local, f"{participant_id}.csv")
        if not os.path.exists(et_file):
            continue
        df_et = pd.read_csv(et_file)
        df_et.columns = [c.lower() for c in df_et.columns]
        trial_col = 'trial' if 'trial' in df_et.columns else [c for c in df_et.columns if 'trial' in c][0]
        g = df_et[df_et[trial_col] == trial].copy()
        if g.empty:
            continue

        canvas_state = np.where(
            (g['_left_canvas'] == 1) & (g['_right_canvas'] == 0), 'L',
            np.where(
                (g['_right_canvas'] == 1) & (g['_left_canvas'] == 0), 'R',
                np.where(
                    (g['_left_canvas'] == 0) & (g['_right_canvas'] == 0), 'N', 'B'
                )
            )
        )
        transitions_idx = np.where(canvas_state[1:] != canvas_state[:-1])[0] + 1
        segment_starts = np.concatenate(([0], transitions_idx))
        segment_ends = np.concatenate((transitions_idx, [len(canvas_state)]))
        for seg_start, seg_end in zip(segment_starts, segment_ends):
            seg = g.iloc[seg_start:seg_end]
            if len(seg) > 0:
                start_time = seg['times'].iloc[0]
                end_time = seg['times'].iloc[-1]
                seg_time = (end_time - start_time) / 1000
                seg_start_s = start_time / 1000
            else:
                seg_time = 0
                seg_start_s = np.nan

            if ((seg['_left_canvas'] == 1) & (seg['_right_canvas'] == 0)).all():
                color = '#5e81ac'
                side = 'Left'
            elif ((seg['_right_canvas'] == 1) & (seg['_left_canvas'] == 0)).all():
                color = '#a3be8c'
                side = 'Right'
            elif ((seg['_left_canvas'] == 0) & (seg['_right_canvas'] == 0)).all():
                color = dark[3]
                side = 'None'
            else:
                color = "red"
                side = 'Both'

            row_idx = conditions.index(cond) + 1
            customdata = [[
                participant_id,
                item_label,
                trial,
                cond,
                side,
                seg_start_s,
                seg_time,
                dominant or 'N/A',
                choice or 'N/A',
                'Yes' if mismatch_flag else 'No'
            ]]
            fig.add_trace(
                go.Bar(
                    x=[seg_time],
                    y=[label],
                    name=side,
                    marker_color=color,
                    orientation='h',
                    showlegend=False,
                    customdata=customdata,
                    hovertemplate=(
                        "<b>%{customdata[0]}</b><br>"
                        "Condition: %{customdata[3]}<br>"
                        "Item: %{customdata[1]}<br>"
                        "Trial: %{customdata[2]}<br>"
                        "Side: %{customdata[4]}<br>"
                        "Segment start: %{customdata[5]:.2f}s<br>"
                        "Segment duration: %{x:.2f}s<br>"
                        "Dominant: %{customdata[7]}<br>"
                        "Choice: %{customdata[8]}<br>"
                        "Mismatch: %{customdata[9]}<extra></extra>"
                    )
                ),
                row=row_idx, col=1
            )

    # Format y-axis ticks per condition
    for i, cond in enumerate(conditions, start=1):
        labels = facet_yticks[cond]

        def format_label(lbl):
            meta = facet_tick_meta[cond].get(lbl, {})
            pid_short = meta.get('pid_short', 'NA')
            choice_val = meta.get('choice', 'NA')
            base = f"{pid_short} · {choice_val}"
            bold = meta.get('dominant_matches', False)
            mismatch = meta.get('mismatch', False)

            if bold:
                base = f"<span style='color:#88c0d0'>{base}</span>"
            if mismatch:
                inner = f"<s>{base}</s>"
            else:
                inner = base
            return inner

        ticktext = [format_label(t) for t in labels]
        fig.update_yaxes(
            categoryorder='array',
            categoryarray=labels,
            tickvals=labels,
            ticktext=ticktext,
            showgrid=False,
            ticks='',
            row=i, col=1
        )

    fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)

    fig.update_layout(
        template='nord_dark',
        barmode='stack',
        bargap=0.1,
        bargroupgap=0.0,
        title=f'Eye-Tracking: Stacked Dwell Time Segments (Participant {participant_id})',
        # height=max(400, 150 * n_rows),
        height = 400, 
        width = 600,
        showlegend=True
    )

    # show and save
    fig.show()
    out_html = plots_dir / f"raw_dwell_times_{participant_id}.html"
    out_png = plots_dir / f"raw_dwell_times_{participant_id}.png"
    fig.write_html(out_html, include_plotlyjs='cdn')
    try:
        fig.write_image(out_png, scale=3)
    except Exception:
        # image export may require kaleido installed
        pass

    return fig

# Example usage:
fig = plot_participant_dwell('59679c319febf80001d53655')