# Parse PCIbex results

This notebook parses the messy results file `results_dev.csv` and produces an event-level and longform DataFrame with:

- participant_id, group, label (block), no, item, condition, cb, left, right
- Self-paced reading region events (r1..r7) with per-region RTs, including mean and std per participant, item, condition, region question RTs and choice selection (side + RT)
- Eye-tracker events

In [1]:
# Imports and paths
import re
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path(r"c:\\Users\\parti\\Projects\\pcibex-hun")
raw_file = ROOT / "results_prod.csv"
plots = ROOT / "plots"
plots.mkdir(exist_ok=True)

print("Will parse:", raw_file)

Will parse: c:\Users\parti\Projects\pcibex-hun\results_prod.csv


In [2]:
# Read file, split comments vs data
raw_lines = raw_file.read_text(encoding="utf-8").splitlines()

# Replace "6108da57e362f96a3ee32a88" with "something" if line number is <4000
for i, line in enumerate(raw_lines):
    if i > 5000:
        raw_lines[i] = line.replace("6108da57e362f96a3ee32a88", "6108da57e362f96a3ee32a88_2")

for i, line in enumerate(raw_lines):
    if i > 5000:
        raw_lines[i] = line.replace("5dade76a4860f70017f70ec5", "5dade76a4860f70017f70ec5_2")

header_comments = []
rows = []
for line in raw_lines:
    if line.startswith("#"):
        header_comments.append(line)
    elif line.strip():
        rows.append(line)

print(f"Comment lines: {len(header_comments)} | Data rows: {len(rows)}")
rows[:3]

Comment lines: 1536 | Data rows: 24068


['1758558839,5ea96059c7f8c9bb71d4ecdb67ea496e,PennController,0,0,welcome,NULL,PennController,0,_Trial_,Start,1758556961557,prolific_id,NULL',
 '1758558839,5ea96059c7f8c9bb71d4ecdb67ea496e,PennController,0,0,welcome,NULL,PennController,0,_Header_,Start,1758556961557,prolific_id,NULL',
 '1758558839,5ea96059c7f8c9bb71d4ecdb67ea496e,PennController,0,0,welcome,NULL,PennController,0,_Header_,End,1758556961557,prolific_id,NULL']

## Events

In [3]:
# Define the base schema described by comments just before each block
base_cols = [
    "ResultsTime", "MD5", "Controller", "Order", "Inner", "Label",
    "LatinSquare", "PennElementType", "PennElementName", "Parameter",
    "Value", "EventTime", "prolific_id",
]

def extract_trial_fields_from_comments(comments):
    # We expect something like a line that mentions these names.
    allowed = ["participant_id", "group", "no", "item", "exp", "condition", "cb", "left", "right", "target"]
    # accept common variants (participant-id, participant id, participantid, prolific_id etc.)
    pattern = re.compile(r"(prolific[_\-\s]?id|participant[_\-\s]?id|participantid|group|no|item|exp|condition|cb|left|right|target)", re.I)

    best_line = None
    max_hits = 0
    # prefer the comment line that contains the most known tokens
    for c in comments:
        hits = len(pattern.findall(c))
        if hits > max_hits:
            best_line = c
            max_hits = hits

    # if we found a useful header line, try to parse explicit comma-separated tokens after a colon
    if best_line and max_hits >= 6:
        text = best_line.split(":", 1)[-1]
        tokens = [t.strip() for t in re.split(r"[,\t]+", text) if t.strip()]

        def norm(s):
            s2 = re.sub(r"[^A-Za-z0-9_]", "", s).lower()
            if s2 == "participantid":
                s2 = "participant_id"
            if s2 == "prolificid":
                s2 = "prolific_id"
            return s2

        names = []
        for t in tokens:
            n = norm(t)
            if n in allowed and n not in names:
                names.append(n)

        # fallback: if explicit tokens not present, use the order of regex matches in the line
        if not names:
            for m in pattern.finditer(best_line):
                n = norm(m.group(0))
                if n in allowed and n not in names:
                    names.append(n)

        # Preserve canonical order relative to allowed list
        ordered = [name for name in allowed if name in names]
        # accept extraction if we found at least participant_id + several others
        if "participant_id" in ordered and len(ordered) >= 6:
            return ordered

    # Final fallback: canonical full trial ordering (updated to include exp and target)
    return ["participant_id", "group", "no", "item", "exp", "condition", "cb", "left", "right", "target"]

TRIAL_FIELDS = extract_trial_fields_from_comments(header_comments)
print("Using trial fields:", TRIAL_FIELDS)

# Parse each data row into a record using base_cols + TRIAL_FIELDS depending on Label

def parse_row_to_record(line: str):
    parts = [p.strip() for p in line.split(",")]
    rec = {}
    base_vals = parts[:len(base_cols)]
    extra_vals = parts[len(base_cols):]

    for k, v in zip(base_cols, base_vals):
        rec[k] = v

    # If the extra values accidentally include a literal header token as the first element
    # (some CSV dumps include the header name), drop it so alignment works.
    if extra_vals and str(extra_vals[0]).lower() in {f.lower() for f in TRIAL_FIELDS + ["prolific_id"]}:
        # drop single leading header-like token
        extra_vals = extra_vals[1:]

    label = rec.get("Label")
    if label in ("practice", "experiment"):
        n = min(len(extra_vals), len(TRIAL_FIELDS))
        for k, v in zip(TRIAL_FIELDS[:n], extra_vals[:n]):
            rec[k] = v
    elif label in ("participant_data",):
        if extra_vals:
            # participant_data frequently stores id in first extra token (after possible header token)
            rec["participant_id"] = extra_vals[0]

    return rec

records = [parse_row_to_record(l) for l in rows]
raw_df = pd.DataFrame.from_records(records)

# Cast some known numeric columns where possible
for c in ["ResultsTime", "Order", "Inner", "EventTime", "no", "item"]:
    if c in raw_df.columns:
        raw_df[c] = pd.to_numeric(raw_df[c], errors="coerce")

# Human-readable timestamps
raw_df["results_time"] = pd.to_datetime(raw_df["ResultsTime"], unit="s", utc=True)

# IMPORTANT: Keep EventTime both as numeric milliseconds and as a proper timestamp (UTC)
# - EventTime_ms: numeric milliseconds for computations (diffs, means)
# - event_time: Pandas Timestamp (UTC) for readability
raw_df["event_time_ms"] = raw_df["EventTime"]
raw_df["event_time"] = pd.to_datetime(raw_df["event_time_ms"], unit="ms", utc=True)

################################################

# Helper: derive per-trial fields now that we named extras explicitly in raw_df
# We keep a light decoder only to ensure participant_id is present and to normalize types.

EXPECTED_FIELDS = ["participant_id", "group", "no", "item", "exp", "condition", "cb", "left", "right", "target"]

# Make sure that item and no are integers
for col in ["no", "item"]:
    if col in raw_df.columns:
        # Use pandas nullable Int64 type to keep NaNs and force integer dtype
        raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce").astype('Int64')
        
# Ensure all expected columns exist even if missing in some rows
for col in EXPECTED_FIELDS:
    if col not in raw_df.columns:
        raw_df[col] = None

# Build df and forward fill participant_id/group as before
df = raw_df.copy()

# Forward-fill participant_id only within blocks of identical participant_id (no cross-over)
# (keep as-is: do not fill across participants)
if "participant_id" in df.columns:
    # nothing to do beyond keeping column present; participant ids should appear in rows where provided

    # If you need to forward-fill within contiguous blocks uncomment:
    # df['participant_id'] = df['participant_id'].ffill()

    pass

# Backward-fill group only within blocks of identical participant_id (no cross-over)
if "group" in df.columns and "participant_id" in df.columns:
    # normalize empty/NULL then backfill per participant using transform to avoid groupby.apply deprecation
    df["group"] = df["group"].replace({"": None, "NULL": None})
    df["group"] = df.groupby("participant_id")["group"].transform(lambda s: s.bfill())

# Derive block-type flags
# df["is_practice"] = df["Label"].eq("practice")
# df["is_experiment"] = df["Label"].eq("experiment")

# For convenience: also include a local-time copy if desired (commented)
# df["results_timestamp_local"] = df["results_timestamp"].dt.tz_convert("Europe/Budapest")

# Drop Controller column
df.drop(columns=["ResultsTime", "Controller", "Inner", "LatinSquare", "EventTime", "prolific_id"], inplace=True)


# Drop rows where the Parameter column starts and ends with '_'
df = df[~df["Parameter"].str.match(r"^_.*_$", na=False)]

# Remove rows where PennElementType is "Canvas"
df = df[df["PennElementType"] != "Canvas"]

# Add simple elapsed time between events: current row EventTime_ms minus previous row EventTime_ms
if 'elapsed_ms' in df.columns:
    df.drop(columns=['elapsed_ms'], inplace=True)
df['elapsed_ms'] = df['event_time_ms'].diff()
df.drop(columns=['event_time_ms'], inplace=True)

# Keep only rows where label is practice or experiment
df = df[df["Label"].isin(["practice", "experiment"])]

# In Values column rename right_canvas_practice to right_canvas
df["Value"] = df["Value"].replace({"right_canvas_practice": "right_canvas"})
df["Value"] = df["Value"].replace({"left_canvas_practice": "left_canvas"})

# Rename Label to label and Order to order
df.rename(columns={"Label": "label", "Order": "trial"}, inplace=True)

# Fix mistakes in stimuli data ##########################
# Change condition of sentences no 74 to other-directed-x
df.loc[df['no'] == 74, 'condition'] = 'other-directed-x'

# Save df as events.csv
df.to_csv("events.csv", index=False)
df

Using trial fields: ['participant_id', 'group', 'no', 'item', 'exp', 'condition', 'cb', 'left', 'right', 'target']


Unnamed: 0,MD5,trial,label,PennElementType,PennElementName,Parameter,Value,participant_id,group,no,item,exp,condition,cb,left,right,target,results_time,event_time,elapsed_ms
28,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,EyeTracker,tracker,calibration,68,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:00.501000+00:00,40689.0
29,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,EyeTracker,tracker,Filename,httpsfarmpcibexnetpzCPVqO/e48687db-1afc-905a-9...,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:26.157000+00:00,25656.0
31,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,Key,r0,PressedKey,,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:08.205000+00:00,-17952.0
32,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,Key,r1,PressedKey,,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:10.936000+00:00,2731.0
33,5ea96059c7f8c9bb71d4ecdb67ea496e,6,practice,Key,r2,PressedKey,,60d26e7cd9f0761e4d12b9f8,a,990,1,,practice,n,A,B,A,2025-09-22 16:33:59+00:00,2025-09-22 16:05:12.758000+00:00,1822.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24062,1a37521c35a559b6548b295a43bd9f1b,50,experiment,Key,r4,PressedKey,,60ed917a71051ceb8ec151c9,b,101,30,2,self-directed-x,n,diák,tanár,tanár,2025-10-10 23:29:57+00:00,2025-10-10 23:29:50.204000+00:00,529.0
24063,1a37521c35a559b6548b295a43bd9f1b,50,experiment,Key,r5,PressedKey,,60ed917a71051ceb8ec151c9,b,101,30,2,self-directed-x,n,diák,tanár,tanár,2025-10-10 23:29:57+00:00,2025-10-10 23:29:50.794000+00:00,590.0
24064,1a37521c35a559b6548b295a43bd9f1b,50,experiment,Key,r7,PressedKey,,60ed917a71051ceb8ec151c9,b,101,30,2,self-directed-x,n,diák,tanár,tanár,2025-10-10 23:29:57+00:00,2025-10-10 23:29:51.280000+00:00,486.0
24065,1a37521c35a559b6548b295a43bd9f1b,50,experiment,Key,question,PressedKey,,60ed917a71051ceb8ec151c9,b,101,30,2,self-directed-x,n,diák,tanár,tanár,2025-10-10 23:29:57+00:00,2025-10-10 23:29:53.933000+00:00,2653.0


## Participants

In [4]:
def extract_participant_info(events_df, fallback_df=None):
    # Helper: return first non-empty normalized value for a column
    def pick_first(df_like, col):
        if df_like is None or col not in df_like.columns:
            return None
        s = df_like[col].replace({"": None, "NULL": None}).dropna().astype(str)
        return s.iloc[0] if len(s) > 0 else None

    # Extract participant_id, group, results_time (prefer events_df, fallback to fallback_df)
    pid = pick_first(events_df, "participant_id") or pick_first(fallback_df, "participant_id")
    group = pick_first(events_df, "group") or pick_first(fallback_df, "group")
    results_time = pick_first(events_df, "results_time") or pick_first(fallback_df, "results_time")

    # Exact requirement: Value where PennElementType == 'EyeTracker' and Parameter == 'Filename'
    def find_et_filename(df_like):
        if df_like is None or not {"PennElementType", "Parameter", "Value"}.issubset(df_like.columns):
            return None
        pet = df_like["PennElementType"].astype(str).str.lower()
        par = df_like["Parameter"].astype(str).str.lower()
        mask = pet.eq("eyetracker") & par.eq("filename")
        vals = df_like.loc[mask, "Value"].replace({"": None}).dropna().astype(str)
        return vals.iloc[0] if len(vals) > 0 else None

    et = find_et_filename(events_df) or find_et_filename(fallback_df)

    return pd.DataFrame([
        {
            "participant_id": pid,
            "group": group,
            "eyetracker_filename": et,
            "results_time": results_time,
        }
    ])

# Build participants_df for all participants (uses raw_df as fallback to find filename)
parts = []
df_nonnull = df[df['participant_id'].notna()]
for pid, g in df_nonnull.groupby('participant_id', sort=False):
    parts.append(extract_participant_info(g, fallback_df=raw_df))
participants_df = pd.concat(parts, ignore_index=True)
participants_df

Unnamed: 0,participant_id,group,eyetracker_filename,results_time
0,60d26e7cd9f0761e4d12b9f8,a,httpsfarmpcibexnetpzCPVqO/e48687db-1afc-905a-9...,2025-09-22 16:33:59+00:00
1,6108da57e362f96a3ee32a88,b,httpsfarmpcibexnetpzCPVqO/9f995578-7c21-5dd7-3...,2025-09-22 17:12:07+00:00
2,5dade76a4860f70017f70ec5,c,httpsfarmpcibexnetpzCPVqO/1a69c7d4-6bc1-013f-8...,2025-09-22 18:23:58+00:00
3,6151b07ac0d164fdd7e53100,a,httpsfarmpcibexnetpzCPVqO/385ec604-d53c-5512-e...,2025-09-24 17:17:21+00:00
4,5f4c042383588080d02e61a3,b,httpsfarmpcibexnetpzCPVqO/7477329b-3787-b3ba-8...,2025-09-24 19:43:41+00:00
5,5f338ba6ea047119dbd6e49e,c,httpsfarmpcibexnetpzCPVqO/329133cc-4271-bc2c-5...,2025-09-25 06:25:58+00:00
6,5e3b29dc87243b34bde5abfa,a,httpsfarmpcibexnetpzCPVqO/c82d96bc-20aa-db20-9...,2025-09-25 08:49:56+00:00
7,6154d933a58bf7bcd9e81fed,b,httpsfarmpcibexnetpzCPVqO/7aa9138e-dc0b-fb8e-b...,2025-09-25 09:33:24+00:00
8,5d4fe6a2ffbcf800019d5e54,c,httpsfarmpcibexnetpzCPVqO/b0598184-0810-4d35-b...,2025-09-25 17:42:12+00:00
9,60cb4cd6477b2ff7c1adaea4,a,httpsfarmpcibexnetpzCPVqO/445006f5-4422-6c8b-6...,2025-09-25 21:45:59+00:00


In [5]:
# Remove certain participants
# No ET data: 641379405684937e6fad9f1b
# Too long times: 642b35c70771761602e9c3ae
# Duplicate test: 5dade76a4860f70017f70ec5_2
list_of_participants_to_remove = ['642b35c70771761602e9c3ae', '641379405684937e6fad9f1b', '5dade76a4860f70017f70ec5_2', 'parti_test_01'] ####################################################################
participants_df = participants_df[~participants_df['participant_id'].isin(list_of_participants_to_remove)]
df = df[~df['participant_id'].isin(list_of_participants_to_remove)]

participants_df

Unnamed: 0,participant_id,group,eyetracker_filename,results_time
0,60d26e7cd9f0761e4d12b9f8,a,httpsfarmpcibexnetpzCPVqO/e48687db-1afc-905a-9...,2025-09-22 16:33:59+00:00
1,6108da57e362f96a3ee32a88,b,httpsfarmpcibexnetpzCPVqO/9f995578-7c21-5dd7-3...,2025-09-22 17:12:07+00:00
2,5dade76a4860f70017f70ec5,c,httpsfarmpcibexnetpzCPVqO/1a69c7d4-6bc1-013f-8...,2025-09-22 18:23:58+00:00
3,6151b07ac0d164fdd7e53100,a,httpsfarmpcibexnetpzCPVqO/385ec604-d53c-5512-e...,2025-09-24 17:17:21+00:00
4,5f4c042383588080d02e61a3,b,httpsfarmpcibexnetpzCPVqO/7477329b-3787-b3ba-8...,2025-09-24 19:43:41+00:00
5,5f338ba6ea047119dbd6e49e,c,httpsfarmpcibexnetpzCPVqO/329133cc-4271-bc2c-5...,2025-09-25 06:25:58+00:00
6,5e3b29dc87243b34bde5abfa,a,httpsfarmpcibexnetpzCPVqO/c82d96bc-20aa-db20-9...,2025-09-25 08:49:56+00:00
7,6154d933a58bf7bcd9e81fed,b,httpsfarmpcibexnetpzCPVqO/7aa9138e-dc0b-fb8e-b...,2025-09-25 09:33:24+00:00
8,5d4fe6a2ffbcf800019d5e54,c,httpsfarmpcibexnetpzCPVqO/b0598184-0810-4d35-b...,2025-09-25 17:42:12+00:00
9,60cb4cd6477b2ff7c1adaea4,a,httpsfarmpcibexnetpzCPVqO/445006f5-4422-6c8b-6...,2025-09-25 21:45:59+00:00


In [6]:
# Value counts for group
participants_df['group'].value_counts(dropna=False)

group
a    10
b     9
c     9
Name: count, dtype: int64

## Longform events

In [7]:
# Filter for experiment/practice trials only
df_trials = df[df['label'].isin(['experiment', 'practice'])].copy()

# Sort for deterministic grouping
df_trials = df_trials.sort_values(['participant_id', 'no', 'item'])

# Region RTs: r1..r7
region_names = [f"r{i}" for i in range(1, 8)]
is_region = df_trials['PennElementName'].str.lower().isin(region_names) & df_trials['Parameter'].str.lower().eq('pressedkey')
regions = df_trials[is_region].copy()
regions['region_idx'] = regions['PennElementName'].str.extract(r'r(\d)')[0].astype(int)

# Build a full trial index to ensure all trials are present
# NOTE: added 'exp' and 'target' to match new appended columns ordering
trial_index_cols = ['results_time', 'participant_id', 'group', 'trial', 'label', 'no', 'item', 'exp', 'condition', 'cb', 'left', 'right', 'target']
# Ensure trial_index_cols exist in df_trials before selecting (EXPECTED_FIELDS earlier guarantees presence)
trial_index = df_trials.drop_duplicates(subset=[c for c in trial_index_cols if c in df_trials.columns])[[c for c in trial_index_cols if c in df_trials.columns]].sort_values([c for c in trial_index_cols if c in df_trials.columns])

# Compute region RTs per trial (item) using average of elapsed_ms
region_mean = regions.pivot_table(
    index=[c for c in trial_index_cols if c in regions.columns],
    columns='region_idx',
    values='elapsed_ms',
    aggfunc='mean',
    fill_value=np.nan
)

# Reindex to include any trials that had no region rows
region_mean = region_mean.reindex(trial_index.set_index([c for c in trial_index_cols if c in trial_index.columns]).index, fill_value=np.nan)

# Rename numeric region columns to r1..r7
region_mean = region_mean.rename(columns={i: f"r{int(i)}" for i in region_mean.columns})

# Question RT: average elapsed_ms for question keypress
is_question = df_trials['PennElementName'].str.lower().eq('question') & df_trials['Parameter'].str.lower().eq('pressedkey')
questions = df_trials[is_question].copy()
questions = questions.sort_values(['participant_id', 'no', 'item'])
questions['question_rt'] = questions['elapsed_ms']
question_rt = questions.groupby([c for c in trial_index_cols if c in questions.columns])['question_rt'].mean()

# Choice RT and value: average elapsed_ms for choice selection
is_choice = df_trials['PennElementType'].str.lower().eq('selector') & df_trials['Parameter'].str.lower().eq('selection')
choices = df_trials[is_choice].copy()
choices = choices.sort_values(['MD5', 'participant_id', 'no', 'item'])
choices['choice_rt'] = choices['elapsed_ms']
choices_value = choices.groupby([c for c in trial_index_cols if c in choices.columns])['Value'].first()
choice_rt = choices.groupby([c for c in trial_index_cols if c in choices.columns])['choice_rt'].mean()

# Assemble longform DataFrame
longform = region_mean.copy()

longform['question_rt'] = question_rt
longform['choice_rt'] = choice_rt
longform['choice'] = choices_value

# Rename choice values from left_canvas/right_canvas to left/right
longform['choice'] = longform['choice'].replace({"left_canvas": "left", "right_canvas": "right"})

# Subtract fixed offsets (keep units consistent with elapsed_ms)
longform['question_rt'] = longform['question_rt'] - 1000
longform['choice_rt'] = longform['choice_rt'] - 2000

# Sort by participant and trial for deterministic ordering
# If 'trial' missing in index columns fallback to sorting by ['participant_id', 'no', 'item']
sort_cols = ['participant_id', 'trial'] if 'trial' in longform.index.names else ['participant_id', 'no', 'item']
longform = longform.sort_values(by=list(sort_cols), ascending=[True]*len(sort_cols))

longform = longform.reset_index()
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r1,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice
0,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,6,practice,990,1,,practice,n,...,5692.0,2251.0,1551.0,1366.0,1200.0,1379.0,1429.0,,3343.0,left
1,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,7,practice,991,2,,practice,y,...,1349.0,1314.0,1077.0,901.0,901.0,922.0,900.0,,2227.0,right
2,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,8,practice,992,3,,practice,n,...,1017.0,972.0,1032.0,870.0,815.0,1278.0,1240.0,2116.0,2256.0,left
3,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,10,experiment,15,5,1,contrastive,n,...,1239.0,947.0,1051.0,762.0,845.0,943.0,1159.0,,3306.0,left
4,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,11,experiment,94,28,2,other-directed-x,y,...,836.0,902.0,843.0,895.0,1114.0,,1316.0,3357.0,3077.0,right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,45,experiment,63,21,2,self-directed,n,...,499.0,518.0,608.0,648.0,813.0,,1254.0,1335.0,3216.0,right
1219,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,46,experiment,96,29,2,other-directed,y,...,710.0,908.0,806.0,812.0,1006.0,,1622.0,797.0,2985.0,left
1220,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,467.0,521.0,573.0,521.0,512.0,810.0,1227.0,,17275.0,left
1221,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,48,experiment,75,24,2,self-directed,n,...,808.0,780.0,606.0,750.0,887.0,,1484.0,791.0,2782.0,right


## Attention checks

In [8]:
# # Filter longform for attention-check condition
# attention_check_rows = longform[longform['condition'] == 'attention-check']
# # Show rows where choice does not match target
# failed_attention_checks = attention_check_rows[attention_check_rows['choice'] != attention_check_rows['target']]
# failed_attention_checks

# Refined attention check: match choice to the column (left or right) that matches target
def check_attention(row):
    if row['choice'] == 'left':
        return row['target'] == row['left']
    elif row['choice'] == 'right':
        return row['target'] == row['right']
    return False

attention_check_rows = longform[longform['condition'] == 'attention-check'].copy()
attention_check_rows['attention_pass'] = attention_check_rows.apply(check_attention, axis=1)

# Show rows where attention check failed
failed_attention_checks = attention_check_rows[~attention_check_rows['attention_pass']]
failed_attention_checks

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,attention_pass
604,2025-09-25 21:45:59+00:00,60cb4cd6477b2ff7c1adaea4,a,44,experiment,111,33,,attention-check,n,...,244.0,206.0,201.0,346.0,,371.0,,2958.0,right,False


## Calculate expectations/mismatch

In [9]:
# Find rows where target is available and does not match the chosen side's value
def choice_matches_target(row):
    if pd.isna(row['choice']) or pd.isna(row['target']):
        return True  # skip if missing
    if row['choice'] == 'left':
        return row['target'] == row['left']
    elif row['choice'] == 'right':
        return row['target'] == row['right']
    return False

mismatch_rows = longform[
    longform['target'].notna() &
    longform['choice'].notna() &
    (~longform.apply(choice_matches_target, axis=1))
].copy()

# Filter mismatch_rows to exclude any rows where 'target' is missing or empty string
mismatch_rows = mismatch_rows[mismatch_rows['target'].notna() & (mismatch_rows['target'] != '')].copy()
mismatch_rows

# Add a new column 'mismatch' to the longform to indicate these items where the choice does not match the target
longform['mismatch'] = False
longform.loc[mismatch_rows.index, 'mismatch'] = True
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,mismatch
0,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,6,practice,990,1,,practice,n,...,2251.0,1551.0,1366.0,1200.0,1379.0,1429.0,,3343.0,left,False
1,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,7,practice,991,2,,practice,y,...,1314.0,1077.0,901.0,901.0,922.0,900.0,,2227.0,right,False
2,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,8,practice,992,3,,practice,n,...,972.0,1032.0,870.0,815.0,1278.0,1240.0,2116.0,2256.0,left,False
3,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,10,experiment,15,5,1,contrastive,n,...,947.0,1051.0,762.0,845.0,943.0,1159.0,,3306.0,left,True
4,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,11,experiment,94,28,2,other-directed-x,y,...,902.0,843.0,895.0,1114.0,,1316.0,3357.0,3077.0,right,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,45,experiment,63,21,2,self-directed,n,...,518.0,608.0,648.0,813.0,,1254.0,1335.0,3216.0,right,False
1219,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,46,experiment,96,29,2,other-directed,y,...,908.0,806.0,812.0,1006.0,,1622.0,797.0,2985.0,left,True
1220,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,521.0,573.0,521.0,512.0,810.0,1227.0,,17275.0,left,False
1221,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,48,experiment,75,24,2,self-directed,n,...,780.0,606.0,750.0,887.0,,1484.0,791.0,2782.0,right,False


In [10]:
# Print how many items I have for each condition
print(longform['mismatch'].value_counts())

mismatch
False    1041
True      182
Name: count, dtype: int64


## Choosing experiment

In [11]:
# Remove all rows where exp is 2 from trial_index
longform = longform[longform['exp'] != '2'].copy()
longform

# Reorder by custom condition order: practice, attention-check, exclusive, focus, contrastive
custom_order = ['practice', 'attention-check', 'exclusive', 'focus', 'contrastive']
existing = longform['condition'].dropna().astype(str).unique().tolist()
remaining = [c for c in existing if c not in custom_order]
full_order = custom_order + remaining

# Make ordered categorical and then sort by it (plus any secondary keys).
longform['condition'] = pd.Categorical(longform['condition'].astype(str), categories=full_order, ordered=True)
longform = longform.sort_values(by=['condition', 'participant_id', 'no'], na_position='last').reset_index(drop=True)
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,mismatch
0,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,6,practice,990,1,,practice,n,...,2251.0,1551.0,1366.0,1200.0,1379.0,1429.0,,3343.0,left,False
1,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,7,practice,991,2,,practice,y,...,1314.0,1077.0,901.0,901.0,922.0,900.0,,2227.0,right,False
2,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,8,practice,992,3,,practice,n,...,972.0,1032.0,870.0,815.0,1278.0,1240.0,2116.0,2256.0,left,False
3,2025-10-06 21:02:00+00:00,5b93d1913dca6000012c5fdc,c,6,practice,990,1,,practice,n,...,2455.0,1778.0,1791.0,1468.0,1839.0,1928.0,,3828.0,left,False
4,2025-10-06 21:02:00+00:00,5b93d1913dca6000012c5fdc,c,7,practice,991,2,,practice,y,...,1116.0,788.0,516.0,488.0,808.0,540.0,,3081.0,right,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,521.0,573.0,521.0,512.0,810.0,1227.0,,17275.0,left,False
696,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,49,experiment,21,7,1,contrastive,n,...,495.0,456.0,539.0,636.0,1055.0,884.0,,6009.0,right,False
697,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,43,experiment,30,10,1,contrastive,y,...,791.0,721.0,543.0,517.0,717.0,870.0,,7092.0,left,False
698,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,20,experiment,39,13,1,contrastive,n,...,693.0,803.0,1143.0,620.0,1227.0,909.0,,4445.0,left,True


## Expectations

In [12]:
import plotly.express as px

# Calculate mismatch ratio per condition
mismatch_counts = longform.groupby('condition')['mismatch'].agg(['sum', 'count'])
mismatch_counts['mismatch_ratio'] = mismatch_counts['sum'] / mismatch_counts['count']

# Prepare data for donut chart
donut_data = []
for cond, row in mismatch_counts.iterrows():
    donut_data.append({'condition': cond, 'type': 'Mismatch', 'count': row['sum']})
    donut_data.append({'condition': cond, 'type': 'Match', 'count': row['count'] - row['sum']})

donut_df = pd.DataFrame(donut_data)

fig = px.pie(
    donut_df,
    names='type',
    values='count',
    color='type',
    facet_col='condition',
    hole=0.5,
    title='Expectation Mismatch Ratio per Condition',
    color_discrete_map={'Mismatch': 'crimson', 'Match': 'lightgray'}
)
fig.update_traces(textinfo='percent')
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.show()

# Save as png and html
fig.write_image(plots / "mismatch_donut_per_condition.png", width=1200, height=400)
fig.write_html(plots / "mismatch_donut_per_condition.html", include_plotlyjs='cdn')

  mismatch_counts = longform.groupby('condition')['mismatch'].agg(['sum', 'count'])


## Reading Times

In [13]:
# Replot: every item as an individual line, faceted by condition, color by participant
import plotly.express as px

# Rebuild melted if needed (one row per participant × item × region)
region_cols = [c for c in longform.columns if re.match(r"r\d+$", c)]
plot_df = longform.melt(
    id_vars=['participant_id', 'no', 'condition'],
    value_vars=region_cols,
    var_name='region',
    value_name='reading_time'
).dropna(subset=['reading_time'])

# Ensure region order
full_region_order = [f"r{i}" for i in range(1, 8)]
plot_df['region'] = pd.Categorical(plot_df['region'], categories=full_region_order, ordered=True)

# Create a unique line id per item (so each item is its own trace) and keep color = participant
plot_df['pid_item'] = plot_df['participant_id'].astype(str) + ' | no ' + plot_df['no'].astype(str)

fig = px.line(
    plot_df,
    x='region',
    y='reading_time',
    color='participant_id',        # color by participant
    line_group='pid_item',         # each item -> separate connected line
    facet_col='condition',
    # facet_col_wrap=2,              # <- two columns of facets
    category_orders={'region': full_region_order},
    markers=True,
    template='plotly_white',       
    title='Reading times per item (each item = one line), faceted by condition, colored by participant',
    labels={'reading_time': 'Reading time (ms)', 'region': 'Region'}
)

fig.update_traces(mode='lines+markers', marker={'size':4}, opacity=0.7, hovertemplate=None)
fig.update_layout(legend_title_text='Participant', height=400)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))  # simplify facet labels
fig.show()

# Save image as png and html
fig.write_image(plots / "reading_times_raw.png", scale=3)
fig.write_html(plots / "reading_times_raw.html", include_plotlyjs='cdn')


In [14]:
# Remove practice and attention checks
longform = longform[longform['exp'].astype(str).eq('1')].copy()
longform

region_idx,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice,mismatch
196,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,36,experiment,1,1,1,exclusive,n,...,532.0,556.0,696.0,584.0,756.0,906.0,,1745.0,left,False
197,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,43,experiment,10,4,1,exclusive,y,...,697.0,574.0,559.0,585.0,620.0,641.0,,2028.0,right,False
198,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,49,experiment,19,7,1,exclusive,n,...,604.0,524.0,521.0,575.0,678.0,510.0,,1831.0,left,False
199,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,45,experiment,28,10,1,exclusive,y,...,578.0,593.0,603.0,508.0,715.0,677.0,,1807.0,right,False
200,2025-10-08 11:45:02+00:00,59679c319febf80001d53655,a,38,experiment,37,13,1,exclusive,n,...,587.0,542.0,606.0,658.0,648.0,864.0,,1902.0,left,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,521.0,573.0,521.0,512.0,810.0,1227.0,,17275.0,left,False
696,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,49,experiment,21,7,1,contrastive,n,...,495.0,456.0,539.0,636.0,1055.0,884.0,,6009.0,right,False
697,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,43,experiment,30,10,1,contrastive,y,...,791.0,721.0,543.0,517.0,717.0,870.0,,7092.0,left,False
698,2025-09-29 13:59:02+00:00,67698e94c727a4a942390c57,c,20,experiment,39,13,1,contrastive,n,...,693.0,803.0,1143.0,620.0,1227.0,909.0,,4445.0,left,True


In [16]:
import plotly.express as px

# Melt longform to one row per participant × item × region 
region_cols = [c for c in longform.columns if re.match(r"r\d+$", c)]
melted = (
    longform
    .melt(id_vars=['participant_id', 'item', 'condition'],
          value_vars=region_cols,
          var_name='region',
          value_name='reading_time')
    .dropna(subset=['reading_time'])
)

# Per-participant (across items) mean and std for each region × condition
pp_region_stats = (
    melted
    .groupby(['participant_id', 'condition', 'region'], as_index=False, observed=True)
    .reading_time.agg(reading_time_mean='mean', reading_time_std='std')
)

# Per-participant question/choice mean+std across items (if present)
extras_pp = []
if 'question_rt' in longform.columns:
    q_pp = (
        longform
        .dropna(subset=['question_rt'])
        .groupby(['participant_id', 'condition'], as_index=False, observed=True)
        .question_rt.agg(reading_time_mean='mean', reading_time_std='std')
    )
    q_pp['region'] = 'question'
    extras_pp.append(q_pp[['participant_id','condition','region','reading_time_mean','reading_time_std']])

if 'choice_rt' in longform.columns:
    c_pp = (
        longform
        .dropna(subset=['choice_rt'])
        .groupby(['participant_id', 'condition'], as_index=False, observed=True)
        .choice_rt.agg(reading_time_mean='mean', reading_time_std='std')
    )
    c_pp['region'] = 'choice'
    extras_pp.append(c_pp[['participant_id','condition','region','reading_time_mean','reading_time_std']])

if extras_pp:
    pp_region_stats = pd.concat([pp_region_stats] + extras_pp, ignore_index=True, sort=False)

# Only show present conditions
present_conditions = pp_region_stats['condition'].dropna().unique().tolist()

# Now aggregate across participants: use participant-level means to compute group mean and across-participant std
present_regions = pp_region_stats['region'].unique().tolist()
full_region_order = [f"r{i}" for i in range(1, 8)]
present_regions = [r for r in full_region_order if r in present_regions] + [r for r in ['question','choice'] if r in pp_region_stats['region'].unique()]

agg_plot = (
    pp_region_stats
    .groupby(['region', 'condition'], as_index=False, observed=True)
    .reading_time_mean.agg(mean_reading_time='mean', std_reading_time='std')
)

# --- INSERT FALLBACK STD COMPUTATION HERE ---
std_items = melted.groupby(['condition', 'region'], observed=True)['reading_time'].std().reset_index().rename(columns={'reading_time':'std_items'})

if 'question_rt' in longform.columns:
    std_q = longform.dropna(subset=['question_rt']).groupby('condition', observed=True)['question_rt'].std().reset_index().rename(columns={'question_rt':'std_items'})
    std_q['region'] = 'question'
    std_items = pd.concat([std_items, std_q[['condition','region','std_items']]], ignore_index=True)

if 'choice_rt' in longform.columns:
    std_c = longform.dropna(subset=['choice_rt']).groupby('condition', observed=True)['choice_rt'].std().reset_index().rename(columns={'choice_rt':'std_items'})
    std_c['region'] = 'choice'
    std_items = pd.concat([std_items, std_c[['condition','region','std_items']]], ignore_index=True)

agg_plot = agg_plot.merge(std_items, on=['condition', 'region'], how='left')
agg_plot['std_reading_time'] = agg_plot['std_reading_time'].fillna(agg_plot['std_items'])
agg_plot.drop(columns=['std_items'], inplace=True)
# --- END INSERT ---

# Keep only present regions and set categorical ordering
agg_plot = agg_plot[agg_plot['region'].isin(present_regions) & agg_plot['condition'].isin(present_conditions)].copy()
agg_plot['region'] = pd.Categorical(agg_plot['region'], categories=present_regions, ordered=True)
agg_plot['condition'] = pd.Categorical(agg_plot['condition'], categories=present_conditions, ordered=True)

# Plot 1: Reading times for regions (r1..r7) — error bars are across-participant std of participant means
region_only = agg_plot[agg_plot['region'].isin(full_region_order)]
fig1 = px.line(
    region_only,
    x='region',
    y='mean_reading_time',
    error_y='std_reading_time',
    color='condition',
    facet_col='condition',
    category_orders={'region': full_region_order, 'condition': present_conditions},
    title='Mean Reading Times per Region<br>(Error Bars = Across-Participant Std Dev, Faceted by Condition)',
    labels={'mean_reading_time': 'Mean Reading Time (ms)', 'region': 'Region'}
)
fig1.update_traces(connectgaps=True, mode='lines+markers', marker={'size': 6}, line={'width': 2}, opacity=0.85)
fig1.update_yaxes(title_text='Mean reading time (ms)')
fig1.update_xaxes(title_text='Region')
fig1.update_layout(legend_title_text='Condition')
fig1.show()

# # Plot 2: Question RTs
# question_only = agg_plot[agg_plot['region'] == 'question']
# fig2 = px.bar(
#     question_only,
#     x='condition',
#     y='mean_reading_time',
#     error_y='std_reading_time',
#     color='condition',
#     category_orders={'condition': present_conditions},
#     title='Mean Question RT by Condition (Error Bars = Std Dev)',
#     labels={'mean_reading_time': 'Mean Question RT (ms)', 'condition': 'Condition'}
# )
# fig2.update_layout(legend_title_text='Condition')
# fig2.show()

# Plot 3: Decision RTs
choice_only = agg_plot[agg_plot['region'] == 'choice']
fig3 = px.bar(
    choice_only,
    x='condition',
    y='mean_reading_time',
    error_y='std_reading_time',
    color='condition',
    category_orders={'condition': present_conditions},
    title='Mean Decision RT by Condition (Error Bars = Std Dev)',
    labels={'mean_reading_time': 'Mean Decision RT (ms)', 'condition': 'Condition'}
)
fig3.update_layout(legend_title_text='Condition')
fig3.show()

In [17]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# Determine three_conditions (prefer canonical)
preferred = ['exclusive', 'focus', 'contrastive']
present_conditions = agg_plot['condition'].dropna().astype(str).unique().tolist()
three_conditions = [c for c in preferred if c in present_conditions]
if len(three_conditions) < 3:
    three_conditions = present_conditions[:3]

# Regions (exclude 'choice' from region facets)
full_region_order = [f"r{i}" for i in range(1, 8)]
region_order = full_region_order[:]  # r1..r7 only

# Data frames
region_df = agg_plot[agg_plot['region'].isin(region_order) & agg_plot['condition'].isin(three_conditions)].copy()
region_df['region'] = pd.Categorical(region_df['region'], categories=region_order, ordered=True)

choice_df = agg_plot[agg_plot['region'].astype(str) == 'choice']
choice_df = choice_df[choice_df['condition'].isin(three_conditions)].copy()

# Build color map for conditions so region traces and choice bars share colors
palette = px.colors.qualitative.Plotly  # stable qualitative palette
color_map = {cond: palette[i % len(palette)] for i, cond in enumerate(three_conditions)}

# --- Calculate global min/max for y axis from all region_df and choice_df mean_reading_time + std_reading_time values ---
all_y = pd.concat([
    region_df['mean_reading_time'] + region_df['std_reading_time'].fillna(0),
    choice_df['mean_reading_time'] + choice_df['std_reading_time'].fillna(0)
]).dropna()
ymin = float(pd.concat([
    region_df['mean_reading_time'] - region_df['std_reading_time'].fillna(0),
    choice_df['mean_reading_time'] - choice_df['std_reading_time'].fillna(0)
]).min()) if not all_y.isnull().all() else 0
ymax = float(all_y.max()) if not all_y.isnull().all() else 1
ypad = (ymax - ymin) * 0.15 if (ymax - ymin) > 0 else 10  # Increase padding for error bars
ymin = max(0, ymin - ypad)
ymax = ymax + ypad

# Build subplots: one column per condition + one final column for 'choice'
n_cols = len(three_conditions) + 1
subplot_titles = [f"{c}" for c in three_conditions] + ['choice']
fig = make_subplots(rows=1, cols=n_cols, shared_yaxes=True, subplot_titles=subplot_titles)

# Add region lines to each condition column (first len(three_conditions) columns)
for ci, cond in enumerate(three_conditions, start=1):
    d = region_df[region_df['condition'] == cond].sort_values('region')
    if d.empty:
        continue
    cond_color = color_map.get(cond)
    fig.add_trace(
        go.Scatter(
            x=d['region'].astype(str),
            y=d['mean_reading_time'],
            mode='lines+markers',
            name=cond,
            error_y=dict(type='data', array=d['std_reading_time'].values if 'std_reading_time' in d else None),
            marker=dict(size=6, color=cond_color),
            line=dict(width=2, color=cond_color)
        ),
        row=1, col=ci
    )
    # tidy axes
    fig.update_xaxes(categoryorder='array', categoryarray=region_order, row=1, col=ci)
    fig.update_yaxes(title_text='Mean reading time (ms)', row=1, col=ci, range=[ymin, ymax])

# Add 'choice' bars in the final column (one bar per condition) using same colors
choice_col = n_cols
if not choice_df.empty:
    # ensure order of conditions
    choice_df['condition'] = pd.Categorical(choice_df['condition'].astype(str), categories=three_conditions, ordered=True)
    for cond in three_conditions:
        d = choice_df[choice_df['condition'] == cond]
        if d.empty:
            continue
        cond_color = color_map.get(cond)
        fig.add_trace(
            go.Bar(
                x=[cond],
                y=[d['mean_reading_time'].iloc[0]],
                name=cond,
                marker_color=cond_color,
                error_y=dict(type='data', array=[d['std_reading_time'].iloc[0]] if 'std_reading_time' in d else None),
            ),
            row=1, col=choice_col
        )
    fig.update_xaxes(title_text='Condition', row=1, col=choice_col)
    fig.update_yaxes(title_text='Mean reading time (ms)', row=1, col=choice_col, range=[ymin, ymax])

# Layout
fig.update_layout(
    template='plotly_white',
    showlegend=False,
    title='Mean RTs: regions (r1..r7) faceted by condition; choice RTs as separate facet',
    # height=450,
    # width=max(1000, 300 * n_cols)
)

fig.show()

# Save as png and html
fig.write_image(plots / "reading_times.png", scale=3)
fig.write_html(plots / "reading_times.html", include_plotlyjs='cdn')

## Choices

In [18]:
# Calculate, for each condition, the ratio of choices for 'a' type and 'b' type sides
def get_side_type(value):
    # Returns 'a' if value ends with 'a', 'b' if ends with 'b', else None
    if isinstance(value, str):
        if value.endswith('a'):
            return 'a'
        elif value.endswith('b'):
            return 'b'
    return None

# For each row, determine the type ('a' or 'b') of the chosen side
def chosen_side_type(row):
    if row['choice'] == 'left':
        return get_side_type(row['left'])
    elif row['choice'] == 'right':
        return get_side_type(row['right'])
    return None

longform['chosen_type'] = longform.apply(chosen_side_type, axis=1)

# Group by condition and count choices for each type
type_counts = (
    longform
    .groupby(['condition', 'chosen_type'])
    .size()
    .unstack(fill_value=0)
)

# Calculate ratio for each type per condition
type_ratios = type_counts.div(type_counts.sum(axis=1), axis=0)

print("Counts per condition:")
print(type_counts)
print("\nRatios per condition:")
print(type_ratios)

Counts per condition:
chosen_type        a    b
condition                
practice           0    0
attention-check    0    0
exclusive        158   10
focus            158   10
contrastive       66  102

Ratios per condition:
chosen_type             a         b
condition                          
practice              NaN       NaN
attention-check       NaN       NaN
exclusive        0.940476  0.059524
focus            0.940476  0.059524
contrastive      0.392857  0.607143






In [19]:
import plotly.express as px

# Prepare data for donut chart: one donut per condition, split by chosen_type
donut_data = []
for cond in type_counts.index:
    for chosen_type in type_counts.columns:
        count = type_counts.loc[cond, chosen_type]
        donut_data.append({
            'condition': cond,
            'chosen_type': chosen_type if pd.notnull(chosen_type) else 'None',
            'count': count
        })

donut_df = pd.DataFrame(donut_data)

# Filter out 'practice' and 'attention-check' conditions before plotting
exclude_conditions = ['practice', 'attention-check']
plot_conditions = [c for c in type_counts.index if c not in exclude_conditions]
filtered_donut_df = donut_df[donut_df['condition'].isin(plot_conditions)]

fig = px.pie(
    filtered_donut_df,
    names='chosen_type',
    values='count',
    color='chosen_type',
    facet_col='condition',
    hole=0.5,
    title='Choice Ratios of Exclusive (a) or Contrastive (b) types of images per Condition',
    color_discrete_map={'a': 'royalblue', 'b': 'orange', 'None': 'lightgray'}
)
fig.update_traces(textinfo='percent+label')
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.show()

# Save as png and html
fig.write_image(plots / "choice_type_donut_per_condition.png", width=1200, height=400)
fig.write_html(plots / "choice_type_donut_per_condition.html", include_plotlyjs='cdn')

## Eye-tracking

In [20]:
import pandas as pd
import requests
from io import StringIO

def download_and_save_eyetracking_data():
    out_dir = 'eyetracking_data'
    eturl = "https://mondo1.dreamhosters.com/script.php?experiment="
    for _, row in participants_df.iterrows():
        et_filename = row['eyetracker_filename']
        participant_id = row['participant_id']
        if pd.notnull(et_filename) and pd.notnull(participant_id):
            et_file = eturl + et_filename
            try:
                r = requests.get(et_file, timeout=15)
                r.raise_for_status()
                df_et = pd.read_csv(StringIO(r.text))
                df_et.to_csv(f"{out_dir}/{participant_id}.csv", index=False)
            except Exception as e:
                print(f"Failed for {participant_id}: {e}")
    print("Done!")
# Example usage:
download_and_save_eyetracking_data()

Done!


In [21]:
# Fix eyetracking csv trial numbering
import os
import pandas as pd
from glob import glob

et_dir = os.path.join(ROOT, 'eyetracking_data')

for csv_file in glob(os.path.join(et_dir, '*.csv')):
    df = pd.read_csv(csv_file)
    # Apply fix_trial_numbering WITHOUT changing row order
    trial_col = 'trial' if 'trial' in df.columns else [c for c in df.columns if 'trial' in c][0]
    # Map 9,10,11 to 6,7,8
    mapping = {9: 6, 10: 7, 11: 8}
    df[trial_col] = df[trial_col].replace(mapping)
    # Find the index of the last occurrence of 8
    last_mapped_idx = df[df[trial_col] == 8].index.max()
    if last_mapped_idx is not None and last_mapped_idx + 1 < len(df):
        next_trial = 10
        prev_trial_val = None
        for i in range(last_mapped_idx + 1, len(df)):
            current_trial_val = df.at[i, trial_col]
            if prev_trial_val is not None and current_trial_val != prev_trial_val:
                next_trial += 1
            df.at[i, trial_col] = next_trial
            prev_trial_val = current_trial_val
    df.to_csv(csv_file, index=False)
print("All eyetracking files fixed and overwritten.")

All eyetracking files fixed and overwritten.


In [22]:
# Eye tracking data with filtering out neutral gaze
import os
import pandas as pd
import numpy as np
import plotly.express as px
from glob import glob

# Directory containing eyetracking data
et_dir = os.path.join(ROOT, 'eyetracking_data')

# Helper to parse a single participant's eyetracking file
def parse_eyetracking_file(filepath):
    df = pd.read_csv(filepath)
    # df = fix_trial_numbering(df).copy()
    df.columns = [c.lower() for c in df.columns]
    # Drop rows where both left and right canvas are 0 (neutral gaze) 
    # df = df[~((df['_left_canvas'] == 0) & (df['_right_canvas'] == 0))].copy()
    # Identify trial number
    if 'trial' in df.columns:
        trial_col = 'trial'
    else:
        trial_col = [c for c in df.columns if 'trial' in c][0]
    return df, trial_col

# Aggregate results for all participants except those to remove
et_results = []
for csv_file in glob(os.path.join(et_dir, '*.csv')):
    participant_id = os.path.splitext(os.path.basename(csv_file))[0]
    if participant_id in list_of_participants_to_remove:
        continue
    df, trial_col = parse_eyetracking_file(csv_file)
    for trial, g in df.groupby(trial_col):
        dwell_left = g.loc[g['_left_canvas'] == 1, 'times'].max() / 1000
        dwell_right = g.loc[g['_right_canvas'] == 1, 'times'].max() / 1000
        total_dwell = dwell_left + dwell_right
        prop_left = dwell_left / total_dwell if total_dwell > 0 else np.nan
        prop_right = dwell_right / total_dwell if total_dwell > 0 else np.nan
        dominant = 'left' if dwell_left > dwell_right else 'right'
        left_changes = (g['_left_canvas'] != g['_left_canvas'].shift()).astype(int)
        n_fix_left = ((g['_left_canvas'] == 1) & (left_changes == 1)).sum()
        right_changes = (g['_right_canvas'] != g['_right_canvas'].shift()).astype(int)
        n_fix_right = ((g['_right_canvas'] == 1) & (right_changes == 1)).sum()
        canvas_state = np.where(g['_left_canvas'] == 1, 'L', np.where(g['_right_canvas'] == 1, 'R', 'N'))
        transitions = np.sum(canvas_state[1:] != canvas_state[:-1])

        # --- NEW: compute time-to-first-fixation (tff), first-fixation-location (ffl),
        # fixation counts per AOI and revisits (returns to AOI after leaving) ---
        states = canvas_state
        # indices where state changes => segment boundaries
        change_idx = np.where(states[1:] != states[:-1])[0] + 1
        segment_bounds = np.concatenate(([0], change_idx, [len(states)]))
        fixations_left = 0
        fixations_right = 0
        first_fix_idx = None
        for s_start, s_end in zip(segment_bounds[:-1], segment_bounds[1:]):
            seg_state = states[s_start]
            if seg_state == 'L':
                fixations_left += 1
                if first_fix_idx is None:
                    first_fix_idx = s_start
            elif seg_state == 'R':
                fixations_right += 1
                if first_fix_idx is None:
                    first_fix_idx = s_start
        # time to first fixation (seconds) relative to trial start (first timestamp in g)
        if first_fix_idx is not None and len(g) > 0:
            try:
                first_fix_time = g['times'].values[first_fix_idx]
                trial_start_time = g['times'].values[0]
                tff = (first_fix_time - trial_start_time) / 1000.0
            except Exception:
                tff = np.nan
            ffl = 'left' if states[first_fix_idx] == 'L' else 'right' if states[first_fix_idx] == 'R' else None
        else:
            tff = np.nan
            ffl = None

        # revisits = number of returns to an AOI after leaving it = max(0, fixations - 1)
        revisits_left = max(0, fixations_left - 1)
        revisits_right = max(0, fixations_right - 1)

        et_results.append({
            'participant_id': participant_id,
            'trial': trial,
            'dominant': dominant,
            'prop_left': round(prop_left, 2) if not np.isnan(prop_left) else np.nan,
            'prop_right': round(prop_right, 2) if not np.isnan(prop_right) else np.nan,
            'total_dwell': round(total_dwell, 2),
            'dwell_left': dwell_left,
            'dwell_right': dwell_right,
            'n_fix_left': n_fix_left,
            'n_fix_right': n_fix_right,
            'transitions': transitions,
            # added metrics
            'tff': tff,                       # time to first fixation (s)
            'ffl': ffl,                       # first fixation location: 'left' or 'right' (or None)
            'fixations_left': fixations_left, # number of discrete left-AOI fixations
            'fixations_right': fixations_right,# number of discrete right-AOI fixations
            'fixation_count_total': fixations_left + fixations_right,
            'revisits_left': revisits_left,   # returns to left after leaving
            'revisits_right': revisits_right, # returns to right after leaving
            'revisits_total': revisits_left + revisits_right
        })
# ...existing code...
et_df = pd.DataFrame(et_results)

# Remove certain participants
et_df = et_df[~et_df['participant_id'].isin(list_of_participants_to_remove)]
et_df

Unnamed: 0,participant_id,trial,dominant,prop_left,prop_right,total_dwell,dwell_left,dwell_right,n_fix_left,n_fix_right,transitions,tff,ffl,fixations_left,fixations_right,fixation_count_total,revisits_left,revisits_right,revisits_total
0,59679c319febf80001d53655,6,left,0.57,0.43,3.94,2.231,1.707,6,4,17,0.000,right,6,4,10,5,3,8
1,59679c319febf80001d53655,7,right,,,,,1.071,0,2,3,0.187,right,0,2,2,0,1,1
2,59679c319febf80001d53655,8,left,0.73,0.27,1.62,1.187,0.438,1,1,1,0.000,right,1,1,2,0,0,0
3,59679c319febf80001d53655,10,left,0.54,0.46,4.11,2.223,1.886,3,4,11,0.000,right,3,4,7,2,3,5
4,59679c319febf80001d53655,11,right,0.38,0.62,4.84,1.845,3.000,3,6,12,0.000,right,3,6,9,2,5,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1216,67698e94c727a4a942390c57,45,left,0.51,0.49,6.30,3.188,3.116,7,9,24,0.000,left,7,9,16,6,8,14
1217,67698e94c727a4a942390c57,46,left,0.71,0.29,4.16,2.959,1.205,4,5,14,0.000,right,4,5,9,3,4,7
1218,67698e94c727a4a942390c57,47,left,0.53,0.47,30.81,16.214,14.595,15,5,31,0.000,left,15,5,20,14,4,18
1219,67698e94c727a4a942390c57,48,right,0.39,0.61,4.48,1.753,2.731,4,5,11,0.000,right,4,5,9,3,4,7


## Merge ET and behavioral data

In [23]:
# Print count values of participant_id 
print(et_df['participant_id'].value_counts())

participant_id
59679c319febf80001d53655      44
5f9bfbae0517c60102da9e57      44
667f230c31ae00d966dfff6e      44
666d9692dac9d55d3f331401      44
6658c355ad6d072d4f208486      44
615d5311a9fbf1f98aa62a5a      44
6154d933a58bf7bcd9e81fed      44
61545727036ae019fdeea5df      44
6151b07ac0d164fdd7e53100      44
6108da57e362f96a3ee32a88      44
60ed917a71051ceb8ec151c9      44
60cb4cd6477b2ff7c1adaea4      44
60d26e7cd9f0761e4d12b9f8      44
5f5fc58bfcb0db07c6c64d9e      44
5f3013e31c8a690aacb02c31      44
5ee75f3d1a88450293a38aeb      44
5e3b29dc87243b34bde5abfa      44
5c5c785fc9735b00010ced0b      44
5b93d1913dca6000012c5fdc      43
60fc8070ac259dd44fb94150      43
60fd4fbf1a3f1446d576e5d4      43
5f3ce934a12769b771503625      43
6108da57e362f96a3ee32a88_2    43
5f338ba6ea047119dbd6e49e      43
5dade76a4860f70017f70ec5      43
5d4fe6a2ffbcf800019d5e54      43
67698e94c727a4a942390c57      43
5f4c042383588080d02e61a3      42
Name: count, dtype: int64


In [28]:
# Show participant 5c5c785fc9735b00010ced0b rows
et_df[et_df['participant_id'] == '5c5c785fc9735b00010ced0b']

Unnamed: 0,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,n_fix_right,transitions,tff,ffl,fixations_left,fixations_right,fixation_count_total,revisits_left,revisits_right,revisits_total
12,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,36,experiment,1,1,1,exclusive,n,...,8.0,19.0,0.0,right,8.0,8.0,16.0,7.0,7.0,14.0
13,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,43,experiment,10,4,1,exclusive,y,...,4.0,11.0,0.0,left,3.0,4.0,7.0,2.0,3.0,5.0
14,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,49,experiment,19,7,1,exclusive,n,...,2.0,7.0,0.118,right,2.0,2.0,4.0,1.0,1.0,2.0
15,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,45,experiment,28,10,1,exclusive,y,...,5.0,11.0,0.149,left,1.0,5.0,6.0,0.0,4.0,4.0
16,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,38,experiment,37,13,1,exclusive,n,...,4.0,17.0,0.0,right,8.0,4.0,12.0,7.0,3.0,10.0
17,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,40,experiment,46,16,1,exclusive,y,...,1.0,4.0,0.089,left,2.0,1.0,3.0,1.0,0.0,1.0
180,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,18,experiment,8,3,1,focus,y,...,9.0,24.0,0.0,left,8.0,9.0,17.0,7.0,8.0,15.0
181,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,13,experiment,17,6,1,focus,n,...,0.0,8.0,0.0,left,5.0,0.0,5.0,4.0,0.0,4.0
182,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,35,experiment,26,9,1,focus,y,...,5.0,13.0,0.0,right,4.0,5.0,9.0,3.0,4.0,7.0
183,2025-10-07 09:30:23,5c5c785fc9735b00010ced0b,a,27,experiment,35,12,1,focus,n,...,6.0,17.0,0.048,left,8.0,6.0,14.0,7.0,5.0,12.0


In [24]:
# Overwrite 'trial' values in et_df per participant using the order from region_mean/trial_index
for participant in et_df['participant_id'].unique():
    # Get the ordered list of trial numbers for this participant from region_mean
    # region_mean index: (results_time, participant_id, group, trial, label, no, item, condition, cb, left, right)
    # Extract trial numbers for this participant
    idx = region_mean.index
    participant_trials = [i[3] for i in idx if i[1] == participant]
    
    # Get indices in et_df for this participant
    mask = et_df['participant_id'] == participant
    n_trials = mask.sum()
    
    # Only overwrite if counts match
    if len(participant_trials) == n_trials:
        et_df.loc[mask, 'trial'] = participant_trials
    else:
        print(f"Warning: trial count mismatch for {participant} (region_mean: {len(participant_trials)}, et_df: {n_trials})")


# Merge eyetracking data with longform data on participant_id
et_df = pd.merge(
    longform,
    et_df,
    how='left',
    left_on=['participant_id', 'trial'],
    right_on=['participant_id', 'trial']
)

et_df

# Save et_df as results.xlsx
# Ensure any timezone-aware datetimes are made timezone-naive before writing to Excel
from pandas.api import types as ptypes

for c in et_df.columns:
    if ptypes.is_datetime64tz_dtype(et_df[c]):
        # convert to UTC then remove tz info -> results become tz-naive timestamps
        et_df[c] = et_df[c].dt.tz_convert('UTC').dt.tz_localize(None)

# Save et_df as results.xlsx (now safe)
et_df.to_excel("results.xlsx", index=False)
et_df




is_datetime64tz_dtype is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.DatetimeTZDtype)` instead.



Unnamed: 0,results_time,participant_id,group,trial,label,no,item,exp,condition,cb,...,n_fix_right,transitions,tff,ffl,fixations_left,fixations_right,fixation_count_total,revisits_left,revisits_right,revisits_total
0,2025-10-08 11:45:02,59679c319febf80001d53655,a,36,experiment,1,1,1,exclusive,n,...,2.0,8.0,0.000,right,3.0,2.0,5.0,2.0,1.0,3.0
1,2025-10-08 11:45:02,59679c319febf80001d53655,a,43,experiment,10,4,1,exclusive,y,...,1.0,0.0,0.000,right,0.0,1.0,1.0,0.0,0.0,0.0
2,2025-10-08 11:45:02,59679c319febf80001d53655,a,49,experiment,19,7,1,exclusive,n,...,1.0,4.0,0.000,right,2.0,1.0,3.0,1.0,0.0,1.0
3,2025-10-08 11:45:02,59679c319febf80001d53655,a,45,experiment,28,10,1,exclusive,y,...,1.0,1.0,0.000,right,0.0,1.0,1.0,0.0,0.0,0.0
4,2025-10-08 11:45:02,59679c319febf80001d53655,a,38,experiment,37,13,1,exclusive,n,...,2.0,8.0,0.117,right,3.0,2.0,5.0,2.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,2025-09-29 13:59:02,67698e94c727a4a942390c57,c,47,experiment,12,4,1,contrastive,y,...,5.0,31.0,0.000,left,15.0,5.0,20.0,14.0,4.0,18.0
500,2025-09-29 13:59:02,67698e94c727a4a942390c57,c,49,experiment,21,7,1,contrastive,n,...,5.0,14.0,0.000,right,5.0,5.0,10.0,4.0,4.0,8.0
501,2025-09-29 13:59:02,67698e94c727a4a942390c57,c,43,experiment,30,10,1,contrastive,y,...,9.0,36.0,0.000,left,14.0,9.0,23.0,13.0,8.0,21.0
502,2025-09-29 13:59:02,67698e94c727a4a942390c57,c,20,experiment,39,13,1,contrastive,n,...,5.0,25.0,0.000,right,11.0,5.0,16.0,10.0,4.0,14.0


## Plot ET results

### All

In [25]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import os

transparent_color = 'hsva(0, 0%, 100%, 0)'

# Facet by condition into rows (one condition per row)
conditions = list(longform['condition'].dropna().unique())
n_rows = max(1, len(conditions))

fig = make_subplots(
    rows=n_rows, cols=1,
    shared_xaxes=True,
    subplot_titles=conditions,
    vertical_spacing=0.02
)

# Track per-condition ordering and metadata for categorical y-axes
facet_yticks = {cond: [] for cond in conditions}
facet_tick_meta = {cond: {} for cond in conditions}

for participant in [p for p in et_df['participant_id'].unique() if p not in list_of_participants_to_remove]:
    df_part = et_df[et_df['participant_id'] == participant]
    sorted_trials = sorted(
        df_part['trial'].unique(),
        key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x
    )
    for trial in sorted_trials:
        lf_row = longform[(longform['participant_id'] == participant) & (longform['trial'] == trial)]
        if lf_row.empty:
            continue
        cond = lf_row['condition'].iloc[0]
        if pd.isna(cond) or cond not in conditions:
            continue

        if 'item' in lf_row.columns and pd.notna(lf_row['item'].iloc[0]):
            item_label = lf_row['item'].iloc[0]
        else:
            item_label = trial
        label = f"{participant} - Item {item_label}"
        if label not in facet_yticks[cond]:
            facet_yticks[cond].append(label)

        et_row = df_part[df_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        mismatch_flag = False
        if 'mismatch' in lf_row.columns:
            try:
                mismatch_flag = bool(lf_row['mismatch'].iloc[0])
            except Exception:
                mismatch_flag = False

        facet_tick_meta[cond][label] = {
            'pid_short': str(participant)[:5],
            'choice': choice if pd.notna(choice) else 'NA',
            'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
            'mismatch': mismatch_flag
        }

        et_file = os.path.join(et_dir, f"{participant}.csv")
        if not os.path.exists(et_file):
            continue
        df_et = pd.read_csv(et_file)
        df_et.columns = [c.lower() for c in df_et.columns]
        trial_col = 'trial' if 'trial' in df_et.columns else [c for c in df_et.columns if 'trial' in c][0]
        g = df_et[df_et[trial_col] == trial].copy()
        if g.empty:
            continue

        canvas_state = np.where(
            (g['_left_canvas'] == 1) & (g['_right_canvas'] == 0), 'L',
            np.where(
                (g['_right_canvas'] == 1) & (g['_left_canvas'] == 0), 'R',
                np.where(
                    (g['_left_canvas'] == 0) & (g['_right_canvas'] == 0), 'N', 'B'
                )
            )
        )
        transitions_idx = np.where(canvas_state[1:] != canvas_state[:-1])[0] + 1
        segment_starts = np.concatenate(([0], transitions_idx))
        segment_ends = np.concatenate((transitions_idx, [len(canvas_state)]))
        for seg_start, seg_end in zip(segment_starts, segment_ends):
            seg = g.iloc[seg_start:seg_end]
            if len(seg) > 0:
                start_time = seg['times'].iloc[0]
                end_time = seg['times'].iloc[-1]
                seg_time = (end_time - start_time) / 1000
                seg_start_s = start_time / 1000
            else:
                seg_time = 0
                seg_start_s = np.nan

            if ((seg['_left_canvas'] == 1) & (seg['_right_canvas'] == 0)).all():
                color = 'cornflowerblue'
                side = 'Left'
            elif ((seg['_right_canvas'] == 1) & (seg['_left_canvas'] == 0)).all():
                color = 'lightgreen'
                side = 'Right'
            elif ((seg['_left_canvas'] == 0) & (seg['_right_canvas'] == 0)).all():
                color = transparent_color
                side = 'None'
            else:
                color = "red"
                side = 'Both'

            row_idx = conditions.index(cond) + 1
            customdata = [[
                participant,
                item_label,
                trial,
                cond,
                side,
                seg_start_s,
                seg_time,
                dominant or 'N/A',
                choice or 'N/A',
                'Yes' if mismatch_flag else 'No'
            ]]
            fig.add_trace(
                go.Bar(
                    x=[seg_time],
                    y=[label],
                    name=side,
                    marker_color=color,
                    orientation='h',
                    showlegend=False,
                    customdata=customdata,
                    hovertemplate=(
                        "<b>%{customdata[0]}</b><br>"
                        "Condition: %{customdata[3]}<br>"
                        "Item: %{customdata[1]}<br>"
                        "Trial: %{customdata[2]}<br>"
                        "Side: %{customdata[4]}<br>"
                        "Segment start: %{customdata[5]:.2f}s<br>"
                        "Segment duration: %{x:.2f}s<br>"
                        "Dominant: %{customdata[7]}<br>"
                        "Choice: %{customdata[8]}<br>"
                        "Mismatch: %{customdata[9]}<extra></extra>"
                    )
                ),
                row=row_idx, col=1
            )

# Format y-axis ticks per condition
for i, cond in enumerate(conditions, start=1):
    labels = facet_yticks[cond]

    def format_label(lbl):
        meta = facet_tick_meta[cond].get(lbl, {})
        pid_short = meta.get('pid_short', 'NA')
        choice_val = meta.get('choice', 'NA')
        base = f"{pid_short} · {choice_val}"
        bold = meta.get('dominant_matches', False)
        mismatch = meta.get('mismatch', False)

        if bold:
            base = f"<b>{base}</b>"
        if mismatch:
            inner = f"<span style='color:crimson'>{base}</span>"
        else:
            inner = base
        return inner

    ticktext = [format_label(t) for t in labels]
    fig.update_yaxes(
        categoryorder='array',
        categoryarray=labels,
        tickvals=labels,
        ticktext=ticktext,
        showgrid=False,
        ticks='',
        row=i, col=1
    )

fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)

fig.update_layout(
    template='plotly_white',
    barmode='stack',
    bargap=0.0,
    bargroupgap=0.0,
    title='Eye-Tracking: Stacked Dwell Time Segments (Transitions)',
    height=max(4000, 200 * n_rows),
    # width=1000,
    showlegend=True
)

fig.show()

fig.write_html(plots / "dwell_times.html", include_plotlyjs='cdn')
fig.write_image(plots / "dwell_times.png", scale=3)

### One item

In [26]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import os

transparent_color = 'hsva(0, 0%, 100%, 0)'

# Facet by condition into rows (one condition per row)
conditions = list(longform['condition'].dropna().unique())
n_rows = max(1, len(conditions))

fig = make_subplots(
    rows=n_rows, cols=1,
    shared_xaxes=True,
    subplot_titles=conditions,
    vertical_spacing=0.1
)

# Track per-condition ordering and metadata for categorical y-axes
facet_yticks = {cond: [] for cond in conditions}
facet_tick_meta = {cond: {} for cond in conditions}

for participant in [p for p in et_df['participant_id'].unique() if p not in list_of_participants_to_remove]:
    df_part = et_df[et_df['participant_id'] == participant]
    sorted_trials = sorted(
        df_part['trial'].unique(),
        key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x
    )
    for trial in sorted_trials:
        lf_row = longform[(longform['participant_id'] == participant) & (longform['trial'] == trial)]
        if lf_row.empty:
            continue
        cond = lf_row['condition'].iloc[0]
        if pd.isna(cond) or cond not in conditions:
            continue

        item_value = lf_row['item'].iloc[0] if 'item' in lf_row.columns else None
        try:
            item_numeric = float(item_value)
        except (TypeError, ValueError):
            item_numeric = None
        if item_numeric != 1:
            continue
        item_label = item_value

        label = f"{participant} - Item {item_label}"
        if label not in facet_yticks[cond]:
            facet_yticks[cond].append(label)

        et_row = df_part[df_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        mismatch_flag = False
        if 'mismatch' in lf_row.columns:
            try:
                mismatch_flag = bool(lf_row['mismatch'].iloc[0])
            except Exception:
                mismatch_flag = False

        facet_tick_meta[cond][label] = {
            'pid_short': str(participant)[:5],
            'choice': choice if pd.notna(choice) else 'NA',
            'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
            'mismatch': mismatch_flag
        }

        et_file = os.path.join(et_dir, f"{participant}.csv")
        if not os.path.exists(et_file):
            continue
        df_et = pd.read_csv(et_file)
        df_et.columns = [c.lower() for c in df_et.columns]
        trial_col = 'trial' if 'trial' in df_et.columns else [c for c in df_et.columns if 'trial' in c][0]
        g = df_et[df_et[trial_col] == trial].copy()
        if g.empty:
            continue

        canvas_state = np.where(
            (g['_left_canvas'] == 1) & (g['_right_canvas'] == 0), 'L',
            np.where(
                (g['_right_canvas'] == 1) & (g['_left_canvas'] == 0), 'R',
                np.where(
                    (g['_left_canvas'] == 0) & (g['_right_canvas'] == 0), 'N', 'B'
                )
            )
        )
        transitions_idx = np.where(canvas_state[1:] != canvas_state[:-1])[0] + 1
        segment_starts = np.concatenate(([0], transitions_idx))
        segment_ends = np.concatenate((transitions_idx, [len(canvas_state)]))
        for seg_start, seg_end in zip(segment_starts, segment_ends):
            seg = g.iloc[seg_start:seg_end]
            if len(seg) > 0:
                start_time = seg['times'].iloc[0]
                end_time = seg['times'].iloc[-1]
                seg_time = (end_time - start_time) / 1000
                seg_start_s = start_time / 1000
            else:
                seg_time = 0
                seg_start_s = np.nan

            if ((seg['_left_canvas'] == 1) & (seg['_right_canvas'] == 0)).all():
                color = 'cornflowerblue'
                side = 'Left'
            elif ((seg['_right_canvas'] == 1) & (seg['_left_canvas'] == 0)).all():
                color = 'lightgreen'
                side = 'Right'
            elif ((seg['_left_canvas'] == 0) & (seg['_right_canvas'] == 0)).all():
                color = transparent_color
                side = 'None'
            else:
                color = "red"
                side = 'Both'

            row_idx = conditions.index(cond) + 1
            customdata = [[
                participant,
                item_label,
                trial,
                cond,
                side,
                seg_start_s,
                seg_time,
                dominant or 'N/A',
                choice or 'N/A',
                'Yes' if mismatch_flag else 'No'
            ]]
            fig.add_trace(
                go.Bar(
                    x=[seg_time],
                    y=[label],
                    name=side,
                    marker_color=color,
                    orientation='h',
                    showlegend=False,
                    customdata=customdata,
                    hovertemplate=(
                        "<b>%{customdata[0]}</b><br>"
                        "Condition: %{customdata[3]}<br>"
                        "Item: %{customdata[1]}<br>"
                        "Trial: %{customdata[2]}<br>"
                        "Side: %{customdata[4]}<br>"
                        "Segment start: %{customdata[5]:.2f}s<br>"
                        "Segment duration: %{x:.2f}s<br>"
                        "Dominant: %{customdata[7]}<br>"
                        "Choice: %{customdata[8]}<br>"
                        "Mismatch: %{customdata[9]}<extra></extra>"
                    )
                ),
                row=row_idx, col=1
            )

# Format y-axis ticks per condition
for i, cond in enumerate(conditions, start=1):
    labels = facet_yticks[cond]

    def format_label(lbl):
        meta = facet_tick_meta[cond].get(lbl, {})
        pid_short = meta.get('pid_short', 'NA')
        choice_val = meta.get('choice', 'NA')
        base = f"{pid_short} · {choice_val}"
        bold = meta.get('dominant_matches', False)
        mismatch = meta.get('mismatch', False)

        if bold:
            base = f"<b>{base}</b>"
        if mismatch:
            inner = f"<span style='color:crimson'>{base}</span>"
        else:
            inner = base
        return inner

    ticktext = [format_label(t) for t in labels]
    fig.update_yaxes(
        categoryorder='array',
        categoryarray=labels,
        tickvals=labels,
        ticktext=ticktext,
        showgrid=False,
        ticks='',
        row=i, col=1
    )

fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)

fig.update_layout(
    template='plotly_white',
    barmode='stack',
    bargap=0.0,
    bargroupgap=0.0,
    title='Eye-Tracking: Stacked Dwell Time Segments (Transitions) per Item',
    height=max(400, 100 * n_rows),
    # width=1000,
    showlegend=True
)

fig.show()

fig.write_html(plots / "dwell_times_per_item.html", include_plotlyjs='cdn')
fig.write_image(plots / "dwell_times_per_item.png", scale=3)

### One participant

In [None]:
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go
# import numpy as np
# import pandas as pd
# import os

# transparent_color = 'hsva(0, 0%, 100%, 0)'

# # Facet by condition into rows (one condition per row)
# conditions = list(longform['condition'].dropna().unique())
# n_rows = max(1, len(conditions))

# fig = make_subplots(
#     rows=n_rows, cols=1,
#     shared_xaxes=True,
#     subplot_titles=conditions,
#     vertical_spacing=0.1
# )

# # Track per-condition ordering and metadata for categorical y-axes
# facet_yticks = {cond: [] for cond in conditions}
# facet_tick_meta = {cond: {} for cond in conditions}

# participants_available = [p for p in et_df['participant_id'].unique() if p not in list_of_participants_to_remove]
# if not participants_available:
#     raise ValueError("No participants available after filtering.")

# # Set the participant ID you want to show (change this value as needed)
# participant_to_show = '5f037b2150aed428f2b3614a'

# # Fallback: if the requested ID is not present, use the first available participant
# if participant_to_show not in participants_available:
#     print(f"Requested participant {participant_to_show!r} not found. Showing first available participant {participants_available[0]!r} instead.")
#     participant_to_show = participants_available[0]

# for participant in [participant_to_show]:
#     df_part = et_df[et_df['participant_id'] == participant]
#     sorted_trials = sorted(
#         df_part['trial'].unique(),
#         key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x
#     )
#     for trial in sorted_trials:
#         lf_row = longform[(longform['participant_id'] == participant) & (longform['trial'] == trial)]
#         if lf_row.empty:
#             continue
#         cond = lf_row['condition'].iloc[0]
#         if pd.isna(cond) or cond not in conditions:
#             continue

#         if 'item' in lf_row.columns and pd.notna(lf_row['item'].iloc[0]):
#             item_label = lf_row['item'].iloc[0]
#         else:
#             item_label = trial
#         label = f"{participant} - Item {item_label}"
#         if label not in facet_yticks[cond]:
#             facet_yticks[cond].append(label)

#         et_row = df_part[df_part['trial'] == trial]
#         dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
#         choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

#         mismatch_flag = False
#         if 'mismatch' in lf_row.columns:
#             try:
#                 mismatch_flag = bool(lf_row['mismatch'].iloc[0])
#             except Exception:
#                 mismatch_flag = False

#         facet_tick_meta[cond][label] = {
#             'pid_short': str(participant)[:5],
#             'choice': choice if pd.notna(choice) else 'NA',
#             'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
#             'mismatch': mismatch_flag
#         }

#         et_file = os.path.join(et_dir, f"{participant}.csv")
#         if not os.path.exists(et_file):
#             continue
#         df_et = pd.read_csv(et_file)
#         df_et.columns = [c.lower() for c in df_et.columns]
#         trial_col = 'trial' if 'trial' in df_et.columns else [c for c in df_et.columns if 'trial' in c][0]
#         g = df_et[df_et[trial_col] == trial].copy()
#         if g.empty:
#             continue

#         canvas_state = np.where(
#             (g['_left_canvas'] == 1) & (g['_right_canvas'] == 0), 'L',
#             np.where(
#                 (g['_right_canvas'] == 1) & (g['_left_canvas'] == 0), 'R',
#                 np.where(
#                     (g['_left_canvas'] == 0) & (g['_right_canvas'] == 0), 'N', 'B'
#                 )
#             )
#         )
#         transitions_idx = np.where(canvas_state[1:] != canvas_state[:-1])[0] + 1
#         segment_starts = np.concatenate(([0], transitions_idx))
#         segment_ends = np.concatenate((transitions_idx, [len(canvas_state)]))
#         for seg_start, seg_end in zip(segment_starts, segment_ends):
#             seg = g.iloc[seg_start:seg_end]
#             if len(seg) > 0:
#                 start_time = seg['times'].iloc[0]
#                 end_time = seg['times'].iloc[-1]
#                 seg_time = (end_time - start_time) / 1000
#                 seg_start_s = start_time / 1000
#             else:
#                 seg_time = 0
#                 seg_start_s = np.nan

#             if ((seg['_left_canvas'] == 1) & (seg['_right_canvas'] == 0)).all():
#                 color = 'cornflowerblue'
#                 side = 'Left'
#             elif ((seg['_right_canvas'] == 1) & (seg['_left_canvas'] == 0)).all():
#                 color = 'lightgreen'
#                 side = 'Right'
#             elif ((seg['_left_canvas'] == 0) & (seg['_right_canvas'] == 0)).all():
#                 color = transparent_color
#                 side = 'None'
#             else:
#                 color = "red"
#                 side = 'Both'

#             row_idx = conditions.index(cond) + 1
#             customdata = [[
#                 participant,
#                 item_label,
#                 trial,
#                 cond,
#                 side,
#                 seg_start_s,
#                 seg_time,
#                 dominant or 'N/A',
#                 choice or 'N/A',
#                 'Yes' if mismatch_flag else 'No'
#             ]]
#             fig.add_trace(
#                 go.Bar(
#                     x=[seg_time],
#                     y=[label],
#                     name=side,
#                     marker_color=color,
#                     orientation='h',
#                     showlegend=False,
#                     customdata=customdata,
#                     hovertemplate=(
#                         "<b>%{customdata[0]}</b><br>"
#                         "Condition: %{customdata[3]}<br>"
#                         "Item: %{customdata[1]}<br>"
#                         "Trial: %{customdata[2]}<br>"
#                         "Side: %{customdata[4]}<br>"
#                         "Segment start: %{customdata[5]:.2f}s<br>"
#                         "Segment duration: %{x:.2f}s<br>"
#                         "Dominant: %{customdata[7]}<br>"
#                         "Choice: %{customdata[8]}<br>"
#                         "Mismatch: %{customdata[9]}<extra></extra>"
#                     )
#                 ),
#                 row=row_idx, col=1
#             )

# # Format y-axis ticks per condition
# for i, cond in enumerate(conditions, start=1):
#     labels = facet_yticks[cond]

#     def format_label(lbl):
#         meta = facet_tick_meta[cond].get(lbl, {})
#         pid_short = meta.get('pid_short', 'NA')
#         choice_val = meta.get('choice', 'NA')
#         base = f"{pid_short} · {choice_val}"
#         bold = meta.get('dominant_matches', False)
#         mismatch = meta.get('mismatch', False)

#         if bold:
#             base = f"<b>{base}</b>"
#         if mismatch:
#             inner = f"<span style='color:crimson'>{base}</span>"
#         else:
#             inner = base
#         return inner

#     ticktext = [format_label(t) for t in labels]
#     fig.update_yaxes(
#         categoryorder='array',
#         categoryarray=labels,
#         tickvals=labels,
#         ticktext=ticktext,
#         showgrid=False,
#         ticks='',
#         row=i, col=1
#     )

# fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)

# fig.update_layout(
#     template='plotly_white',
#     barmode='stack',
#     bargap=0.0,
#     bargroupgap=0.0,
#     title='Eye-Tracking: Stacked Dwell Time Segments (Transitions) per Participant',
#     height=max(400, 150 * n_rows),
#     # width=1000,
#     showlegend=True
# )

# fig.show()

# fig.write_html(plots / "dwell_times_per_part.html", include_plotlyjs='cdn')
# fig.write_image(plots / "dwell_times_per_part.png", scale=3)

In [44]:
def plot_participant_dwell(participant_id,
                           longform_df=None,
                           et_df_df=None,
                           et_dir_path=None,
                           plots_path=None,
                           transparent_color='hsva(0, 0%, 100%, 0)'):
    """
    Plot stacked dwell-time segments for a single participant and save results.
    Returns the Plotly Figure.
    """
    # fall back to globals if not provided
    lf = longform_df if longform_df is not None else globals().get('longform')
    et = et_df_df if et_df_df is not None else globals().get('et_df')
    et_dir_local = et_dir_path if et_dir_path is not None else globals().get('et_dir')
    plots_dir = plots_path if plots_path is not None else globals().get('plots')

    if lf is None or et is None or et_dir_local is None or plots_dir is None:
        raise ValueError("Provide longform_df, et_df_df, et_dir_path and plots_path or ensure globals exist.")

    conditions = list(lf['condition'].dropna().unique())
    n_rows = max(1, len(conditions))

    fig = make_subplots(rows=n_rows, cols=1, shared_xaxes=True, subplot_titles=conditions, vertical_spacing=0.1)

    facet_yticks = {cond: [] for cond in conditions}
    facet_tick_meta = {cond: {} for cond in conditions}

    participants_available = [p for p in et['participant_id'].unique() if p not in globals().get('list_of_participants_to_remove', [])]
    if participant_id not in participants_available:
        raise ValueError(f"Participant id {participant_id!r} not available. Available ids: {participants_available[:10]}")

    df_part = et[et['participant_id'] == participant_id]
    sorted_trials = sorted(df_part['trial'].unique(), key=lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else x)

    for trial in sorted_trials:
        lf_row = lf[(lf['participant_id'] == participant_id) & (lf['trial'] == trial)]
        if lf_row.empty:
            continue
        cond = lf_row['condition'].iloc[0]
        if pd.isna(cond) or cond not in conditions:
            continue

        item_label = lf_row['item'].iloc[0] if ('item' in lf_row.columns and pd.notna(lf_row['item'].iloc[0])) else trial
        label = f"{participant_id} - Item {item_label}"
        if label not in facet_yticks[cond]:
            facet_yticks[cond].append(label)

        et_row = df_part[df_part['trial'] == trial]
        dominant = et_row['dominant'].iloc[0] if not et_row.empty else None
        choice = lf_row['choice'].iloc[0] if not lf_row.empty else None

        mismatch_flag = False
        if 'mismatch' in lf_row.columns:
            try:
                mismatch_flag = bool(lf_row['mismatch'].iloc[0])
            except Exception:
                mismatch_flag = False

        facet_tick_meta[cond][label] = {
            'pid_short': str(participant_id)[:5],
            'choice': choice if pd.notna(choice) else 'NA',
            'dominant_matches': (dominant is not None and choice is not None and dominant == choice),
            'mismatch': mismatch_flag
        }

        et_file = os.path.join(et_dir_local, f"{participant_id}.csv")
        if not os.path.exists(et_file):
            continue
        df_et = pd.read_csv(et_file)
        df_et.columns = [c.lower() for c in df_et.columns]
        trial_col = 'trial' if 'trial' in df_et.columns else [c for c in df_et.columns if 'trial' in c][0]
        g = df_et[df_et[trial_col] == trial].copy()
        if g.empty:
            continue

        canvas_state = np.where(
            (g['_left_canvas'] == 1) & (g['_right_canvas'] == 0), 'L',
            np.where(
                (g['_right_canvas'] == 1) & (g['_left_canvas'] == 0), 'R',
                np.where(
                    (g['_left_canvas'] == 0) & (g['_right_canvas'] == 0), 'N', 'B'
                )
            )
        )
        transitions_idx = np.where(canvas_state[1:] != canvas_state[:-1])[0] + 1
        segment_starts = np.concatenate(([0], transitions_idx))
        segment_ends = np.concatenate((transitions_idx, [len(canvas_state)]))
        for seg_start, seg_end in zip(segment_starts, segment_ends):
            seg = g.iloc[seg_start:seg_end]
            if len(seg) > 0:
                start_time = seg['times'].iloc[0]
                end_time = seg['times'].iloc[-1]
                seg_time = (end_time - start_time) / 1000
                seg_start_s = start_time / 1000
            else:
                seg_time = 0
                seg_start_s = np.nan

            if ((seg['_left_canvas'] == 1) & (seg['_right_canvas'] == 0)).all():
                color = 'cornflowerblue'
                side = 'Left'
            elif ((seg['_right_canvas'] == 1) & (seg['_left_canvas'] == 0)).all():
                color = 'lightgreen'
                side = 'Right'
            elif ((seg['_left_canvas'] == 0) & (seg['_right_canvas'] == 0)).all():
                color = transparent_color
                side = 'None'
            else:
                color = "red"
                side = 'Both'

            row_idx = conditions.index(cond) + 1
            customdata = [[
                participant_id,
                item_label,
                trial,
                cond,
                side,
                seg_start_s,
                seg_time,
                dominant or 'N/A',
                choice or 'N/A',
                'Yes' if mismatch_flag else 'No'
            ]]
            fig.add_trace(
                go.Bar(
                    x=[seg_time],
                    y=[label],
                    name=side,
                    marker_color=color,
                    orientation='h',
                    showlegend=False,
                    customdata=customdata,
                    hovertemplate=(
                        "<b>%{customdata[0]}</b><br>"
                        "Condition: %{customdata[3]}<br>"
                        "Item: %{customdata[1]}<br>"
                        "Trial: %{customdata[2]}<br>"
                        "Side: %{customdata[4]}<br>"
                        "Segment start: %{customdata[5]:.2f}s<br>"
                        "Segment duration: %{x:.2f}s<br>"
                        "Dominant: %{customdata[7]}<br>"
                        "Choice: %{customdata[8]}<br>"
                        "Mismatch: %{customdata[9]}<extra></extra>"
                    )
                ),
                row=row_idx, col=1
            )

    # Format y-axis ticks per condition
    for i, cond in enumerate(conditions, start=1):
        labels = facet_yticks[cond]

        def format_label(lbl):
            meta = facet_tick_meta[cond].get(lbl, {})
            pid_short = meta.get('pid_short', 'NA')
            choice_val = meta.get('choice', 'NA')
            base = f"{pid_short} · {choice_val}"
            bold = meta.get('dominant_matches', False)
            mismatch = meta.get('mismatch', False)

            if bold:
                base = f"<b>{base}</b>"
            if mismatch:
                inner = f"<span style='color:crimson'>{base}</span>"
            else:
                inner = base
            return inner

        ticktext = [format_label(t) for t in labels]
        fig.update_yaxes(
            categoryorder='array',
            categoryarray=labels,
            tickvals=labels,
            ticktext=ticktext,
            showgrid=False,
            ticks='',
            row=i, col=1
        )

    fig.update_xaxes(title_text='Seconds', row=n_rows, col=1)

    fig.update_layout(
        template='plotly_white',
        barmode='stack',
        bargap=0.0,
        bargroupgap=0.0,
        title=f'Eye-Tracking: Stacked Dwell Time Segments (Participant {participant_id})',
        height=max(400, 150 * n_rows),
        showlegend=True
    )

    # show and save
    fig.show()
    out_html = plots_dir / f"dwell_times_{participant_id}.html"
    out_png = plots_dir / f"dwell_times_{participant_id}.png"
    fig.write_html(out_html, include_plotlyjs='cdn')
    try:
        fig.write_image(out_png, scale=3)
    except Exception:
        # image export may require kaleido installed
        pass

    return fig

# Example usage:
fig = plot_participant_dwell('60fd4fbf1a3f1446d576e5d4')