# Parse PCIbex behavioral results into a clean DataFrame

This notebook parses the messy results file `results_dev.csv` and produces an event-level DataFrame with:

- participant_id, group, label (block), no, item, condition, cb, left, right
- Eye-tracker events (calibration and filename) with timestamps
- Self-paced reading region events (r1..r7) with per-region RTs
- Optional question RTs and choice selection (side + RT)

You can adapt the parsing rules here if your logging format changes.

In [1]:
# Imports and paths
import re
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path(r"c:\\Users\\parti\\Projects\\pcibex-hun")
RAW_FILE = ROOT / "results_dev.csv"

pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 50)

print("Will parse:", RAW_FILE)

Will parse: c:\Users\parti\Projects\pcibex-hun\results_dev.csv


In [2]:
# Read file, split comments vs data
raw_lines = RAW_FILE.read_text(encoding="utf-8").splitlines()

header_comments = []
rows = []
for line in raw_lines:
    if line.startswith("#"):
        header_comments.append(line)
    elif line.strip():
        rows.append(line)

print(f"Comment lines: {len(header_comments)} | Data rows: {len(rows)}")
rows[:3]

Comment lines: 132 | Data rows: 359


['1758265706,7270d3743b7440ead8140ae1e2cf0255,PennController,0,0,welcome,NULL,PennController,0,_Trial_,Start,1758265522763,prolific_id,NULL',
 '1758265706,7270d3743b7440ead8140ae1e2cf0255,PennController,0,0,welcome,NULL,PennController,0,_Header_,Start,1758265522763,prolific_id,NULL',
 '1758265706,7270d3743b7440ead8140ae1e2cf0255,PennController,0,0,welcome,NULL,PennController,0,_Header_,End,1758265522763,prolific_id,NULL']

In [3]:
# Extract global metadata from the header comments
import datetime as dt

from sympy import Add

meta = {
    "results_on": None,
    "user_agent": None,
    "design_number": None,
}

for c in header_comments:
    if c.startswith("# Results on "):
        meta["results_on"] = c.replace("# Results on ", "").strip()
    elif c.startswith("# USER AGENT:"):
        meta["user_agent"] = c.replace("# USER AGENT:", "").strip()
    elif "Design number" in c:
        m = re.search(r"Design number .*?=\s*(\d+)", c)
        if m:
            meta["design_number"] = int(m.group(1))

meta

# Add these values to the end?

{'results_on': 'Fri, 19 Sep 2025 14:45:34 GMT',
 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
 'design_number': 20}

## Functions

## Events dataframe

In [4]:
# Define the base schema described by comments just before each block
base_cols = [
    "ResultsTime", "MD5", "Controller", "Order", "Inner", "Label",
    "LatinSquare", "PennElementType", "PennElementName", "Parameter",
    "Value", "EventTime", "prolific_id",
]

# Do NOT create "extraN" columns. Instead, infer the appended (trial) column names
# from the header comments, then map the trailing fields directly to those names.
import re

def extract_trial_fields_from_comments(comments):
    # We expect something like a line that mentions these names.
    allowed = ["participant_id", "group", "no", "item", "condition", "cb", "left", "right"]
    pattern = re.compile(r"(participant_id|group|no|item|condition|cb|left|right)", re.I)
    best_line = None
    max_hits = 0
    for c in comments:
        hits = len(pattern.findall(c))
        if hits > max_hits:
            best_line = c
            max_hits = hits
    if best_line and max_hits >= 6:
        # Try to parse comma-separated tokens after a ':' if present
        text = best_line.split(":", 1)[-1]
        tokens = [t.strip() for t in text.split(",")]
        def norm(s):
            return re.sub(r"[^A-Za-z0-9_]", "", s).lower()
        names = []
        for t in tokens:
            n = norm(t)
            if n in allowed and n not in names:
                names.append(n)
        # Preserve canonical order
        ordered = [name for name in allowed if name in names]
        if len(ordered) >= 6:
            return ordered
    # Fallback to the canonical expected list
    return allowed

TRIAL_FIELDS = extract_trial_fields_from_comments(header_comments)
print("Using trial fields:", TRIAL_FIELDS)

# Parse each data row into a record using base_cols + TRIAL_FIELDS depending on Label

def parse_row_to_record(line: str):
    parts = [p.strip() for p in line.split(",")]
    rec = {}
    base_vals = parts[:len(base_cols)]
    extra_vals = parts[len(base_cols):]

    for k, v in zip(base_cols, base_vals):
        rec[k] = v

    label = rec.get("Label")
    if label in ("practice", "experiment"):
        n = min(len(extra_vals), len(TRIAL_FIELDS))
        for k, v in zip(TRIAL_FIELDS[:n], extra_vals[:n]):
            rec[k] = v
    elif label in ("participant_data",):
        if extra_vals:
            rec["participant_id"] = extra_vals[0]

    return rec

records = [parse_row_to_record(l) for l in rows]
raw_df = pd.DataFrame.from_records(records)

# Cast some known numeric columns where possible
for c in ["ResultsTime", "Order", "Inner", "EventTime", "no", "item"]:
    if c in raw_df.columns:
        raw_df[c] = pd.to_numeric(raw_df[c], errors="coerce")

# Human-readable timestamps
raw_df["results_time"] = pd.to_datetime(raw_df["ResultsTime"], unit="s", utc=True)

# IMPORTANT: Keep EventTime both as numeric milliseconds and as a proper timestamp (UTC)
# - EventTime_ms: numeric milliseconds for computations (diffs, means)
# - event_time: Pandas Timestamp (UTC) for readability
raw_df["event_time_ms"] = raw_df["EventTime"]
raw_df["event_time"] = pd.to_datetime(raw_df["event_time_ms"], unit="ms", utc=True)

################################################

# Helper: derive per-trial fields now that we named extras explicitly in raw_df
# We keep a light decoder only to ensure participant_id is present and to normalize types.

EXPECTED_FIELDS = ["participant_id", "group", "no", "item", "condition", "cb", "left", "right"]

# Make sure that item and no are integers
# Make sure that item and no are integers
for col in ["no", "item"]:
    if col in raw_df.columns:
        # Use pandas nullable Int64 type to keep NaNs and force integer dtype
        raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce").astype('Int64')
        
# Ensure all expected columns exist even if missing in some rows
for col in EXPECTED_FIELDS:
    if col not in raw_df.columns:
        raw_df[col] = None

# Build df and forward fill participant_id/group as before
df = raw_df.copy()

# Forward-fill participant_id only within blocks of identical participant_id (no cross-over)
if "participant_id" in df.columns:
    mask_pid = df["participant_id"].notna()
    df.loc[mask_pid, "participant_id"] = df.loc[mask_pid, "participant_id"]
    # No fill for missing participant_id, keep as NaN

# Backward-fill group only within blocks of identical participant_id (no cross-over)
if "group" in df.columns and "participant_id" in df.columns:
    def safe_bfill_group(subdf):
        subdf["group"] = subdf["group"].replace({"": None, "NULL": None}).bfill()
        return subdf
    df = df.groupby("participant_id", group_keys=False).apply(safe_bfill_group)

# Derive block-type flags
# df["is_practice"] = df["Label"].eq("practice")
# df["is_experiment"] = df["Label"].eq("experiment")

# For convenience: also include a local-time copy if desired (commented)
# df["results_timestamp_local"] = df["results_timestamp"].dt.tz_convert("Europe/Budapest")

# Drop Controller column
df.drop(columns=["ResultsTime", "Controller", "Inner", "LatinSquare", "EventTime", "prolific_id"], inplace=True)

# Add simple elapsed time between events: current row EventTime_ms minus previous row EventTime_ms
if 'elapsed_ms' in df.columns:
    df.drop(columns=['elapsed_ms'], inplace=True)
df['elapsed_ms'] = df['event_time_ms'].diff()
df.drop(columns=['event_time_ms'], inplace=True)

# # Drop rows where the Parameter column starts and ends with '_'
# df = df[~df["Parameter"].str.match(r"^_.*_$", na=False)]

# # Keep only rows where Label is practice or experiment
# df = df[df["Label"].isin(["practice", "experiment"])] 

# In Values column rename right_canvas_practice to right_canvas
df["Value"] = df["Value"].replace({"right_canvas_practice": "right_canvas"})
df["Value"] = df["Value"].replace({"left_canvas_practice": "left_canvas"})

df

Using trial fields: ['participant_id', 'group', 'no', 'item', 'condition', 'cb', 'left', 'right']


  df = df.groupby("participant_id", group_keys=False).apply(safe_bfill_group)


Unnamed: 0,MD5,Order,Label,PennElementType,PennElementName,Parameter,Value,participant_id,group,no,item,condition,cb,left,right,results_time,event_time,elapsed_ms
14,7270d3743b7440ead8140ae1e2cf0255,4,participant_data,PennController,3,_Trial_,Start,Yun999,a,,,,,,,2025-09-19 07:08:26+00:00,2025-09-19 07:06:09.379000+00:00,
15,7270d3743b7440ead8140ae1e2cf0255,4,participant_data,PennController,3,_Header_,Start,Yun999,a,,,,,,,2025-09-19 07:08:26+00:00,2025-09-19 07:06:09.379000+00:00,0.0
16,7270d3743b7440ead8140ae1e2cf0255,4,participant_data,PennController,3,_Header_,End,Yun999,a,,,,,,,2025-09-19 07:08:26+00:00,2025-09-19 07:06:09.379000+00:00,0.0
17,7270d3743b7440ead8140ae1e2cf0255,4,participant_data,TextInput,input_participant_id,EnterReturn,Yun999,Yun999,a,,,,,,,2025-09-19 07:08:26+00:00,2025-09-19 07:06:17.475000+00:00,8096.0
18,7270d3743b7440ead8140ae1e2cf0255,4,participant_data,TextInput,input_participant_id,Final,Yun999,Yun999,a,,,,,,,2025-09-19 07:08:26+00:00,2025-09-19 07:06:22.547000+00:00,5072.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,7270d3743b7440ead8140ae1e2cf0255,10,experiment,Key,r5,PressedKey,,Yun777,a,8,3,focus,y,3b,3a,2025-09-19 14:45:34+00:00,2025-09-19 14:45:29.331000+00:00,163.0
355,7270d3743b7440ead8140ae1e2cf0255,10,experiment,Key,r6,PressedKey,,Yun777,a,8,3,focus,y,3b,3a,2025-09-19 14:45:34+00:00,2025-09-19 14:45:29.493000+00:00,162.0
356,7270d3743b7440ead8140ae1e2cf0255,10,experiment,Key,r7,PressedKey,,Yun777,a,8,3,focus,y,3b,3a,2025-09-19 14:45:34+00:00,2025-09-19 14:45:29.760000+00:00,267.0
357,7270d3743b7440ead8140ae1e2cf0255,10,experiment,Selector,choice_selector,Selection,right_canvas,Yun777,a,8,3,focus,y,3b,3a,2025-09-19 14:45:34+00:00,2025-09-19 14:45:35.032000+00:00,5272.0


In [5]:
def extract_participant_info(events_df, fallback_df=None):
    # Extract participant_id, results_time, and EyeTracker filename
    pid = events_df["participant_id"].dropna().astype(str).iloc[0] if "participant_id" in events_df.columns and events_df["participant_id"].notna().any() else None
    results_time = events_df["results_time"].iloc[0] if "results_time" in events_df.columns and len(events_df) else None

    # Exact requirement: Value where PennElementType == 'EyeTracker' and Parameter == 'Filename'
    def find_et_filename(df_like):
        if df_like is None or not {"PennElementType", "Parameter", "Value"}.issubset(df_like.columns):
            return None
        pet = df_like["PennElementType"].astype(str).str.lower()
        par = df_like["Parameter"].astype(str).str.lower()
        mask = pet.eq("eyetracker") & par.eq("filename")
        vals = df_like.loc[mask, "Value"].dropna().astype(str)
        return vals.iloc[0] if len(vals) > 0 else None

    et = find_et_filename(events_df)
    if et is None and fallback_df is not None:
        et = find_et_filename(fallback_df)

    return pd.DataFrame([
        {
            "participant_id": pid,
            "eyetracker_filename": et,
            "results_time": results_time,
        }
    ])

# Build participants_df for all participants (uses raw_df as fallback to find filename)
parts = []
df_nonnull = df[df['participant_id'].notna()]
for pid, g in df_nonnull.groupby('participant_id', sort=False):
    parts.append(extract_participant_info(g, fallback_df=raw_df))
participants_df = pd.concat(parts, ignore_index=True)
participants_df

# Use df and provide raw_df as fallback
# participants_df = extract_participant_info(df)

Unnamed: 0,participant_id,eyetracker_filename,results_time
0,Yun999,httpsfarmpcibexnetrDigcCS/2d0ba9c9-ad29-63f2-3...,2025-09-19 07:08:26+00:00
1,Yun888,httpsfarmpcibexnetrDigcCS/473c6111-45c4-d323-3...,2025-09-19 07:33:47+00:00
2,Yun777,httpsfarmpcibexnetrDigcCS/79bf507e-c7d4-f50d-b...,2025-09-19 14:45:34+00:00


In [6]:
import pandas as pd
import requests
from io import StringIO

def download_and_save_eyetracking_data():
    out_dir = 'eyetracking_data'
    eturl = "https://mondo1.dreamhosters.com/script.php?experiment="
    for _, row in participants_df.iterrows():
        et_filename = row['eyetracker_filename']
        participant_id = row['participant_id']
        if pd.notnull(et_filename) and pd.notnull(participant_id):
            et_file = eturl + et_filename
            try:
                r = requests.get(et_file, timeout=15)
                r.raise_for_status()
                df_et = pd.read_csv(StringIO(r.text))
                df_et.to_csv(f"{out_dir}/{participant_id}.csv", index=False)
            except Exception as e:
                print(f"Failed for {participant_id}: {e}")
    print("Done!")
# Example usage:
download_and_save_eyetracking_data()

Done!


## Create a long-form events DataFrame

In [7]:
# Filter for experiment/practice trials only
df_trials = df[df['Label'].isin(['experiment', 'practice'])].copy()

# Sort for deterministic grouping
df_trials = df_trials.sort_values(['MD5', 'participant_id', 'no', 'item'])

# Region RTs: r1..r7
region_names = [f"r{i}" for i in range(1, 8)]
is_region = df_trials['PennElementName'].str.lower().isin(region_names) & df_trials['Parameter'].str.lower().eq('pressedkey')
regions = df_trials[is_region].copy()
regions['region_idx'] = regions['PennElementName'].str.extract(r'r(\d)').astype(int)

# Build a full trial index to ensure all trials are present
trial_index_cols = ['MD5', 'participant_id', 'group', 'Label', 'no', 'item', 'cb', 'left', 'right']
trial_index = df_trials.drop_duplicates(subset=trial_index_cols)[trial_index_cols].sort_values(trial_index_cols)

# Compute region RTs per trial (item) using average elapsed_ms
region_pivot = regions.pivot_table(
    index=trial_index_cols,
    columns='region_idx',
    values='elapsed_ms',
    aggfunc='mean',
    fill_value=np.nan
    )
region_pivot = region_pivot.reindex(trial_index.set_index(trial_index_cols).index, fill_value=np.nan)
region_pivot = region_pivot.rename(columns={i: f"r{int(i)}" for i in region_pivot.columns})

# Question RT: average elapsed_ms for question keypress
is_question = df_trials['PennElementName'].str.lower().eq('question') & df_trials['Parameter'].str.lower().eq('pressedkey')
questions = df_trials[is_question].copy()
questions = questions.sort_values(['MD5', 'participant_id', 'no', 'item'])
questions['question_rt'] = questions['elapsed_ms']
question_rt = questions.groupby(trial_index_cols)['question_rt'].mean()

# Choice RT and value: average elapsed_ms for choice selection
is_choice = df_trials['PennElementType'].str.lower().eq('selector') & df_trials['Parameter'].str.lower().eq('selection')
choices = df_trials[is_choice].copy()
choices = choices.sort_values(['MD5', 'participant_id', 'no', 'item'])
choices['choice_rt'] = choices['elapsed_ms']
choices_value = choices.groupby(trial_index_cols)['Value'].first()
choice_rt = choices.groupby(trial_index_cols)['choice_rt'].mean()

# Assemble longform DataFrame
longform = region_pivot.copy()
longform['question_rt'] = question_rt
longform['choice_rt'] = choice_rt
longform['choice_value'] = choices_value

# Substract 3 seconds from every choice_RT
longform['question_rt'] = longform['question_rt'] - 1000
longform['choice_rt'] = longform['choice_rt'] - 3000

longform = longform.reset_index()
longform

region_idx,MD5,participant_id,group,Label,no,item,cb,left,right,r1,r2,r3,r4,r5,r6,r7,question_rt,choice_rt,choice_value
0,7270d3743b7440ead8140ae1e2cf0255,Yun777,a,experiment,8,3,y,3b,3a,604.0,243.0,185.0,192.0,163.0,162.0,267.0,,2272.0,right_canvas
1,7270d3743b7440ead8140ae1e2cf0255,Yun777,a,experiment,51,17,n,17a,17b,914.0,783.0,788.0,696.0,1068.0,1086.0,739.0,,4474.0,right_canvas
2,7270d3743b7440ead8140ae1e2cf0255,Yun777,a,practice,990,0,n,A,B,810.0,559.0,439.0,430.0,478.0,456.0,437.0,,4338.0,right_canvas
3,7270d3743b7440ead8140ae1e2cf0255,Yun777,a,practice,991,0,y,D,C,771.0,1208.0,1372.0,1501.0,1200.0,1026.0,981.0,,1692.0,right_canvas
4,7270d3743b7440ead8140ae1e2cf0255,Yun777,a,practice,992,0,n,E,F,689.0,404.0,328.0,233.0,200.0,197.0,386.0,1212.0,1517.0,left_canvas
5,7270d3743b7440ead8140ae1e2cf0255,Yun888,c,experiment,32,11,y,11b,11a,-4580.0,231.0,182.0,160.0,164.0,360.0,453.0,,2094.0,left_canvas
6,7270d3743b7440ead8140ae1e2cf0255,Yun888,c,experiment,41,14,n,14a,14b,-4622.0,278.0,275.0,206.0,193.0,316.0,336.0,,4127.0,left_canvas
7,7270d3743b7440ead8140ae1e2cf0255,Yun888,c,experiment,99,30,n,tanár,diák,-7706.0,541.0,697.0,435.0,445.0,,480.0,2090.0,664.0,left_canvas
8,7270d3743b7440ead8140ae1e2cf0255,Yun888,c,experiment,113,33,n,burgonya,paradicsom,-6874.0,1095.0,770.0,663.0,698.0,,633.0,,2851.0,right_canvas
9,7270d3743b7440ead8140ae1e2cf0255,Yun888,c,practice,990,0,n,A,B,-4294.0,384.0,304.0,192.0,135.0,130.0,143.0,,4423.0,left_canvas
