In [None]:
import re
import glob
import pandas as pd
import os

def process_antisaccade_messages(msg_file):
    """
    Extract antisaccade intervals from a gaze messages file.

    Each usable line has the form "<timestamp> <message>"; lines that are
    blank, have no message part, or whose first token is not a float are
    skipped. Messages are lower-cased before matching, so matching is
    case-insensitive and captured values come from the lower-cased text.

    An interval opens at a message containing "anti_saccade_task_start" and
    closes at the next message containing "anti_saccade_task_end". Between
    the two, the following are captured:
      - time_delay: the token after "time delay stimulus=" (kept as a
        string, e.g. "291/240")
      - X, Y: the two comma-separated values inside
        "placement position=(x, y)" (kept as strings)

    An interval is kept only if start, end, time_delay, X and Y were all
    seen; incomplete intervals are dropped silently.

    Parameters:
        msg_file: path of the messages text file.

    Returns:
        A pandas DataFrame with one row per complete interval and the
        columns start, end, time_delay, X, Y, always in that order. The
        columns are present even when no interval was found.
    """
    # Fixed schema: keeps the column order stable and guarantees the columns
    # exist when the file contains no complete interval.
    columns = ["start", "end", "time_delay", "X", "Y"]

    # Compiled once instead of being looked up again for every line.
    delay_re = re.compile(r"time delay stimulus=([^\s]+)")
    position_re = re.compile(r"placement position=\(([^)]+)\)")

    data = []
    current_interval = {}

    with open(msg_file, 'r') as f:
        for line in f:
            # Split the line into timestamp and message; blank lines and
            # lines without a message yield fewer than two parts.
            parts = line.split(maxsplit=1)
            if len(parts) < 2:
                continue
            try:
                ts = float(parts[0])
            except ValueError:
                continue
            msg = parts[1].strip().lower()

            if "anti_saccade_task_start" in msg:
                # A new start discards any interval that was never closed.
                current_interval = {"start": ts}
            # Parameters are only recorded once an interval has started.
            elif "time delay stimulus=" in msg and current_interval:
                m = delay_re.search(msg)
                if m:
                    current_interval["time_delay"] = m.group(1)
            elif "placement position=" in msg and current_interval:
                m = position_re.search(msg)
                if m:
                    # e.g. "-0.65, 0" -> ["-0.65", "0"]
                    pos_parts = [x.strip() for x in m.group(1).split(',')]
                    if len(pos_parts) == 2:
                        current_interval["X"], current_interval["Y"] = pos_parts
            elif "anti_saccade_task_end" in msg:
                current_interval["end"] = ts
                # Save the interval only if all required fields are present
                # (an end without a preceding start fails this check).
                if all(key in current_interval for key in columns):
                    data.append(current_interval)
                current_interval = {}

    return pd.DataFrame(data, columns=columns)

def process_antisaccade_messages_with_participant(msg_file):
    """
    Parse the antisaccade intervals of a message file and tag them with the
    participant identifier.

    The identifier is the first "P" followed by digits found in the file's
    base name (directories are ignored); when the name contains no such
    token, the 'participants' column is filled with an empty string.

    Returns the interval DataFrame with an added 'participants' column.
    """
    intervals = process_antisaccade_messages(msg_file)
    id_match = re.search(r"(P\d+)", os.path.basename(msg_file))
    intervals["participants"] = id_match.group(1) if id_match else ""
    return intervals

def process_calibrated_file(cal_file):
    """
    Load a calibrated gaze data file into a DataFrame.

    The file is read as header-less, whitespace-separated text and must
    contain exactly five columns, which are named Time, calX, calY, rawX
    and rawY. 'Time' is converted to a numeric dtype.

    A 'participants' column is added, holding the first "P" followed by
    digits found in the file's base name, or an empty string when the name
    contains no such token.
    """
    gaze = pd.read_csv(cal_file, sep=r"\s+", header=None, engine='python')
    gaze.columns = ['Time', 'calX', 'calY', 'rawX', 'rawY']
    gaze['Time'] = pd.to_numeric(gaze['Time'])

    id_match = re.search(r"(P\d+)", os.path.basename(cal_file))
    gaze["participants"] = id_match.group(1) if id_match else ""
    return gaze

def merge_antisaccade_intervals_with_calibrated(cal_file, msg_file, max_intervals=10):
    """
    Merge calibrated gaze data with antisaccade intervals for one participant.

    For each antisaccade interval (from msg_file), this function selects all
    calibrated gaze rows (from cal_file) whose Time lies within
    [start, end] (both bounds inclusive) and annotates them with the
    interval's time_delay, X and Y plus an iteration number
    ("interval_num").

    "interval_num" is the 1-based position of the interval in the message
    file; intervals that match no gaze rows are skipped but still consume
    their number, so the numbering can have gaps.

    Parameters:
        cal_file: path of the calibrated gaze data file.
        msg_file: path of the messages file.
        max_intervals: number of leading intervals to process (default 10,
            the limit that was previously hard-coded). Pass None to process
            every interval.

    Returns:
        A DataFrame with the merged rows, or an empty DataFrame (without
        columns) when no gaze row falls inside any processed interval.

    Raises:
        ValueError: if max_intervals is negative.
    """
    if max_intervals is not None and max_intervals < 0:
        raise ValueError("max_intervals must be non-negative or None")

    df_cal = process_calibrated_file(cal_file)
    df_intervals = process_antisaccade_messages_with_participant(msg_file)
    if max_intervals is not None:
        # Positional slice: keeps the first `max_intervals` intervals.
        df_intervals = df_intervals.iloc[:max_intervals]

    times = df_cal["Time"]
    merged_rows = []
    for i, interval in enumerate(df_intervals.itertuples(), start=1):
        in_window = (times >= interval.start) & (times <= interval.end)
        if not in_window.any():
            continue
        # Copy so the annotation columns are not written into df_cal.
        df_interval = df_cal.loc[in_window].copy()
        df_interval["time_delay"] = interval.time_delay
        df_interval["X"] = interval.X
        df_interval["Y"] = interval.Y
        df_interval["interval_num"] = i
        merged_rows.append(df_interval)

    if merged_rows:
        return pd.concat(merged_rows, ignore_index=True)
    return pd.DataFrame()

def main():
    """
    Merge every participant's calibrated gaze file with its message file.

    Calibrated files and message files are paired through the participant
    id ("P" followed by digits) in their base names. Files without an id
    are ignored; a calibrated file whose participant has no message file is
    reported on stdout and skipped. All non-empty per-participant results
    are concatenated, sorted by participant and Time, written to a CSV file
    and previewed on stdout.
    """
    participant_re = re.compile(r"(P\d+)")

    # Collect calibrated gaze files and message files.
    calibrated_paths = glob.glob("../../tiny_data/gaze_data_v6/gaze_directions_calibrated_*.txt")
    message_paths = glob.glob("../../tiny_data/gaze_data_v6/gaze_messages_*.txt")

    # Participant id -> message file; a later file with the same id replaces
    # an earlier one.
    messages_by_participant = {}
    for path in message_paths:
        found = participant_re.search(os.path.basename(path))
        if found is None:
            continue
        messages_by_participant[found.group(1)] = path

    frames = []
    for path in calibrated_paths:
        found = participant_re.search(os.path.basename(path))
        if found is None:
            continue
        pid = found.group(1)
        if pid not in messages_by_participant:
            print(f"No message file found for participant {pid}")
            continue
        merged = merge_antisaccade_intervals_with_calibrated(path, messages_by_participant[pid])
        if not merged.empty:
            frames.append(merged)

    if not frames:
        print("No data merged.")
        return

    combined = pd.concat(frames, ignore_index=True)
    combined = combined.sort_values(["participants", "Time"]).reset_index(drop=True)
    combined.to_csv("../../processed_data/processed_data_tiny_task_anti_saccades_all.csv", index=False)
    print("Merged Antisaccade Data:")
    print(combined.head(20))

# Run the merge pipeline only when this file is executed as a script
# (or as a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Merged Antisaccade Data:
            Time      calX      calY    rawX   rawY participants time_delay  \
0   1.740664e+09  -1596.37   2516.22  210.52  13.75         P004    291/240   
1   1.740664e+09  -2207.79   3049.99  210.45  14.09         P004    291/240   
2   1.740664e+09  -1606.83   2529.08  210.51  13.75         P004    291/240   
3   1.740664e+09  -2388.44   3202.37  210.44  14.18         P004    291/240   
4   1.740664e+09  -1466.40   2429.98  210.48  13.63         P004    291/240   
5   1.740664e+09  -5795.32   6829.38  209.85  14.46         P004    291/240   
6   1.740664e+09  22032.92 -25651.55  209.06  14.29         P004    291/240   
7   1.740664e+09   6085.96  -6280.74  208.85  14.59         P004    291/240   
8   1.740664e+09  20650.87 -24210.18  209.04  14.27         P004    291/240   
9   1.740664e+09   7092.88  -7691.29  208.86  14.44         P004    291/240   
10  1.740664e+09  -9844.42  12067.90  209.45  14.24         P004    291/240   
11  1.740664e+09 -10075.06 