In [None]:
import re
import glob
import pandas as pd
import os

def process_free_viewing_messages(msg_file, num_intervals=40, interval_length=3.0):
    """
    Build the table of free viewing sub-intervals for one participant's message file.

    Each non-empty line of the file is expected to look like ``<timestamp> <message>``.
    Lines without a message part, or whose first token is not a number, are skipped.
    The timestamp of the line containing 'free_viewing_experiment_start' anchors the
    intervals; a line containing 'free_viewing_experiment_end' must also be present.
    Matching is case-insensitive, and if a marker occurs more than once the last
    occurrence is used.

    Intervals are laid out back to back from the start timestamp. With the defaults
    this yields 40 intervals of 3 seconds each (120 seconds in total). The end marker
    does not shape the intervals; it is only used for a sanity check, and a warning is
    printed when the recorded span differs from ``num_intervals * interval_length`` by
    more than one interval length.

    Parameters:
      - msg_file: path to the messages file.
      - num_intervals: number of consecutive intervals to generate (default 40).
      - interval_length: length of each interval, in the same unit as the file's
        timestamps (default 3.0).

    Returns a DataFrame with columns:
      - interval_num: 1 to num_intervals
      - start: start timestamp of the interval
      - end: end timestamp of the interval
      - participants: participant id ('P' followed by digits) extracted from the
        filename, or '' when the filename contains none
    An empty DataFrame (no columns) is returned when either boundary marker is missing.
    """
    exp_start = None
    exp_end = None
    with open(msg_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split(maxsplit=1)
            if len(parts) < 2:
                continue
            try:
                ts = float(parts[0])
            except ValueError:
                # Not a "<timestamp> <message>" line (e.g. a header); ignore it.
                continue
            msg = parts[1].strip().lower()
            if "free_viewing_experiment_start" in msg:
                exp_start = ts
            elif "free_viewing_experiment_end" in msg:
                exp_end = ts
    # Check that both boundaries were found.
    if exp_start is None or exp_end is None:
        print(f"Experiment boundaries not found in {msg_file}")
        return pd.DataFrame()

    # The intervals are generated from the start marker alone, so flag recordings
    # whose actual start-to-end span does not match the grid being generated.
    expected_total = num_intervals * interval_length
    recorded_total = exp_end - exp_start
    if abs(recorded_total - expected_total) > interval_length:
        print(
            f"Warning: {msg_file} spans {recorded_total:.3f} between start and end "
            f"markers, expected about {expected_total:.3f}"
        )

    intervals = []
    for i in range(num_intervals):
        start_i = exp_start + i * interval_length
        end_i = start_i + interval_length
        intervals.append({"interval_num": i + 1, "start": start_i, "end": end_i})

    df_intervals = pd.DataFrame(intervals)
    # Add participant id.
    m = re.search(r"(P\d+)", os.path.basename(msg_file))
    df_intervals["participants"] = m.group(1) if m else ""
    return df_intervals

def process_calibrated_file(cal_file):
    """
    Load one whitespace-delimited calibrated gaze file into a DataFrame.

    The file is read without a header row and must contain exactly five columns,
    which are labelled Time, calX, calY, rawX, rawY in that order. 'Time' is
    converted with pd.to_numeric.

    Returns the DataFrame with an extra 'participants' column holding the first
    'P<digits>' token found in the file's base name, or '' when there is none.
    """
    column_names = ['Time', 'calX', 'calY', 'rawX', 'rawY']
    gaze = pd.read_csv(cal_file, sep=r"\s+", header=None, engine='python')
    # Assigning the labels raises if the file does not have exactly five columns.
    gaze.columns = column_names
    gaze['Time'] = pd.to_numeric(gaze['Time'])

    id_match = re.search(r"(P\d+)", os.path.basename(cal_file))
    gaze["participants"] = id_match.group(1) if id_match else ""
    return gaze

def merge_free_viewing_intervals_with_calibrated(cal_file, msg_file):
    """
    Label one participant's calibrated gaze samples with free viewing interval numbers.

    The gaze samples are loaded with process_calibrated_file(cal_file) and the
    interval table with process_free_viewing_messages(msg_file). For every interval,
    the samples whose Time satisfies start <= Time < end are copied and tagged with
    that interval's number in a new 'interval_num' column. Intervals that contain no
    samples are skipped, and samples outside every interval are dropped.

    Returns the tagged samples concatenated in interval order with a fresh index,
    or an empty DataFrame when nothing matched (including when no intervals could
    be extracted from the message file).
    """
    gaze = process_calibrated_file(cal_file)
    windows = process_free_viewing_messages(msg_file)

    labelled_chunks = []
    for window in windows.itertuples():
        # Half-open window: the start is included, the end is excluded.
        in_window = gaze["Time"].ge(window.start) & gaze["Time"].lt(window.end)
        chunk = gaze.loc[in_window].copy()
        if chunk.empty:
            continue
        chunk["interval_num"] = window.interval_num
        labelled_chunks.append(chunk)

    if not labelled_chunks:
        return pd.DataFrame()
    return pd.concat(labelled_chunks, ignore_index=True)

# Pair every calibrated gaze file with its participant's message file, label the
# gaze samples with free viewing interval numbers, and save the combined table.

# Collect calibrated gaze files and free viewing message files.
# glob returns paths in a filesystem-dependent order; sorting keeps the run
# reproducible (in particular, which file is used when a participant has several).
cal_files = sorted(glob.glob("../../tiny_data/gaze_data_v6_free_viewing/gaze_directions_calibrated_*.txt"))
msg_files = sorted(glob.glob("../../tiny_data/gaze_data_v6_free_viewing/gaze_messages_*.txt"))

# Build a mapping from participant id to its message file.
msg_dict = {}
for mf in msg_files:
    m = re.search(r"(P\d+)", os.path.basename(mf))
    if m:
        participant_id = m.group(1)
        if participant_id in msg_dict:
            # The later file (in sorted order) replaces the earlier one.
            print(f"Multiple message files for participant {participant_id}; using {mf}")
        msg_dict[participant_id] = mf

# Merge each calibrated file with the matching message file; files without a
# participant id in their name are skipped silently.
merged_list = []
for cf in cal_files:
    m = re.search(r"(P\d+)", os.path.basename(cf))
    if not m:
        continue
    participant_id = m.group(1)
    if participant_id in msg_dict:
        merged = merge_free_viewing_intervals_with_calibrated(cf, msg_dict[participant_id])
        if not merged.empty:
            merged_list.append(merged)
    else:
        print(f"No message file found for participant {participant_id}")

if merged_list:
    df_final = pd.concat(merged_list, ignore_index=True)
    df_final = df_final.sort_values(["participants", "Time"]).reset_index(drop=True)
    output_csv = "../../processed_data/processed_data_tiny_task_free_viewing.csv"
    # to_csv does not create missing folders, so make sure the target exists first.
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    df_final.to_csv(output_csv, index=False)
    print("Merged Free Viewing Data:")
    print(df_final.head(20))
else:
    # NOTE: df_final is not defined in this case, so cells that use it will fail.
    print("No data merged.")


Merged Free Viewing Data:
            Time    calX     calY    rawX    rawY participants  interval_num
0   1.741257e+09  592.25  -985.14  323.36  179.53         P006             1
1   1.741257e+09  545.73 -1048.08  323.22  177.97         P006             1
2   1.741257e+09  514.65 -1146.11  323.33  176.66         P006             1
3   1.741257e+09  495.23 -1175.09  323.30  176.35         P006             1
4   1.741257e+09  484.07 -1182.02  323.26  176.27         P006             1
5   1.741257e+09  457.94 -1210.86  323.20  176.00         P006             1
6   1.741257e+09  452.09 -1219.96  323.19  175.93         P006             1
7   1.741257e+09  441.55 -1235.70  323.18  175.81         P006             1
8   1.741257e+09  440.21 -1244.11  323.20  175.76         P006             1
9   1.741257e+09  437.78 -1246.83  323.19  175.74         P006             1
10  1.741257e+09  534.15 -1251.66  323.64  175.91         P006             1
11  1.741257e+09  600.39 -1262.98  323.96  175.98 

In [6]:
# Split the merged table into one DataFrame per (participant, interval) pair.
# Each piece is kept in groups_dict under the key "<participant>_<interval>" and
# written to its own CSV file named after that key.
# NOTE(review): a later cell exports df_new groups to this same folder under the
# same file names, so running both keeps only the files written last — confirm
# that overwrite is intended.
out_dir = "./free_viewing/tiny_data"
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(out_dir, exist_ok=True)

grouped = df_final.groupby(['participants', 'interval_num'])
groups_dict = {}
for (participant, interval), group in grouped:
    key = f"{participant}_{interval}"
    groups_dict[key] = group
    group.to_csv(f"{out_dir}/{key}.csv", index=False)

In [2]:
import re
import glob
import pandas as pd
import os

In [4]:
# Load the post-processed free viewing table from disk and display it.
df_final_post_cal = pd.read_csv(
    filepath_or_buffer="./processed_data/processed_data_free_viewing_all_post_process.csv"
)
df_final_post_cal

Unnamed: 0,Time,calX,calY,rawX,rawY,participants,interval_num,eyelink_time,x_rol_median,y_rol_median,remapX,remapY,post_calX
0,1.741257e+09,545.73,-1048.08,323.22,177.97,P006,1,50662338,323.300,176.660,301.904715,178.184615,655.868702
1,1.741257e+09,514.65,-1146.11,323.33,176.66,P006,1,50662373,323.280,176.505,301.894892,178.065385,655.611200
2,1.741257e+09,495.23,-1175.09,323.30,176.35,P006,1,50662414,323.260,176.350,301.885069,177.946154,655.340637
3,1.741257e+09,484.07,-1182.02,323.26,176.27,P006,1,50662444,323.260,176.270,301.885069,177.884615,653.535256
4,1.741257e+09,457.94,-1210.86,323.20,176.00,P006,1,50662512,323.200,176.000,301.855599,177.676923,657.147687
...,...,...,...,...,...,...,...,...,...,...,...,...,...
108909,1.743505e+09,1471.80,843.73,327.80,235.48,P210,40,428301217,327.800,235.570,301.463855,177.750000,788.706789
108910,1.743505e+09,1487.10,851.50,327.64,235.57,P210,40,428301255,327.800,235.540,301.463855,177.744275,788.584612
108911,1.743505e+09,1469.16,852.27,327.84,235.58,P210,40,428301290,327.840,235.570,301.475904,177.750000,784.728941
108912,1.743505e+09,1460.13,849.35,327.93,235.54,P210,40,428301324,327.885,235.570,301.489458,177.750000,780.269583


In [6]:
# Keep only the columns needed for the per-interval export, and rename
# 'eyelink_time' to 'Time' and 'post_calX' to 'calX'.
df_new = (
    df_final_post_cal
    .loc[:, ['eyelink_time', 'rawX', 'post_calX', 'participants', 'interval_num']]
    .rename(columns={'eyelink_time': 'Time', 'post_calX': 'calX'})
)
df_new

Unnamed: 0,Time,rawX,calX,participants,interval_num
0,50662338,323.22,655.868702,P006,1
1,50662373,323.33,655.611200,P006,1
2,50662414,323.30,655.340637,P006,1
3,50662444,323.26,653.535256,P006,1
4,50662512,323.20,657.147687,P006,1
...,...,...,...,...,...
108909,428301217,327.80,788.706789,P210,40
108910,428301255,327.64,788.584612,P210,40
108911,428301290,327.84,784.728941,P210,40
108912,428301324,327.93,780.269583,P210,40


In [7]:
# Split the post-calibration table into one DataFrame per (participant, interval)
# pair. Each piece is kept in groups_dict under the key "<participant>_<interval>"
# and written to its own CSV file named after that key.
# NOTE(review): an earlier cell exports df_final groups to this same folder under
# the same file names, so this cell overwrites those files — confirm that is intended.
out_dir = "./free_viewing/tiny_data"
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(out_dir, exist_ok=True)

grouped = df_new.groupby(['participants', 'interval_num'])
groups_dict = {}
for (participant, interval), group in grouped:
    key = f"{participant}_{interval}"
    groups_dict[key] = group
    group.to_csv(f"{out_dir}/{key}.csv", index=False)

In [8]:
# Load the post-processed ASC free viewing table from disk and display it.
df_final_asc = pd.read_csv(
    filepath_or_buffer="./processed_data/processed_data_asc_free_viewing_all_post_process.csv"
)
df_final_asc

Unnamed: 0,Time,RX,RY,state,participants,interval_num
0,50662336.0,1044.0,608.7,1.0,P006,1
1,50662337.0,1044.9,607.7,1.0,P006,1
2,50662338.0,1045.8,607.6,1.0,P006,1
3,50662339.0,1046.4,607.8,1.0,P006,1
4,50662340.0,1045.8,610.3,1.0,P006,1
...,...,...,...,...,...,...
4357905,428301439.0,750.4,592.0,2.0,P210,40
4357906,428301440.0,756.0,592.0,2.0,P210,40
4357907,428301441.0,764.2,592.2,2.0,P210,40
4357908,428301442.0,773.2,592.4,2.0,P210,40


In [9]:
# Split the ASC table into one DataFrame per (participant, interval) pair.
# Each piece is kept in groups_dict under the key "<participant>_<interval>" and
# written to its own CSV file named after that key.
out_dir = "./free_viewing/eyelink_data"
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(out_dir, exist_ok=True)

grouped = df_final_asc.groupby(['participants', 'interval_num'])
groups_dict = {}
for (participant, interval), group in grouped:
    key = f"{participant}_{interval}"
    groups_dict[key] = group
    group.to_csv(f"{out_dir}/{key}.csv", index=False)