In [1]:
import sys
from pathlib import Path

# Make the project root importable so project-local packages (e.g. `utils`) resolve.
# The root is taken as the parent of the current working directory, so the
# notebook must be launched from a directory located directly under the root.
PROJECT_ROOT = Path.cwd().resolve().parents[0]
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

import os
from functools import reduce

import pandas as pd

from utils.paths import EXTRACTED_FEATURES_DIR

In [2]:
# Load the four extracted-feature tables; each one carries a 'HASH' key column.
df_basic = pd.read_csv(EXTRACTED_FEATURES_DIR / "simple_feature_extraction.csv")
df_aoi = pd.read_csv(EXTRACTED_FEATURES_DIR / "fixation_distance_aoi.csv")
df_latency = pd.read_csv(EXTRACTED_FEATURES_DIR / "latency.csv")
df_fic_ffd = pd.read_csv(EXTRACTED_FEATURES_DIR / "fic_ffd_features.csv")

# Distinct HASH values found in each file.
hashes_basic = set(df_basic["HASH"])
hashes_aoi = set(df_aoi["HASH"])
hashes_latency = set(df_latency["HASH"])
hashes_fic_ffd = set(df_fic_ffd["HASH"])

# Summary column name -> set of HASHes it reports on. Driving the table from
# this single mapping keeps the flag columns and the filter below in sync.
hash_sets = {
    "in_basic": hashes_basic,
    "in_aoi": hashes_aoi,
    "in_latency": hashes_latency,
    "in_fic_ffd": hashes_fic_ffd,
}

# Every HASH that appears in at least one file.
all_hashes = hashes_basic | hashes_aoi | hashes_latency | hashes_fic_ffd

# One row per HASH with a presence flag per file.
# Iterate in sorted order: the iteration order of a set of strings can differ
# between kernel sessions (string hashing is randomized per process), which
# would make the row order and index labels of this table non-reproducible.
# key=str keeps the sort well-defined even if a HASH cell was read as NaN.
summary = [
    {"HASH": h, **{flag: h in members for flag, members in hash_sets.items()}}
    for h in sorted(all_hashes, key=str)
]

# Explicit columns so the frame has the expected schema even when no HASH was found.
df_summary = pd.DataFrame(summary, columns=["HASH", *hash_sets])

# Keep only the HASHes that are missing from at least one of the four files.
membership_cols = list(hash_sets)
missing_hashes = df_summary[~df_summary[membership_cols].all(axis=1)]
missing_hashes

Unnamed: 0,HASH,in_basic,in_aoi,in_latency,in_fic_ffd
22,2024-03-27_11-19-10,False,True,True,True


Remove this row from the AOI, latency, and FIC_FFD files: its HASH has no match in the basic features file.

In [3]:
# Inner-join the four tables on 'HASH', left to right, so that only HASHes
# present in every file survive the merge.
dfs = [df_basic, df_aoi, df_latency, df_fic_ffd]
merged_df = dfs[0]
for right_df in dfs[1:]:
    merged_df = pd.merge(merged_df, right_df, on='HASH', how='inner')

# Count nulls per column and report only the columns that have any.
missing = merged_df.isnull().sum()
missing_nonzero = missing.loc[missing.gt(0)]
print("Columns with missing values:\n", missing_nonzero)


Columns with missing values:
 Series([], dtype: int64)


In [4]:
# Show the merged column names. Non-key columns that exist in more than one
# input table come out of pd.merge with the default '_x' / '_y' suffixes.
merged_df.columns

Index(['participant_folder', 'HASH', 'group', 'mean_fix_duration',
       'median_fix_duration', 'mean_saccade_length', 'median_saccade_length',
       'mean_blink_duration', 'median_blink_duration', 'num_fixations_per_min',
       'num_blinks_per_min', 'num_saccades_per_min', 'age', 'ran_score',
       'filename_x', 'folder_x', 'mean_fix_dist_to_object',
       'median_fix_dist_to_object', 'mean_fix_dist_to_fruit',
       'median_fix_dist_to_fruit', 'percent_fixations_outside_bbox',
       'mean_latency', 'median_latency', 'std_latency', 'min_latency',
       'max_latency', 'percent_unseen_fruits', 'n_fruits', 'n_seen',
       'folder_y', 'participant', 'filename_y', 'folder', 'FIC_mean',
       'FIC_std', 'FFD'],
      dtype='object')

In [5]:
# Drop bookkeeping columns introduced by the merge that are not needed downstream.
# Reassignment (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second run no longer fails with KeyError because the
# columns are already gone.
# NOTE(review): 'filename_x', 'filename_y' and 'folder' are left in place, as in
# the original cell — confirm that downstream consumers expect them.
merged_df = merged_df.drop(
    columns=['folder_x', 'folder_y', 'participant'],
    errors="ignore",
)

In [6]:
# Rows that contain at least one missing value in any column.
has_missing_value = merged_df.isna().any(axis=1)
rows_with_missing = merged_df.loc[has_missing_value]
rows_with_missing

Unnamed: 0,participant_folder,HASH,group,mean_fix_duration,median_fix_duration,mean_saccade_length,median_saccade_length,mean_blink_duration,median_blink_duration,num_fixations_per_min,...,min_latency,max_latency,percent_unseen_fruits,n_fruits,n_seen,filename_y,folder,FIC_mean,FIC_std,FFD


In [7]:
# Summary statistics of the numeric columns, as a quick sanity check.
# Raise the column display limit first so the wide table is not truncated.
pd.set_option("display.max_columns", 100)
check = merged_df.describe()
check

Unnamed: 0,mean_fix_duration,median_fix_duration,mean_saccade_length,median_saccade_length,mean_blink_duration,median_blink_duration,num_fixations_per_min,num_blinks_per_min,num_saccades_per_min,age,ran_score,mean_fix_dist_to_object,median_fix_dist_to_object,mean_fix_dist_to_fruit,median_fix_dist_to_fruit,percent_fixations_outside_bbox,mean_latency,median_latency,std_latency,min_latency,max_latency,percent_unseen_fruits,n_fruits,n_seen,FIC_mean,FIC_std,FFD
count,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0
mean,360.05433,275.646552,129.958983,108.725327,256.527581,255.948276,145.353352,34.828327,146.887954,18.603448,1.394125,117.912649,76.608261,120.213763,78.969528,37.488906,140.721966,15.79536,453.708596,0.0,3971.324412,82.380487,770.172414,140.396552,31.467941,54.218754,1.224426
std,60.212738,43.557686,22.104997,20.482479,28.973122,29.793984,20.928403,56.038512,21.522554,8.689457,0.629426,44.509157,35.396286,47.817142,46.06926,7.195255,82.106211,21.633849,491.709668,0.0,5506.430283,7.107541,260.405361,72.673251,12.50735,37.419909,0.038187
min,244.295493,185.0,91.451013,73.10531,190.0,190.0,112.916298,5.434204,113.392684,4.0,0.31746,72.430104,47.172079,73.95819,46.54832,26.439791,0.0,0.0,0.0,0.0,0.0,52.13205,135.0,4.0,14.251645,21.248018,1.131644
25%,323.796098,245.0,116.075241,95.563293,242.347413,240.0,128.412133,10.292177,129.382937,6.0,0.755556,93.900061,62.0996,95.95549,62.256813,32.452875,96.651519,0.0,244.322359,0.0,1749.119873,78.808929,640.0,87.0,23.762733,33.558519,1.203303
50%,355.620412,272.5,125.001999,104.722898,259.778541,255.0,146.491838,17.738211,147.756918,24.0,1.584141,108.123224,71.060068,110.665581,72.010733,35.127494,121.20595,5.062622,358.843054,0.0,3092.942383,83.542918,811.5,136.5,28.164694,40.037803,1.228709
75%,411.463392,300.625,142.250678,117.01068,276.425106,278.75,157.651808,28.490368,160.081413,24.0,1.875,125.370819,81.295448,127.956204,83.257355,40.814489,157.924869,23.779846,505.769859,0.0,3789.656921,87.9391,965.0,189.5,37.248472,63.13387,1.237732
max,473.67614,380.0,189.616256,170.78485,318.870968,320.0,199.44259,321.134351,200.429469,34.0,2.473411,354.002472,315.613025,379.400605,402.502371,58.427474,518.958305,97.561035,3332.941826,0.0,35572.454102,97.037037,1189.0,348.0,91.49956,235.381298,1.372469


In [8]:
# Inspect the latency statistics next to the HASH they belong to.
latency_view_cols = [
    'mean_latency',
    'median_latency',
    'std_latency',
    'min_latency',
    'max_latency',
    'HASH',
]
merged_df.loc[:, latency_view_cols]

Unnamed: 0,mean_latency,median_latency,std_latency,min_latency,max_latency,HASH
0,0.0,0.0,0.0,0.0,0.0,2024-06-05_10-34-55
1,148.895115,0.0,396.92785,0.0,3117.873047,2025-04-29_15-55-26
2,140.196226,47.5,365.766599,0.0,2887.624023,2024-06-06_13-47-36
3,518.958305,20.0,3332.941826,0.0,35572.454102,2025-04-17_11-52-56
4,201.257322,35.125,507.622451,0.0,3368.125244,2024-06-06_10-02-38
5,175.293621,10.000244,512.109161,0.0,3763.499756,2025-03-31_09-49-48
6,149.264129,52.500122,500.212084,0.0,5244.753906,2025-04-28_15-30-49
7,93.925205,20.0,250.483671,0.0,1931.749023,2025-03-31_10-29-39
8,235.30544,5.000244,628.416768,0.0,4619.240967,2025-04-15_10-47-26
9,46.368691,5.125,57.604606,0.0,195.25,2024-06-06_14-21-45


In [9]:
# Participants excluded because of faulty latency files.
# They are identified by HASH rather than by row label: the label 1 used
# previously only pointed at this participant for one particular merge order,
# and dropping it a second time raised KeyError on re-run.
# NOTE(review): in the latency table shown above, the row with all-zero latency
# values is HASH 2024-06-05_10-34-55 (label 0), not this one — confirm that the
# HASH listed here is the participant that was meant to be excluded.
FAULTY_LATENCY_HASHES = ["2025-04-29_15-55-26"]

merged_df = (
    merged_df
    .loc[~merged_df["HASH"].isin(FAULTY_LATENCY_HASHES)]
    # Minimum latency is dropped due to no variance (0.0 for every row in the
    # describe() output); errors="ignore" keeps the cell re-runnable.
    .drop(columns=['min_latency'], errors="ignore")
)

# The second count is the number of columns, which includes identifier and
# metadata columns such as 'HASH' and 'group'.
print('Number of participants: ', len(merged_df))
print('Number of features:', len(merged_df.columns))
merged_df.head()


Number of participants:  57
Number of features: 32


Unnamed: 0,participant_folder,HASH,group,mean_fix_duration,median_fix_duration,mean_saccade_length,median_saccade_length,mean_blink_duration,median_blink_duration,num_fixations_per_min,num_blinks_per_min,num_saccades_per_min,age,ran_score,filename_x,mean_fix_dist_to_object,median_fix_dist_to_object,mean_fix_dist_to_fruit,median_fix_dist_to_fruit,percent_fixations_outside_bbox,mean_latency,median_latency,std_latency,max_latency,percent_unseen_fruits,n_fruits,n_seen,filename_y,folder,FIC_mean,FIC_std,FFD
0,Filtered_DysCover_2024-06-05_10-34-55-e4242330...,2024-06-05_10-34-55,dyscover,355.413534,262.5,174.593112,129.459258,205.357143,205.0,146.369736,74.811095,148.383351,5.0,0.465116,annotated_gaze_Filtered_DysCover_2024-06-05_10...,208.377782,120.57969,206.565141,120.345183,42.560554,0.0,0.0,0.0,0.0,97.037037,135,4,annotated_gaze_Filtered_DysCover_2024-06-05_10...,Filtered_DysCover_2024-06-05_10-34-55-e4242330...,27.698962,39.505981,1.235903
2,Filtered_DysCover_2024-06-06_13-47-36-8fd41dd9...,2024-06-06_13-47-36,dyscover,470.317419,335.0,156.351444,136.706395,221.576577,220.0,114.855289,25.175696,116.462045,5.0,0.31746,annotated_gaze_Filtered_DysCover_2024-06-06_13...,140.894247,96.259168,143.18188,99.125023,37.701396,140.196226,47.5,365.766599,2887.624023,88.253968,630,74,annotated_gaze_Filtered_DysCover_2024-06-06_13...,Filtered_DysCover_2024-06-06_13-47-36-8fd41dd9...,30.168636,39.585317,1.270388
3,Filtered_Adult_Spring_2025-04-17_11-52-56-866f...,2025-04-17_11-52-56,adultspring,294.963428,230.0,141.825833,120.966473,261.256614,260.0,166.670052,23.233069,171.674232,24.0,1.578283,annotated_gaze_Filtered_Adult_Spring_2025-04-1...,123.376752,87.011314,124.044827,86.173297,40.654206,518.958305,20.0,3332.941826,35572.454102,88.453815,996,115,annotated_gaze_Filtered_Adult_Spring_2025-04-1...,Filtered_Adult_Spring_2025-04-17_11-52-56-866f...,27.716706,40.49029,1.218467
4,Filtered_DysCover_2024-06-06_10-02-38-8cf987ec...,2024-06-06_10-02-38,dyscover,343.966074,250.0,133.209223,107.781549,226.363636,225.0,149.941643,11.02539,150.722713,6.0,0.425532,annotated_gaze_Filtered_DysCover_2024-06-06_10...,108.886629,74.049489,111.309669,75.62859,33.187773,201.257322,35.125,507.622451,3368.125244,83.836858,662,107,annotated_gaze_Filtered_DysCover_2024-06-06_10...,Filtered_DysCover_2024-06-06_10-02-38-8cf987ec...,29.181659,37.661028,1.236929
5,Filtered_Adult_Spring_2025-03-31_09-49-48-2738...,2025-03-31_09-49-48,adultspring,283.525456,225.0,135.272819,112.704527,264.636364,265.0,163.78263,36.223401,163.85726,24.0,0.894454,annotated_gaze_Filtered_Adult_Spring_2025-03-3...,93.544382,62.203953,95.677422,63.721096,35.254989,175.293621,10.000244,512.109161,3763.499756,71.122536,1167,337,annotated_gaze_Filtered_Adult_Spring_2025-03-3...,Filtered_Adult_Spring_2025-03-31_09-49-48-2738...,19.569845,31.326708,1.181386


In [10]:
# Final sanity check before saving: row count, column names, and group sizes.
n_rows = len(merged_df)
column_index = merged_df.columns
group_sizes = merged_df.groupby("group").size()

print("length of df:", n_rows)
print("columns of df:", column_index)
print("group counts", group_sizes)

length of df: 57
columns of df: Index(['participant_folder', 'HASH', 'group', 'mean_fix_duration',
       'median_fix_duration', 'mean_saccade_length', 'median_saccade_length',
       'mean_blink_duration', 'median_blink_duration', 'num_fixations_per_min',
       'num_blinks_per_min', 'num_saccades_per_min', 'age', 'ran_score',
       'filename_x', 'mean_fix_dist_to_object', 'median_fix_dist_to_object',
       'mean_fix_dist_to_fruit', 'median_fix_dist_to_fruit',
       'percent_fixations_outside_bbox', 'mean_latency', 'median_latency',
       'std_latency', 'max_latency', 'percent_unseen_fruits', 'n_fruits',
       'n_seen', 'filename_y', 'folder', 'FIC_mean', 'FIC_std', 'FFD'],
      dtype='object')
group counts group
adultspring    26
dyscover       17
fruitninja     14
dtype: int64


In [12]:
# -- SAVE AS CSV ---
# Write the merged table next to the input feature files. An existing file is
# kept untouched unless `overwrite` is switched on.
output_csv_path = EXTRACTED_FEATURES_DIR / 'merged_features.csv'
overwrite = False  # Set to True if you want to overwrite existing files

already_saved = os.path.exists(output_csv_path)

if already_saved and not overwrite:
    print(f"File already exists: {output_csv_path}. Not overwriting.")

else:
    merged_df.to_csv(output_csv_path, index=False)
    print(f"Features saved to {output_csv_path}")


File already exists: /HOME/lecomteo/thesis/master_thesis/data/processed/extracted_features/merged_features.csv. Not overwriting.
