# Simple Feature Extraction


In [None]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().resolve().parents[0]
sys.path.append(str(PROJECT_ROOT))

import os
import re

import pandas as pd

from utils.paths import CLEANED_DIR, EXTRACTED_FEATURES_DIR, RAN_DIR

In [None]:
# --- CONFIGURATION ---
# Accumulator: one dict of extracted features per processed participant
features = []

# Directory that is scanned for participant sub-folders
root_data_folder = CLEANED_DIR

# RAN lookup table, loaded once up front
ran_scores_path = RAN_DIR / "RAN_HashTable.csv"
ran_scores_df = pd.read_csv(ran_scores_path)

### Prep Feature Extraction
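- impute missing ages with the mean age of the participant's group
- filter rows: keep ONLY events whose name matches `FN.*Start`
- compute per-trial events per minute for fixations, saccades, and blinks (mean durations are derived from the same filtered rows)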

In [3]:
# Means for imputation
# Fallback ages substituted when a participant's 'Age' value is missing.
# NOTE(review): values are hard-coded; presumably group mean ages in years — confirm.
fruitninja_age_mean = 24  # used for groups 'fruitninja' and 'adultspring'
dyscover_age_mean = 6  # used for group 'dyscover'
   
def process_annotated_files(files, var, folder_name, participant_folder):
    """
    Load one annotated CSV of a participant and keep ONLY the FN.*Start events.
    This function is called for each participant folder.

    Args:
        files (list): List of file names in the participant folder.
        var (str): The variable to process, either 'blinks', 'fixations', 'gaze', 'imu' or 'saccades'.
        folder_name (str): Name of the participant folder; only used in the error message.
        participant_folder (str): Path of the participant folder that contains the files.

    Returns:
        pd.DataFrame: Rows of the first file named 'annotated_<var>...' whose
        'event_name' matches the regular expression 'FN.*Start'.

    Raises:
        FileNotFoundError: If no file name in `files` starts with 'annotated_<var>'.
    """
    filename_start = f'annotated_{var}'
    filtered_files = [f for f in files if f.startswith(filename_start)]

    # Fail with an explicit, descriptive error instead of an opaque IndexError
    # from indexing an empty list.
    if not filtered_files:
        raise FileNotFoundError(
            f"Missing annotated '{var}' files in folder {folder_name}"
        )

    # Only the first match (in the order given by `files`) is used
    first_file = filtered_files[0]

    # Load CSV
    df = pd.read_csv(os.path.join(participant_folder, first_file), low_memory=False)

    # Filter event_name: missing names become '' and never match; na=False keeps
    # the mask strictly boolean even if a non-string value slips through.
    event_pattern = 'FN.*Start'
    is_fn_start = df['event_name'].fillna('').str.contains(event_pattern, regex=True, na=False)
    fn_df = df[is_fn_start]

    return fn_df

# Function to compute fixations or saccades per minute
def compute_per_min_var(df):
    """
    Average the per-minute row rate over all trials in `df`.

    Rows are grouped by 'event_name'; each group is one trial. A trial spans
    from its smallest 'start timestamp [ns]' to its largest
    'end timestamp [ns]'. Its rate is the number of rows divided by that span
    expressed in minutes. Trials whose span is not strictly positive are
    ignored.

    Args:
        df (pd.DataFrame): Must contain the columns 'event_name',
                           'start timestamp [ns]' and 'end timestamp [ns]'.

    Returns:
        float: Mean of the per-trial rates (rows per minute).
               Returns 0 if no trial has a positive duration.
    """
    ns_per_minute = 60 * 1e9
    rates = []

    for _, trial_rows in df.groupby('event_name'):
        first_start = trial_rows['start timestamp [ns]'].min()
        last_end = trial_rows['end timestamp [ns]'].max()
        minutes = (last_end - first_start) / ns_per_minute

        # Skip zero-length, negative or undefined (NaN) spans
        if not minutes > 0:
            continue

        rates.append(len(trial_rows) / minutes)

    # Participant-level mean over the valid trials
    if not rates:
        return 0
    return sum(rates) / len(rates)

### Standard Features
['mean_fix_duration', 'median_fix_duration', 'mean_saccade_length',
       'median_saccade_length', 'mean_blink_duration', 'median_blink_duration',
       'num_fixations_per_min', 'num_blinks_per_min', 'num_saccades_per_min',
       'age', 'ran_score']

In [4]:
# Loop through participant folders
# For every participant folder: look up its RAN row via the timestamp embedded
# in the folder name, resolve the age (imputing when missing), load the
# FN.*Start rows of the annotated fixation/saccade/blink files and append one
# feature dict to `features`. Any exception raised while processing a folder
# is reported and the loop moves on to the next folder.
for folder_name in os.listdir(root_data_folder):

    if folder_name.startswith('ERROR_'):
        print(f"Skipping folder {folder_name} (ERROR_)")
        continue

    participant_folder = os.path.join(root_data_folder, folder_name)

    if not os.path.isdir(participant_folder):
        continue

    try:
        # Extract HASH timestamp from folder name
        match = re.search(r'\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}', folder_name)
        if match:
            hash_str = match.group(0)
        else:
            print(f"No HASH match for folder: {folder_name}")
            continue

        # Find RAN row where HASH column CONTAINS the timestamp
        # (literal substring match; the timestamp is not a regex pattern)
        ran_row = ran_scores_df[
            ran_scores_df['HASH'].str.contains(hash_str, na=False, regex=False)
        ]
        if ran_row.empty:
            print(f"No RAN score found for HASH: {hash_str} in folder {folder_name}")
            continue

        # If several rows match, the first one is used
        ran_score = ran_row['objectPerSecond'].values[0]

        # Impute age
        raw_age = ran_row['Age'].values[0]
        group = ran_row['Group'].values[0].strip().lower()

        if pd.isna(raw_age):
            if group == 'dyscover':
                age = dyscover_age_mean
                print(f"Imputed age for group '{group}' as {age} for HASH {hash_str}")
            elif group in ['fruitninja', 'adultspring']:
                age = fruitninja_age_mean
                print(f"Imputed age for group '{group}' as {age} for HASH {hash_str}")
            else:
                # No imputation rule for this group: keep the age explicitly
                # missing rather than leaving `age` unbound or silently reusing
                # the value of the previously processed participant.
                age = float('nan')
                print(f"No age imputation rule for group '{group}'; age left missing for HASH {hash_str}")
        else:
            age = raw_age

        # List files in participant folder
        files = os.listdir(participant_folder)

        fix_fn_df = process_annotated_files(files, 'fixations', folder_name, participant_folder)
        sacc_fn_df = process_annotated_files(files, 'saccades', folder_name, participant_folder)
        blinks_fn_df = process_annotated_files(files, 'blinks', folder_name, participant_folder)

        # Mean number of fixations, saccades and blinks per minute across trials
        mean_fix_per_min = compute_per_min_var(fix_fn_df)
        mean_sacc_per_min = compute_per_min_var(sacc_fn_df)
        mean_blinks_per_min = compute_per_min_var(blinks_fn_df)

        # Store features
        features.append({
            'participant_folder': folder_name,
            'HASH': hash_str,
            'group': group,
            'mean_fix_duration': fix_fn_df['duration [ms]'].mean(),
            'median_fix_duration': fix_fn_df['duration [ms]'].median(),
            'mean_saccade_length': sacc_fn_df['amplitude [px]'].mean(),
            'median_saccade_length': sacc_fn_df['amplitude [px]'].median(),
            'mean_blink_duration': blinks_fn_df['duration [ms]'].mean(),
            'median_blink_duration': blinks_fn_df['duration [ms]'].median(),
            'num_fixations_per_min': mean_fix_per_min,
            'num_blinks_per_min': mean_blinks_per_min,
            'num_saccades_per_min': mean_sacc_per_min,
            'age': age,
            'ran_score': ran_score
        })

    except Exception as e:
        # Best-effort batch run: report the failing folder and continue
        print(f"Error processing {folder_name}: {e}")


Imputed age for group 'adultspring' as 24 for HASH 2025-04-29_15-55-26
Skipping folder ERROR_Filtered_DysCover_2024-06-06_14-41-34-85e33d1a_3d679ae7_0.0-1568.742_yolo11 (ERROR_)
Imputed age for group 'adultspring' as 24 for HASH 2025-04-17_11-52-56
Imputed age for group 'adultspring' as 24 for HASH 2025-03-31_09-49-48
Imputed age for group 'adultspring' as 24 for HASH 2025-04-28_15-30-49
Imputed age for group 'adultspring' as 24 for HASH 2025-03-31_10-29-39
Imputed age for group 'adultspring' as 24 for HASH 2025-04-15_10-47-26
Imputed age for group 'adultspring' as 24 for HASH 2025-04-23_11-39-23
Skipping folder ERROR_Filtered_Adult_Spring_2025-04-18_11-40-33-4349baf8_447aca05_0.0-1631.454_yolo11 (ERROR_)
Imputed age for group 'adultspring' as 24 for HASH 2025-04-24_16-47-08
Imputed age for group 'adultspring' as 24 for HASH 2025-04-18_10-30-25
No RAN score found for HASH: 2024-03-27_11-19-10 in folder Filtered_Fruit_Ninja_2024-03-27_11-19-10-d39c49e2_365a25a9_0.0-1061.854_yolo11
Imput

In [5]:
# --- Final dataframe ---
# One row per collected feature dict
features_df = pd.DataFrame(features)

# Numeric-only view of the feature table
numeric_kinds = ['number']
features_df_numeric = features_df.select_dtypes(include=numeric_kinds)

# Report how many participants ended up in the table
n_rows = len(features_df)
print('N:', n_rows)


N: 58


In [6]:
# Display the column names of the numeric-only feature table
features_df_numeric.columns

Index(['mean_fix_duration', 'median_fix_duration', 'mean_saccade_length',
       'median_saccade_length', 'mean_blink_duration', 'median_blink_duration',
       'num_fixations_per_min', 'num_blinks_per_min', 'num_saccades_per_min',
       'age', 'ran_score'],
      dtype='object')

In [7]:
# save full CSV
overwrite = False  # Set to True if you want to overwrite existing files
output_csv_path = EXTRACTED_FEATURES_DIR / 'simple_feature_extraction.csv'

# Keep an existing file unless overwriting was explicitly requested
already_saved = os.path.exists(output_csv_path)
if already_saved and not overwrite:
    print(f"File already exists: {output_csv_path}. Not overwriting.")
else:
    features_df.to_csv(output_csv_path, index=False)
    print(f"Features saved to {output_csv_path}")

File already exists: /HOME/lecomteo/thesis/master_thesis/data/processed/extracted_features/simple_feature_extraction.csv. Not overwriting.
