In [None]:
from re import template
import os
import pandas as pd
from datetime import datetime
from skimage.filters import threshold_otsu

# Column names assigned to the gaze feature table, in file order.
_FEATURE_COLUMNS = [
    'frame', 'confidence_azimuth', 'confidence_elevation', 'azimuth', 'elevation',
    'bbox_x1', 'bbox_y1', 'bbox_x2', 'bbox_y2', 'left_eye_x', 'left_eye_y',
    'right_eye_x', 'right_eye_y',
]

# Raw per-eye columns that are replaced by their midpoint (eyes_x / eyes_y).
_RAW_EYE_COLUMNS = ['left_eye_x', 'left_eye_y', 'right_eye_x', 'right_eye_y']

# Columns kept for every session, in output order ('time' is renamed afterwards).
_OUTPUT_COLUMNS = [
    'confidence_azimuth', 'confidence_elevation',
    'azimuth', 'elevation', 'bbox_x1', 'bbox_y1',
    'bbox_x2', 'bbox_y2', 'eyes_x', 'eyes_y',
    'use_robot', 'label', 'time', 'person_id',
]


def _first_match(files, required, excluded=()):
    """Return the first ``.txt`` name containing *required* and no *excluded* token, else None."""
    for name in files:
        if (name.endswith('.txt') and required in name
                and not any(token in name for token in excluded)):
            return name
    return None


def _read_table(path, **kwargs):
    """Read a header-less, whitespace-separated text table into a DataFrame."""
    # Raw string: '\s' is not a valid Python escape and warns on recent versions.
    return pd.read_csv(path, sep=r'\s+', header=None, **kwargs)


def _nearest_labels(times, labels):
    """Return, for each entry of *times*, the label whose ``time`` is nearest.

    The result is a NumPy array in the same positional order as *times*, so it
    can be assigned to a column without relying on index alignment.
    """
    lookup = pd.DataFrame({
        'formatted_time': times.to_numpy(),
        '_row': range(len(times)),
    }).sort_values('formatted_time', kind='stable')
    matched = pd.merge_asof(
        lookup,
        labels.sort_values('time'),
        left_on='formatted_time',
        right_on='time',
        direction='nearest',
    )
    # Undo the sort required by merge_asof to get back the caller's row order.
    return matched.sort_values('_row', kind='stable')['label'].to_numpy()


def find_and_process_files(base_directory: str) -> None:
    """Build ``training_set.csv`` from every complete session under *base_directory*.

    The directory tree is walked recursively. A directory counts as a session
    when it contains all four ``.txt`` files: a timing file (``tempos`` in the
    name, ``eval`` not in the name), a gaze feature file (``angle_gaze360``),
    patient labels (``patientdata``) and therapist labels (``rapistdata``).
    Directories missing any of them are skipped silently.

    For each session the function:

    * replaces the per-eye coordinates with their midpoint,
    * splits rows into patient (``'P'``) and therapist (``'T'``) with an Otsu
      threshold on the horizontal eye position (smaller ``eyes_x`` is ``'P'``),
    * attaches a timestamp to each frame from the subsampled timing file,
    * attaches the label closest in time, taken from the patient label file
      for ``'P'`` rows and from the therapist label file for ``'T'`` rows.

    Args:
        base_directory: Root folder to scan. The combined table is written to
            ``<base_directory>/training_set.csv``.

    Returns:
        None. The result is only written to disk; a short summary per session
        is printed.
    """
    session_frames = []
    # This is useful to keep track of each video whilst training
    cnt = 0
    for root, _dirs, files in os.walk(base_directory):
        # Find required files
        time_file = _first_match(files, 'tempos', excluded=('eval',))
        features_file = _first_match(files, 'angle_gaze360')
        patient_labels_file = _first_match(files, 'patientdata')
        therapist_labels_file = _first_match(files, 'rapistdata')

        # Is the patient/dataset valid
        if None in (time_file, features_file, patient_labels_file, therapist_labels_file):
            continue
        cnt += 1

        # Load data
        time_data = _read_table(os.path.join(root, time_file))
        features_data = _read_table(os.path.join(root, features_file))
        patient_labels = _read_table(
            os.path.join(root, patient_labels_file), names=["time", "label"])
        therapist_labels = _read_table(
            os.path.join(root, therapist_labels_file), names=["time", "label"])

        # Assign column names (raises if the file does not have exactly these columns)
        features_data.columns = _FEATURE_COLUMNS

        # Calculate average eye positions
        features_data['eyes_x'] = (features_data['left_eye_x'] + features_data['right_eye_x']) / 2
        features_data['eyes_y'] = (features_data['left_eye_y'] + features_data['right_eye_y']) / 2
        features_data = features_data.drop(columns=_RAW_EYE_COLUMNS)

        # Otsu's threshold for person classification
        eyes_x_threshold = threshold_otsu(features_data['eyes_x'].to_numpy())
        features_data['person_id'] = 'T'
        features_data.loc[features_data['eyes_x'] < eyes_x_threshold, 'person_id'] = 'P'

        # Process time_data; values are interpreted as seconds since the epoch
        time_data.columns = ['time'] + [f'col_{i}' for i in range(1, len(time_data.columns))]
        time_data['formatted_time'] = pd.to_datetime(time_data['time'], unit='s')

        # Subsample time data so there is roughly one timestamp per frame
        unique_times = time_data['time'].nunique()
        max_frames = features_data['frame'].max()
        subsampling_factor = max(1, int(unique_times // max_frames)) if max_frames > 0 else 1
        # Frames are matched to timestamps by row position, so the positional
        # index must follow chronological order: sort first, then renumber.
        time_data_subsampled = (
            time_data.iloc[::subsampling_factor, :]
            .sort_values('time', kind='stable')
            .reset_index(drop=True)
        )

        # Merge features and subsampled time data (frame k <-> k-th timestamp).
        # The result keeps the left frame's index, so renumber it to keep later
        # positional operations unambiguous.
        merged_data = pd.merge_asof(
            features_data.sort_values('frame', kind='stable'),
            time_data_subsampled,
            left_on='frame',
            right_index=True,
            direction='nearest',
        ).reset_index(drop=True)

        # Convert patient and therapist labels to datetime
        patient_labels['time'] = pd.to_datetime(patient_labels['time'], unit='s')
        therapist_labels['time'] = pd.to_datetime(therapist_labels['time'], unit='s')

        # Merge closest patient and therapist labels, row by row
        merged_data['patient_label'] = _nearest_labels(
            merged_data['formatted_time'], patient_labels)
        merged_data['therapist_label'] = _nearest_labels(
            merged_data['formatted_time'], therapist_labels)

        # Save or process merged data
        print(f"Processed data in {root}:")
        print(f"Unique times: {unique_times}, Max frames: {max_frames}")
        print(f"Subsampling factor: {subsampling_factor}")
        print(f"Otsu threshold for eyes_x: {eyes_x_threshold}")

        # Detect if there is a robot: exactly six distinct therapist labels.
        # NOTE(review): this rule has reported a robot for a session directory
        # whose name ends in "_norobot"; confirm the label-count heuristic.
        mark_robot = 'WR'  # presumably "without robot" - confirm
        if therapist_labels['label'].nunique(dropna=False) == 6:
            mark_robot = 'R'
            print("Robot detected in the dataset.")
        merged_data['use_robot'] = mark_robot

        # Each row takes the label of the person it was classified as
        merged_data['label'] = merged_data['patient_label'].where(
            merged_data['person_id'] == 'P', merged_data['therapist_label'])

        # Strip non essential columns; rename() returns a new frame, so the
        # assignment below does not write into a slice of merged_data.
        session = merged_data[_OUTPUT_COLUMNS].rename(columns={'time': 'timestamp'})
        session['patient'] = f"Patient_{cnt}"
        session_frames.append(session)

    # Concatenate once instead of growing a DataFrame inside the loop
    training_set = (
        pd.concat(session_frames, ignore_index=True) if session_frames else pd.DataFrame()
    )
    # Debug save and check manual
    training_set.to_csv(os.path.join(base_directory, 'training_set.csv'), index=False)

# Mount Google Drive so the dataset folder is reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Locate your dataset as precisely as possible: every sub-directory of this
# folder is scanned, and training_set.csv is written into it.
base_directory = '/content/drive/MyDrive/Colab Notebooks/PatientTrainigData'
find_and_process_files(base_directory)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processed data in /content/drive/MyDrive/Colab Notebooks/PatientTrainigData/Simeone_Dalfonso_robot:
Unique times: 928, Max frames: 179
Subsampling factor: 5
Otsu threshold for eyes_x: 498.53580078125003
Robot detected in the dataset.
Processed data in /content/drive/MyDrive/Colab Notebooks/PatientTrainigData/Simeone_Dalfonso_norobot:
Unique times: 445, Max frames: 83
Subsampling factor: 5
Otsu threshold for eyes_x: 494.273576171875
Robot detected in the dataset.
Processed data in /content/drive/MyDrive/Colab Notebooks/PatientTrainigData/Macchini_Zeppa_norobot-20241212T134922Z-001/Macchini_Zeppa_norobot:
Unique times: 451, Max frames: 90
Subsampling factor: 5
Otsu threshold for eyes_x: 498.950302734375
Processed data in /content/drive/MyDrive/Colab Notebooks/PatientTrainigData/Zeppa_Macchini_norobot/Zeppa_Macchini_norobot:
Unique times: 580, Max frames: 101
Su