In [None]:
import os
import sys
import pandas as pd

In [None]:
def time_to_seconds(time_str):
    """Convert a colon-separated timestamp string into seconds.

    Accepts ``"MM:SS"`` as well as ``"HH:MM:SS"`` and a bare ``"SS"`` value.
    The seconds field may be fractional (e.g. ``"01:02.5"``); the minute and
    hour fields must be integers.

    Args:
        time_str: Timestamp text such as ``"12:34.5"`` or ``"1:02:03"``.

    Returns:
        float: Total number of seconds represented by ``time_str``.

    Raises:
        ValueError: If the string has more than three fields, or a field
            cannot be parsed as a number.
    """
    fields = time_str.strip().split(':')
    if len(fields) > 3:
        raise ValueError(f"Unrecognized timestamp format: {time_str!r}")

    # The rightmost field is seconds (possibly fractional); the fields to its
    # left are whole minutes, then whole hours.
    *whole_fields, seconds = fields
    whole_seconds = 0
    for multiplier, field in zip((60, 3600), reversed(whole_fields)):
        whole_seconds += int(field) * multiplier
    return whole_seconds + float(seconds)

In [None]:
# TODO: add a column holding each timestamp converted to seconds, e.g.:
# df['seconds'] = df['timestamp'].apply(time_to_seconds)

def split_csv_by_column(input_csv, trial_col,
                        member_col, output_dir,
                        included_trials=None,
                        expected_members=3):
    """Split a CSV into one file per complete trial.

    The CSV is read with pandas and grouped by ``trial_col``. A trial counts
    as complete when its ``member_col`` holds exactly ``expected_members``
    distinct non-empty names (compared after stripping surrounding
    whitespace). Rows of each complete trial are written unchanged to
    ``output_dir``; incomplete trials are only reported on stdout.

    Args:
        input_csv: Path of the CSV file to read.
        trial_col: Name of the column whose values define the trials.
        member_col: Name of the column holding the member names.
        output_dir: Directory for the per-trial CSV files; created if missing.
        included_trials: Optional list-like of trial values; when given, rows
            of all other trials are dropped before grouping.
        expected_members: Number of distinct members a trial must have to be
            treated as complete. Defaults to 3.

    Returns:
        None. Results are written to disk and summarized via ``print``.

    Raises:
        ValueError: If ``trial_col`` or ``member_col`` is not a column of the
            CSV.
    """
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(input_csv)

    if trial_col not in df.columns or member_col not in df.columns:
        raise ValueError(f"One or both columns ('{trial_col}', '{member_col}') not found in CSV.")

    if included_trials is not None:
        df = df[df[trial_col].isin(included_trials)]

    complete_trials = []

    for trial_val, trial_df in df.groupby(trial_col):
        # Normalize names first and de-duplicate afterwards, so that values
        # differing only in surrounding whitespace count as one member.
        # dict.fromkeys keeps the first-seen order of the names.
        stripped_names = (str(name).strip() for name in trial_df[member_col].dropna())
        unique_members = list(dict.fromkeys(name for name in stripped_names if name))

        # ToDo: ensure audio and transcript file names have all relevant strings, and look alike

        if len(unique_members) != expected_members:
            # Disregard incomplete trials:
            print(f"trial {trial_val} is incomplete.")
            continue

        complete_trials.append([trial_val, unique_members])
        # Expected filename pattern:
        # HSRData_ClientAudio_Trial-T000603_Team-TM000202_Member-NA.csv
        # Team and member ids are not filled in here, hence the "NA" placeholders.
        output_filename = f"HSRData_ClientAudio_Trial-{trial_val}_Team-NA_Member-NA.csv"
        output_path = os.path.join(output_dir, output_filename)
        trial_df.to_csv(output_path, index=False)
        print(f"Saved: {output_path}")

    print(f"There are {len(complete_trials)} complete trials.")
    for trial_info in complete_trials:
        print(f"trial: {trial_info[0]}, members: {trial_info[1]}")

In [31]:
# === Example usage ===
# Write one CSV per complete trial found in ./multicat.csv into split_output/.
split_csv_by_column(
    input_csv="./multicat.csv",
    output_dir="split_output",
    trial_col="trial",            # rows are grouped by this column
    member_col="participant",     # distinct values here are counted per trial
    # included_trials=[...],      # optional: only process these trial values
)


trial T000602 is incomplete.
Saved: split_output/HSRData_ClientAudio_Trial-T000603_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000604_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000605_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000606_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000607_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000608_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000609_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000610_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000611_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000612_Team-NA_Member-NA.csv
Saved: split_output/HSRData_ClientAudio_Trial-T000613_Team-NA_Member-NA.csv
trial T000614 is incomplete.
trial T000619 is incomplete.
Saved: split_output/HSRData_ClientAudio_Trial-T000620_Team-NA_Member-NA.csv
S