# Get Ground Truth Labels for Plant Data
---
We are going to use labels for participants' emotions extracted from videos that were recorded in parallel to the plant recordings. The emotions have already been predicted from the facial expressions; here we retrieve those labels and use them as ground truth for our plant experiments. 

In [None]:
import pandas as pd
import os
from typing import Union, Tuple, Literal

In [None]:
# Root folder of the emotion label files; read further down as <emotions_dir>/<team>/<day>.
emotions_dir = "../data/teamwork-emotions"
# Root folder of the extracted interim plant recordings; read further down as <interim_data_dir>/<team>/<day>.
interim_data_dir = "../data/interim-plant-data-teamwork-extracted"

# Folder for log output; in the visible cells it is only referenced by the commented-out Excel export.
logs_dir = "../logs"

In [None]:
# Presumably controls whether results are written to disk; it is not read in the visible cells - TODO confirm.
save_file: bool = True

# Team names, used as folder names below the data root folders.
# Numbers 14 and 21 do not appear in this list.
teams = [
    "team_01",
    "team_02",
    "team_03",
    "team_04",
    "team_05",
    "team_06",
    "team_07",
    "team_08",
    "team_09",
    "team_10",
    "team_11",
    "team_12",
    "team_13",
    "team_15",
    "team_16",
    "team_17",
    "team_18",
    "team_19",
    "team_20",
    "team_22"
]

# Teamworking days, used as sub-folder names below each team folder.
days = ["2023-01-10", "2023-01-12", "2023-01-13"]

The teamwork session interval extraction produced some "broken" `.wav` files, which I am not going to use. I manually renamed the corresponding folders so that it is clear which teams are excluded from the experiment. Valid folder names look like "team_01" (without any addition), while folders to be ignored have a longer name that states the reason for exclusion, e.g. "team_03_broken_because_44100_samplingrate_on_day1".

In [None]:
def custom_sort(label: str, mode: Literal["interim", "emotions"] = "interim") -> Union[int, Tuple[int, int]]:
    """
    Build the sort key for a file label from the interim data folder or the emotions folder.

    interim: Labels like "sdm_2023-01-10_team_01_8333_9490.wav" are sorted by their start frame (8333).
    emotions: Labels like "clip_0_11509_11908.csv" are sorted first by clip id (0) and second by
    start frame (11509).

    :param label: File name the key is derived from; its fields are separated by underscores.
    :param mode: Naming scheme of the label, either "interim" or "emotions".
    :return: The start frame for "interim", or the tuple (clip id, start frame) for "emotions".
    :raises ValueError: If mode is not supported, or if a field that should be numeric is not.
    :raises IndexError: If the label has fewer underscore-separated fields than the mode expects.
    """

    parts = label.split('_')

    if mode == "interim":
        return int(parts[4])
    if mode == "emotions":
        return int(parts[1]), int(parts[2])

    # Fail loudly instead of returning None, which would only surface later as an obscure
    # comparison error inside sorted().
    raise ValueError(f"Unsupported mode {mode!r}; expected 'interim' or 'emotions'.")

In [None]:
def get_duration_from_label(label: str, mode: Literal["interim", "emotions"] = "interim") -> int:
    """
    Compute the duration of the teamwork session as end frame minus start frame, both taken from the label.

    emotions: "clip_0_11509_11908.csv", i.e. 11908-11509.
    interim: "sdm_2023-01-10_team_01_8333_9490.wav", i.e. 9490-8333.

    :param label: File name containing the start and end frame as underscore-separated fields.
    :param mode: Naming scheme of the label, either "interim" or "emotions".
    :return: Difference between the end frame and the start frame.
    :raises ValueError: If mode is not supported, or if a frame field is not numeric.
    :raises IndexError: If the label has fewer underscore-separated fields than the mode expects.
    """

    # Position of the start frame among the underscore-separated fields; the end frame follows directly.
    start_positions = {"interim": 4, "emotions": 2}
    if mode not in start_positions:
        # Fail loudly instead of silently returning None for an unknown naming scheme.
        raise ValueError(f"Unsupported mode {mode!r}; expected 'interim' or 'emotions'.")

    # Everything from the first dot onwards (the file extension) is dropped before splitting.
    parts = label.split('.')[0].split("_")
    start_position = start_positions[mode]
    return int(parts[start_position + 1]) - int(parts[start_position])

In [None]:
# Pair every emotion clip file with the interim plant signal file at the same position (after sorting
# both folders) and record the durations encoded in the two file names.
# The rows are collected in a list and the DataFrame is built once at the end instead of being grown
# row by row.
mapping_columns = ['path_emotions', 'duration_emotions', "path_interim",
                   "duration_interim", 'difference']
mapping_rows = []

for t in teams:
    for d in days:
        interim_data_path = os.path.join(interim_data_dir, t, d)
        emotions_path = os.path.join(emotions_dir, t, d)

        # Only team/day combinations for which both folders exist are compared.
        if not (os.path.exists(interim_data_path) and os.path.exists(emotions_path)):
            continue

        print(f"emotion: {emotions_path}")
        print(f"interim: {interim_data_path}")

        # 1. Files with emotions per second
        clip_files = [item for item in os.listdir(emotions_path)
                      if not item.startswith('team')]  # remove item "team_1...csv"
        print(f"{len(clip_files)}: {clip_files}")

        # The lambda is needed to pass the non-default mode to custom_sort.
        clip_files = sorted(clip_files, key=lambda x: custom_sort(x, mode="emotions"))

        # 2. Files with interim plant teamwork signal data
        interim_data_files = os.listdir(interim_data_path)
        print(f"{len(interim_data_files)}: {interim_data_files}\n")

        interim_data_files = sorted(interim_data_files, key=custom_sort)

        # The files are matched purely by position, so a differing file count means that some files
        # stay unmatched. Report it instead of failing with an IndexError or ignoring it silently.
        if len(clip_files) != len(interim_data_files):
            print(f"WARNING: {len(clip_files)} emotion files but {len(interim_data_files)} interim files "
                  f"for {t}/{d}; only the first {min(len(clip_files), len(interim_data_files))} "
                  f"pairs are compared.")

        for clip_file, interim_file in zip(clip_files, interim_data_files):
            print(clip_file, "\t", interim_file)

            # Compute each duration once and reuse it for the printout and the table row.
            duration_emotions = get_duration_from_label(clip_file, mode="emotions")
            duration_interim = get_duration_from_label(interim_file)
            print(duration_emotions, "\t\t\t", duration_interim)

            mapping_rows.append([os.path.join(t, d, clip_file), duration_emotions,
                                 interim_file, duration_interim,
                                 duration_interim - duration_emotions])
        print("")
    print("___________________________________")

emotion_signal_mappings = pd.DataFrame(mapping_rows, columns=mapping_columns)

In [None]:
#mismatch_logs.to_excel(os.path.join(logs_dir,"duration_comparison_teamwork_session.xlsx"))

Check the number of non-zero elements in column `difference`: a non-zero value means that the durations encoded in the two corresponding file names are not the same. The number of mismatches is equal to 12. 

In [None]:
# Boolean Series that is True for every row whose two filename-derived durations differ.
df_bool = emotion_signal_mappings["difference"].ne(0)
df_bool

In [None]:
# Summing the boolean Series counts the rows whose `difference` is non-zero.
print(df_bool.sum())