In [1]:
from pathlib import Path
import sys
import os
sys.path.append(os.path.abspath("..")) 

import numpy as np
import pandas as pd
import re

from scripts.get_paths import get_path

# Lift pandas' display truncation so frames are shown with every column and row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
# Project path container from scripts.get_paths; the cells below read its
# `processed` and `features_2` attributes to locate input and output files.
paths = get_path()

In [3]:
# Annotation workbook: supplies the video_id / label_id pairs used further down.
annotations_file = Path(paths.processed / "annotations_FD_V2.xlsx")
GT_no_label = pd.read_excel(annotations_file)

# Participants = the distinct video ids found in the annotation workbook.
participants = GT_no_label["video_id"].unique().tolist()

# "Data" sheet of the SIT workbook: supplies the Id / NT columns used further down.
labels_file = Path(paths.processed / "SIT_Vpn_240909_FD_V2.xlsx")
GT_with_label = pd.read_excel(labels_file, sheet_name="Data")

In [4]:
# Load the extracted feature table. index_col=False tells pandas not to use
# the first CSV column as the index.
df = pd.read_csv(paths.features_2 / "extracted_features.csv", index_col=False) 

### Add NT (Neurotypical) column

In [5]:
# Lookup video_id -> label_id. De-duplicate the pairs: if the annotation sheet
# lists a video on more than one row, the un-deduplicated lookup would multiply
# every feature row of that video in the merge below.
annotations_df = GT_no_label[["label_id", "video_id"]].drop_duplicates()
labels_df = GT_with_label[["Id", "NT"]].drop_duplicates()

# validate="many_to_one" makes pandas raise MergeError if an Id carries more
# than one NT value, instead of silently duplicating rows.
nt_df = annotations_df.merge(
    labels_df,
    left_on="label_id",
    right_on="Id",
    how="left",
    validate="many_to_one",
)

# The last cell of this notebook writes `df` back to the CSV it was loaded from,
# so on a re-run `df` may already contain the lookup columns. Drop them first;
# otherwise the merge would produce suffixed duplicates (video_id_x, video_id_y, ...).
stale_cols = [
    col for col in nt_df.columns
    if col in df.columns and col != "participant_id"
]

# Left merge keeps every feature row; many_to_one guarantees that each
# participant_id matches at most one lookup row, so the row count is unchanged.
df = df.drop(columns=stale_cols).merge(
    nt_df,
    left_on="participant_id",
    right_on="video_id",
    how="left",
    validate="many_to_one",
)

### Remove parts not considered for analysis (debriefing and explanation) and add the 'part' and 'speaker' columns

In [6]:
def _segment(name, speaker, start=None, end=None):
    """Build one segment record; a bound left as None keeps that side of the window open."""
    return {'name': name, 'speaker': speaker, 'start': start, 'end': end}


# Per condition: clip id (as a string) -> ordered list of segments in that clip.
# Only clip '0' is split into several timed segments; every other clip is a
# single segment with no start/end bounds.
SITVersion = {
    'ONLINE': {
        '0': [
            _segment('explanation', 'actress', end=27),
            _segment('picture', 'actress', start=27, end=42),
            _segment('explanation', 'actress', start=42),
        ],
        '1': [_segment('picture', 'participant')],
        '2': [_segment('explanation', 'actress')],
        '3': [_segment('liked food', 'actress')],
        '4': [_segment('liked food', 'participant')],
        '5': [_segment('disliked food', 'actress')],
        '6': [_segment('disliked food', 'participant')],
        '7': [_segment('explanation', 'actress')],
        '8': [_segment('debriefing', 'actress')],
    }
}

# Per condition: clip id -> length of the clip.
# NOTE(review): presumably in the same unit as the start/end bounds above — confirm.
SITVersion_Length = {
    'ONLINE': {
        '0': 53, '1': 31, '2': 62,
        '3': 22, '4': 32,
        '5': 22, '6': 32,
        '7': 15, '8': 41,
    }
}

In [7]:
# Segment names that are kept for analysis; every other segment name in
# SITVersion ('explanation', 'debriefing') is filtered out by pre_process.
KEEP_PARTS = {
    "picture",
    "liked food",
    "disliked food",
}

# Speakers that are kept for analysis.
KEEP_SPEAKERS = {
    "actress",
    "participant",
}

In [8]:
def _window_mask(ts: pd.Series, start, end, inclusive: str) -> np.ndarray:
    """Return a boolean array marking timestamps inside the (start, end) window.

    A bound of None leaves that side open. Missing timestamps never satisfy a
    bound that is actually set.
    """
    mask = np.ones(len(ts), dtype=bool)
    if start is not None:
        left = (ts >= start) if inclusive in ("both", "left") else (ts > start)
        mask &= left.to_numpy(dtype=bool, na_value=False)
    if end is not None:
        right = (ts <= end) if inclusive in ("both", "right") else (ts < end)
        mask &= right.to_numpy(dtype=bool, na_value=False)
    return mask


def pre_process(
    df: pd.DataFrame,
    SITVersion: dict,
    condition: str = "ONLINE",
    clip_col: str = "clip_id",
    ts_col: str = "timestamp",
    inclusive: str = "both",
    keep_parts=None,
    keep_speakers=None,
) -> pd.DataFrame:
    """
    Keep only the rows that fall inside an analysed segment and label them.

    For every clip listed in ``SITVersion[condition]`` each segment whose name is
    in ``keep_parts`` and whose speaker is in ``keep_speakers`` defines a time
    window (``start``/``end``; ``None`` means that side is open). A row is kept
    when its clip id matches and its timestamp lies inside such a window; the
    row is labelled with the name and speaker of the first segment it matches.

    Parameters
    ----------
    df : input frame; it is not modified.
    SITVersion : mapping condition -> clip id -> list of segment dicts with the
        keys 'name', 'speaker', 'start', 'end'.
    condition : key of ``SITVersion`` to use.
    clip_col : column holding the clip id. Values are compared as strings;
        whole-number floats such as 3.0 are treated as "3".
    ts_col : column holding the timestamp compared against start/end.
    inclusive : which window bounds are inclusive: "both", "left", "right"
        or "neither".
    keep_parts : segment names to keep; defaults to the module-level KEEP_PARTS.
    keep_speakers : speakers to keep; defaults to the module-level KEEP_SPEAKERS.

    Returns
    -------
    A new frame with the kept rows (original index and columns preserved) plus
    the columns 'part' and 'speaker'.

    Raises
    ------
    ValueError : if ``inclusive`` is not one of the four accepted values.
    KeyError : if ``condition``, ``clip_col`` or ``ts_col`` is missing.
    """
    if inclusive not in ("both", "left", "right", "neither"):
        raise ValueError(
            "inclusive must be one of 'both', 'left', 'right', 'neither'; "
            f"got {inclusive!r}"
        )
    if keep_parts is None:
        keep_parts = KEEP_PARTS
    if keep_speakers is None:
        keep_speakers = KEEP_SPEAKERS

    spec = SITVersion[condition]
    out = df.copy()

    # Spec keys are strings ('0', '1', ...). A float clip column (e.g. because it
    # contains NaN) would stringify as '0.0' and match nothing, so strip the
    # trailing '.0' from whole-number floats.
    clip_ids = out[clip_col].astype(str)
    if pd.api.types.is_float_dtype(out[clip_col]):
        clip_ids = clip_ids.str.replace(r"\.0$", "", regex=True)
    timestamps = out[ts_col]

    n_rows = len(out)
    keep = np.zeros(n_rows, dtype=bool)
    part = np.full(n_rows, None, dtype=object)
    speaker = np.full(n_rows, None, dtype=object)

    for clip_id, segments in spec.items():
        in_clip = clip_ids.eq(str(clip_id)).to_numpy(dtype=bool, na_value=False)
        if not in_clip.any():
            continue

        for seg in segments:
            name = seg.get("name")
            spk = seg.get("speaker")
            if name not in keep_parts or spk not in keep_speakers:
                continue

            window = _window_mask(timestamps, seg.get("start"), seg.get("end"), inclusive)
            # `~keep`: a row already claimed by an earlier segment keeps that label.
            hit = in_clip & window & ~keep
            part[hit] = name
            speaker[hit] = spk
            keep |= hit

    out["part"] = part
    out["speaker"] = speaker
    return out[keep]


In [9]:
# Label each row with its part/speaker and drop everything outside the kept
# segments; in this table the clip id lives in the "clip" column.
df = pre_process(
    df,
    SITVersion,
    condition="ONLINE",
    clip_col="clip",
    ts_col="timestamp",
)

In [10]:
# Persist the labelled, filtered frame without the index.
# NOTE(review): this is the same path the features were loaded from earlier in
# the notebook, so the original extracted-features file is overwritten by the
# filtered version — confirm this is intended.
df.to_csv(paths.features_2 / "extracted_features.csv", index=False)