In [1]:
from pathlib import Path
import sys
import os
sys.path.append(os.path.abspath("..")) 

import numpy as np
import pandas as pd
import re

from scripts.get_paths import get_path

# Show every column and every row when a DataFrame is displayed (no truncation).
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [2]:
# Project path container from scripts.get_paths; the cells below read its
# `features_2` and `processed` attributes and join file names onto them with `/`.
paths = get_path()

In [3]:
# Frame-level feature table; the annotation columns are added to it below.
features_csv = paths.features_2 / "extracted_features.csv"
df = pd.read_csv(features_csv, index_col=False)

# BORIS annotation exports, one file per annotator (FD, VP) plus LK's nodding file.
df_boris_fd, df_boris_vp, df_boris_lk = (
    pd.read_csv(paths.processed / annotation_csv)
    for annotation_csv in (
        "BORIS_annotations_FD_with_id.csv",
        "BORIS_annotations_VP_with_id.csv",
        "BORIS_annotations_LK_nodding_with_id.csv",
    )
)

## Add annotations 

In [4]:
behaviors = ['gaze', 'smile']


def mark_interval_behaviors(frames, annotations, suffix, behaviors):
    """Flag the frames covered by one annotator's interval annotations.

    For every behavior in ``behaviors`` a column ``f'{behavior}_{suffix}'`` is
    (re)created in ``frames`` and set to 0. Each annotation row whose
    ``'Behavior'`` (stripped, lower-cased) is one of ``behaviors`` then sets
    that column to 1 on the frames that share its ``participant_id`` and
    ``clip`` and whose ``timestamp`` lies in the closed interval
    ``['Start (s)', 'Stop (s)']``.

    Parameters
    ----------
    frames : pandas.DataFrame
        Frame table with ``participant_id``, ``clip`` and ``timestamp``
        columns. Modified in place.
    annotations : pandas.DataFrame
        Annotation table with ``Behavior``, ``Start (s)``, ``Stop (s)``,
        ``participant_id`` and ``clip`` columns.
    suffix : str
        Annotator suffix used in the output column names.
    behaviors : list of str
        Lower-case behavior names to mark.

    Returns
    -------
    int
        Number of matching annotation rows whose interval contained no frame
        (nothing is marked for those rows).
    """
    for b in behaviors:
        frames[f'{b}_{suffix}'] = 0

    unmatched = 0
    for _, row in annotations.iterrows():
        beh = str(row['Behavior']).strip().lower()
        if beh not in behaviors:
            continue

        in_interval = (
            (frames['participant_id'] == row['participant_id']) &
            (frames['clip'] == row['clip']) &
            (frames['timestamp'] >= row['Start (s)']) &
            (frames['timestamp'] <= row['Stop (s)'])
        )

        if in_interval.any():
            frames.loc[in_interval, f'{beh}_{suffix}'] = 1
        else:
            # Kept as a count instead of a print so the cell stays quiet;
            # inspect unmatched_fd / unmatched_vp when debugging the mapping.
            unmatched += 1

    return unmatched


# Same procedure for both annotators; only the annotation table and the
# column suffix differ.
unmatched_fd = mark_interval_behaviors(df, df_boris_fd, 'fd', behaviors)  # annotator 1
unmatched_vp = mark_interval_behaviors(df, df_boris_vp, 'vp', behaviors)  # annotator 2


In [5]:
# Half-width, in seconds, of the window marked around each nod point event.
NOD_WINDOW = 0.5

# Every annotator's nod column starts at 0; marked frames are set to 1 later.
for _nod_col in ('nod_fd', 'nod_vp', 'nod_lk'):
    df[_nod_col] = 0

def mark_nod_point_events(df_ann, out_col, time_field='Start (s)', window=None, frames=None):
    """Mark a time window around every nod point event of one annotator.

    For each row of ``df_ann`` whose ``'Behavior'`` (stripped, lower-cased) is
    ``'nod'`` or ``'n'``:

      - restrict to the frames with the same ``participant_id`` and ``clip``
      - find the frame timestamp nearest to the event time ``row[time_field]``
      - set ``out_col`` to 1 on the frames of that participant/clip whose
        timestamp lies in ``[t_nearest - window, t_nearest + window]``

    Rows with a missing event time, or with no usable frame timestamp for
    their participant/clip, are skipped. (Previously a NaN event time made
    ``np.argmin`` pick position 0, which marked a window around the first
    frame of the clip.)

    Parameters
    ----------
    df_ann : pandas.DataFrame
        Annotation table with ``Behavior``, ``participant_id``, ``clip`` and
        ``time_field`` columns.
    out_col : str
        Column of the frame table that receives the 1s.
    time_field : str, default 'Start (s)'
        Column of ``df_ann`` holding the point-event time.
    window : float, optional
        Half-width of the marked window, in the units of ``timestamp``.
        Defaults to the module-level ``NOD_WINDOW``.
    frames : pandas.DataFrame, optional
        Frame table to mark in place. Defaults to the module-level ``df``.

    Returns
    -------
    None
        The frame table is modified in place.
    """
    if frames is None:
        frames = df
    if window is None:
        window = NOD_WINDOW

    for _, row in df_ann.iterrows():
        beh = str(row['Behavior']).strip().lower()
        if beh not in ('nod', 'n'):
            continue

        t = row[time_field]
        if pd.isna(t):
            # No event time: there is nothing meaningful to anchor a window on.
            continue

        gmask = (
            (frames['participant_id'] == row['participant_id']) &
            (frames['clip'] == row['clip'])
        )

        times = frames.loc[gmask, 'timestamp'].to_numpy(dtype=float)
        # NaN timestamps would win np.argmin, so exclude them from the search.
        times = times[~np.isnan(times)]
        if times.size == 0:
            continue

        # Nearest frame timestamp to the event time (first one on ties).
        t_nearest = times[np.argmin(np.abs(times - t))]

        # Mark +/- window around the nearest timestamp, within this clip only.
        wmask = (
            gmask &
            (frames['timestamp'] >= (t_nearest - window)) &
            (frames['timestamp'] <= (t_nearest + window))
        )
        frames.loc[wmask, out_col] = 1


#print("\n⚠️ Checking NOD point-events that did not map cleanly:\n")
# One pass per annotator: (annotation table, output column, event-time column).
for _nod_ann, _nod_col, _nod_time_field in (
    (df_boris_fd, 'nod_fd', 'Start (s)'),
    (df_boris_vp, 'nod_vp', 'Start (s)'),
    (df_boris_lk, 'nod_lk', 'Time'),
):
    mark_nod_point_events(_nod_ann, _nod_col, time_field=_nod_time_field)


## Add behavior_agreed columns where annotators agree

In [6]:
behaviors = ["gaze", "smile"]


def _pairwise_agreed_frames(fd_labels, vp_labels, min_overlap_frames=2):
    """Return the 0/1 agreement vector for one participant/clip and behavior.

    A frame is an overlap frame when both annotators label it 1. Every run of
    at least ``min_overlap_frames`` consecutive overlap frames is kept and
    then widened in both directions for as long as at least one annotator
    labels both the next frame and its already-included neighbour with 1.

    Parameters
    ----------
    fd_labels, vp_labels : array-like
        Per-frame labels of the two annotators, in frame order and of equal
        length.
    min_overlap_frames : int, default 2
        Minimum length of a strict-overlap run for it to count.

    Returns
    -------
    numpy.ndarray
        Integer array, 1 on agreed frames and 0 elsewhere.
    """
    fd_on = np.asarray(fd_labels) == 1
    vp_on = np.asarray(vp_labels) == 1
    overlap = fd_on & vp_on
    n_frames = len(overlap)

    # linked[i] is True when one annotator labels both frame i and frame i + 1.
    linked = (fd_on[:-1] & fd_on[1:]) | (vp_on[:-1] & vp_on[1:])

    agreed = np.zeros(n_frames, dtype=int)
    pos = 0
    while pos < n_frames:
        if not overlap[pos]:
            pos += 1
            continue

        # Strict overlap run [run_start, run_end].
        run_start = pos
        while pos < n_frames and overlap[pos]:
            pos += 1
        run_end = pos - 1
        if run_end - run_start + 1 < min_overlap_frames:
            continue

        # Backward expansion.
        start = run_start
        while start > 0 and linked[start - 1]:
            start -= 1

        # Forward expansion.
        end = run_end
        while end < n_frames - 1 and linked[end]:
            end += 1

        agreed[start:end + 1] = 1

    return agreed


for behavior in behaviors:
    df[f"{behavior}_agreed"] = 0  # initialize once

# Runs are evaluated inside each participant_id + clip so they never cross
# clip boundaries. Only the two label columns are read per group, and results
# are written back through the group's own index labels, so no helper column
# from reset_index() is needed.
for frame_labels in df.groupby(["participant_id", "clip"]).groups.values():
    for behavior in behaviors:
        df.loc[frame_labels, f"{behavior}_agreed"] = _pairwise_agreed_frames(
            df.loc[frame_labels, f"{behavior}_fd"].to_numpy(),
            df.loc[frame_labels, f"{behavior}_vp"].to_numpy(),
        )


In [7]:
behaviors = ["nod"]  # majority vote for nod_agreed

# Suffixes of the per-annotator label columns (e.g. nod_vp, nod_fd, nod_lk).
annotators = ["vp", "fd", "lk"]


def _majority_agreed_frames(labels, min_votes=2, min_overlap_frames=2):
    """Return the 0/1 majority-agreement vector for one participant/clip.

    A frame is an overlap frame when the row sum of ``labels`` is at least
    ``min_votes``. Every run of at least ``min_overlap_frames`` consecutive
    overlap frames is kept and then widened in both directions for as long as
    at least one annotator labels both the next frame and its already-included
    neighbour with 1.

    Parameters
    ----------
    labels : pandas.DataFrame
        One column per annotator, one row per frame, in frame order.
    min_votes : int, default 2
        Minimum row sum for a frame to count as overlap.
    min_overlap_frames : int, default 2
        Minimum length of a strict-overlap run for it to count.

    Returns
    -------
    numpy.ndarray
        Integer array, 1 on agreed frames and 0 elsewhere.
    """
    overlap = (labels.sum(axis=1) >= min_votes).to_numpy()
    on = labels.to_numpy() == 1
    n_frames = len(on)

    # linked[i] is True when any annotator labels both frame i and frame i + 1.
    linked = (on[:-1] & on[1:]).any(axis=1)

    agreed = np.zeros(n_frames, dtype=int)
    pos = 0
    while pos < n_frames:
        if not overlap[pos]:
            pos += 1
            continue

        # Strict overlap run [run_start, run_end].
        run_start = pos
        while pos < n_frames and overlap[pos]:
            pos += 1
        run_end = pos - 1
        if run_end - run_start + 1 < min_overlap_frames:
            continue

        # Backward expansion.
        start = run_start
        while start > 0 and linked[start - 1]:
            start -= 1

        # Forward expansion.
        end = run_end
        while end < n_frames - 1 and linked[end]:
            end += 1

        agreed[start:end + 1] = 1

    return agreed


# initialize agreed columns once
for behavior in behaviors:
    df[f"{behavior}_agreed"] = 0

# Runs are evaluated inside each participant_id + clip so they never cross
# clip boundaries. Only the annotator columns are read per group, and results
# are written back through the group's own index labels, so no helper column
# from reset_index() is needed.
for frame_labels in df.groupby(["participant_id", "clip"]).groups.values():
    for behavior in behaviors:
        cols = [f"{behavior}_{ann}" for ann in annotators]
        df.loc[frame_labels, f"{behavior}_agreed"] = _majority_agreed_frames(
            df.loc[frame_labels, cols]
        )


In [8]:
# NOTE: this is the same path the feature table was loaded from, so the input
# file is overwritten with the annotated version.
output_csv = paths.features_2 / "extracted_features.csv"
df.to_csv(output_csv, index=False)