The following libraries were used to process the videos and generate full feature outputs. In this code, we only select and use the features relevant to our analysis from their generated results.

1. OpenFace: frame, timestamp, confidence, gaze_angle_x, gaze_angle_y, pose_Rx(pitch_openface), pose_Ry(yaw_openface), pose_Rz(roll_openface), AU12_c(smile_openface), AU12_r
2. PyAFAR: frame, Pitch(pitch_pyafar), Yaw(yaw_pyafar), Roll(roll_pyafar), Occ_au_12(smile_pyafar_pre), Occ_au_4, Occ_au_6
3. Rehg: frame, eye-contact-score(gaze_pyafar_pre)



In [1]:
from pathlib import Path
import sys
import os
sys.path.append(os.path.abspath("..")) 

import numpy as np
import pandas as pd
import re
from scripts.get_paths import get_path

# Display configuration: `None` removes pandas' truncation limits, so any
# DataFrame shown in this notebook renders all of its columns and rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Project path registry from scripts.get_paths. The cells below read its
# `processed`, `features_1` and `features_2` attributes as directories.
paths = get_path()

In [3]:
# Annotation workbook; its `video_id` column defines which participants
# are kept when the per-library feature files are merged below.
annotations_path = Path(paths.processed / "annotations_FD_V2.xlsx")
GT_no_label = pd.read_excel(annotations_path)

# Distinct video ids, in order of first appearance, as a plain Python list.
video_ids = GT_no_label["video_id"]
participants = video_ids.unique().tolist()

#### For features extracted from libraries - select relevant features and merge into one CSV file

## [PyAFAR](https://github.com/AffectAnalysisGroup/PyAFAR)

In [4]:
# ---------- CONFIG ----------
folder = Path(paths.features_2 / "pyafar_infant_features")
out_csv = Path(paths.features_2 / "pyafar_infant_features_merged.csv")

wanted_cols = ["Frame", "Pitch", "Yaw", "Roll", "Occ_au_12", "Occ_au_4","Occ_au_6"]
clip_min, clip_max = 0, 6
# ---------------------------

# File names look like "<participant id>_video_<clip index>.csv".
pattern = re.compile(r"^(?P<pid>\d+)_video_(?P<clip>\d+)\.csv$")

all_rows = []   # one trimmed DataFrame per accepted file
skipped = []    # (file name, reason) for files that could not be used

for csv_path in folder.glob("*.csv"):
    match = pattern.match(csv_path.name)
    if match is None:
        continue

    pid, clip = (int(match.group(key)) for key in ("pid", "clip"))

    # Keep only annotated participants and clips inside the configured range.
    if pid not in participants or clip < clip_min or clip > clip_max:
        continue

    try:
        raw = pd.read_csv(csv_path)
    except Exception as exc:
        skipped.append((csv_path.name, f"read_error: {exc}"))
        continue

    # Use whichever of the wanted columns this file actually provides.
    present = [col for col in wanted_cols if col in raw.columns]
    if not present:
        absent = [col for col in wanted_cols if col not in raw.columns]
        skipped.append((csv_path.name, f"no wanted cols found. missing={absent}"))
        continue

    # Feature columns first, then the two identifier columns.
    all_rows.append(raw[present].assign(participant_id=pid, clip=clip))

if len(all_rows) == 0:
    raise RuntimeError("No matching files/rows found. Check folder, participants list, and clip range.")

merged = pd.concat(all_rows, ignore_index=True)

# Order rows by participant, clip and (when available) frame number.
sort_keys = ["participant_id", "clip"]
if "Frame" in merged.columns:
    sort_keys.append("Frame")
merged = merged.sort_values(sort_keys, kind="stable")

merged.to_csv(out_csv, index=False)

print(f"Rows: {len(merged):,} | Files merged: {len(all_rows)}")
if skipped:
    print("\nSkipped files (first 20):")
    for name, reason in skipped[:20]:
        print(f"  - {name}: {reason}")


Rows: 256,080 | Files merged: 231


In [5]:
# NOTE: `out_csv` must still be the PyAFAR path set by the cell directly above;
# later cells rebind it, so run this cell in order.
pyafar_renames = {
    "Frame": "frame",
    "Pitch": "pitch_pyafar",
    "Yaw": "yaw_pyafar",
    "Roll": "roll_pyafar",
    # 'pre' because pyafar returns likelihood of AU being present, which we
    # convert later to binary using a finetuned threshold
    "Occ_au_12": "smile_pyafar_pre",
}

merged_pyafar = (
    pd.read_csv(out_csv, index_col=False)
    # pyafar starts at 0, but we want to start at 1 to match OpenFace
    .assign(Frame=lambda table: table["Frame"] + 1)
    .rename(columns=pyafar_renames)
)
merged_pyafar.columns

Index(['frame', 'pitch_pyafar', 'yaw_pyafar', 'roll_pyafar',
       'smile_pyafar_pre', 'Occ_au_4', 'Occ_au_6', 'participant_id', 'clip'],
      dtype='object')

## [OpenFace](https://github.com/TadasBaltrusaitis/OpenFace)

In [6]:
# ---------- CONFIG ----------
folder = Path(paths.features_1 / "open_face_features")
out_csv = Path(paths.features_2 / "openface_features_merged.csv")

wanted_cols = ["frame", "timestamp", "confidence", "gaze_angle_x", "gaze_angle_y", "pose_Rx", "pose_Ry", "pose_Rz", "AU12_c","AU12_r"]
clip_min, clip_max = 0, 6
# ---------------------------

# File names look like "<participant id>_video_<clip index>.csv".
pattern = re.compile(r"^(?P<pid>\d+)_video_(?P<clip>\d+)\.csv$")

all_rows = []   # one trimmed DataFrame per accepted file
skipped = []    # (file name, reason) for files that could not be used

# sorted() makes the processing order (and so the `skipped` report) deterministic;
# glob order is otherwise filesystem dependent.
for fp in sorted(folder.glob("*.csv")):
    m = pattern.match(fp.name)
    if not m:
        continue

    pid = int(m.group("pid"))
    clip = int(m.group("clip"))

    # Keep only annotated participants and clips inside the configured range.
    if pid not in participants:
        continue
    if not (clip_min <= clip <= clip_max):
        continue

    try:
        df = pd.read_csv(fp)
    except Exception as e:
        skipped.append((fp.name, f"read_error: {e}"))
        continue

    # select only columns that exist
    available = [c for c in wanted_cols if c in df.columns]
    missing = [c for c in wanted_cols if c not in df.columns]

    if not available:
        skipped.append((fp.name, f"no wanted cols found. missing={missing}"))
        continue

    # Feature columns first, then the two identifier columns.
    df = df[available].copy()
    df["participant_id"] = pid
    df["clip"] = clip

    all_rows.append(df)

if not all_rows:
    raise RuntimeError("No matching files/rows found. Check folder, participants list, and clip range.")

merged = pd.concat(all_rows, ignore_index=True)

# Sort by participant, clip and frame. OpenFace names the frame column "frame"
# (lowercase); checking for "Frame" here would silently drop the frame key.
sort_keys = ["participant_id", "clip"]
if "frame" in merged.columns:
    sort_keys.append("frame")
merged = merged.sort_values(sort_keys, kind="stable")

merged.to_csv(out_csv, index=False)

print(f"Rows: {len(merged):,} | Files merged: {len(all_rows)}")
if skipped:
    print("\nSkipped files (first 20):")
    for name, reason in skipped[:20]:
        print(f"  - {name}: {reason}")


Rows: 255,976 | Files merged: 231


In [7]:
# NOTE: `out_csv` must still be the OpenFace path set by the cell directly above;
# other cells rebind it, so run this cell in order.
openface_renames = {
    "pose_Rx": "pitch_openface",
    "pose_Ry": "yaw_openface",
    "pose_Rz": "roll_openface",
    "AU12_c": "smile_openface",
}

# Reload the merged table from disk and switch to the notebook's column names.
merged_openface = pd.read_csv(out_csv, index_col=False).rename(columns=openface_renames)
merged_openface.columns

Index(['frame', 'timestamp', 'confidence', 'gaze_angle_x', 'gaze_angle_y',
       'pitch_openface', 'yaw_openface', 'roll_openface', 'smile_openface',
       'AU12_r', 'participant_id', 'clip'],
      dtype='object')

## [Rehg eye gaze library](https://github.com/rehg-lab/eye-contact-cnn)

In [8]:

# ---------- CONFIG ----------
folder = Path(paths.features_2 / "gaze_rehg_features")
out_csv = Path(paths.features_2 / "rehg_eye_contact_merged.csv")

clip_min, clip_max = 0, 6
# ---------------------------

# File names look like "<participant id>_video_<clip index>_output.txt".
pattern = re.compile(r"^(?P<pid>\d+)_video_(?P<clip>\d+)_output\.txt$")

all_rows = []   # one cleaned DataFrame per accepted file
skipped = []    # (file name, reason) for files that could not be read

for txt_path in folder.glob("*.txt"):
    match = pattern.match(txt_path.name)
    if match is None:
        continue

    pid, clip = (int(match.group(key)) for key in ("pid", "clip"))

    # Keep only annotated participants and clips inside the configured range.
    if pid not in participants or clip < clip_min or clip > clip_max:
        continue

    score_cols = ["Frame", "eye-contact-score"]
    try:
        # Files look like: "1,0.939229" (no header)
        scores = pd.read_csv(txt_path, header=None, names=score_cols)
    except Exception as exc:
        skipped.append((txt_path.name, f"read_error: {exc}"))
        continue

    # Cleanup: drop incomplete lines, force both columns to numeric, then
    # drop whatever could not be parsed as a number.
    scores = scores.dropna(how="any")
    for col in score_cols:
        scores[col] = pd.to_numeric(scores[col], errors="coerce")
    scores = scores.dropna(subset=score_cols).assign(participant_id=pid, clip=clip)

    all_rows.append(scores[score_cols + ["participant_id", "clip"]])

if len(all_rows) == 0:
    raise RuntimeError("No matching files/rows found. Check folder, participants list, and clip range.")

merged = (
    pd.concat(all_rows, ignore_index=True)
    .sort_values(["participant_id", "clip", "Frame"], kind="stable")
)

merged.to_csv(out_csv, index=False)

print(f"Rows: {len(merged):,} | Files merged: {len(all_rows)}")
if skipped:
    print("\nSkipped files (first 20):")
    for name, reason in skipped[:20]:
        print(f"  - {name}: {reason}")


Rows: 256,079 | Files merged: 231


In [9]:
# NOTE: `out_csv` must still be the Rehg path set by the cell directly above.
rehg_renames = {
    "Frame": "frame",
    # 'pre' because we will convert this to binary using a finetuned threshold later
    # NOTE(review): the name says "pyafar" although the score comes from the
    # Rehg eye-contact files - confirm before renaming, later code may rely on it.
    "eye-contact-score": "gaze_pyafar_pre",
}

# Reload the merged table from disk and switch to the notebook's column names.
merged_rehg = pd.read_csv(out_csv, index_col=False).rename(columns=rehg_renames)
merged_rehg.columns

Index(['frame', 'gaze_pyafar_pre', 'participant_id', 'clip'], dtype='object')

## Merge

In [10]:
# Columns that together identify one video frame across all feature tables.
KEYS = ["participant_id", "clip", "frame"]


def _summarise_missing(only):
    """Per (participant_id, clip): number of unmatched frames and their min/max frame."""
    return (only.groupby(["participant_id", "clip"])["frame"]
            .agg(n_missing="count", min_frame="min", max_frame="max")
            .reset_index()
            .sort_values(["participant_id", "clip"]))


def mismatch_report(a, b, a_name, b_name, verbose=False):
    """Find the frame keys that appear in only one of two feature tables.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Tables that each contain the ``KEYS`` columns. Duplicate keys are
        collapsed before the comparison.
    a_name, b_name : str
        Labels for ``a`` and ``b``; used only in the printed report.
    verbose : bool, default False
        When True, print for each direction a per participant/clip summary
        (count, first and last frame) of the unmatched keys.

    Returns
    -------
    (only_a, only_b) : tuple of pandas.DataFrame
        Key rows (columns ``KEYS``) present in ``a`` but not in ``b``, and
        present in ``b`` but not in ``a``.

    Raises
    ------
    KeyError
        If either table lacks one of the ``KEYS`` columns.
    """
    a_keys = a[KEYS].drop_duplicates()
    b_keys = b[KEYS].drop_duplicates()

    # The outer join tags every key with where it was found (`_merge` column).
    ab = a_keys.merge(b_keys, on=KEYS, how="outer", indicator=True)

    only_a = ab.loc[ab["_merge"] == "left_only", KEYS]
    only_b = ab.loc[ab["_merge"] == "right_only", KEYS]

    # Summaries are built only when they are actually going to be shown.
    if verbose:
        for present, absent, only in ((a_name, b_name, only_a), (b_name, a_name, only_b)):
            print(f"\n=== Frames present in {present} but missing in {absent} ===")
            if only.empty:
                print("None")
            else:
                print(_summarise_missing(only).to_string(index=False))

    return only_a, only_b

# Pairwise mismatch reporting (you can comment any out).
# Each call returns (keys only in the first table, keys only in the second table).
missing_pyafar_vs_openface, missing_openface_vs_pyafar = mismatch_report(merged_pyafar, merged_openface, "pyafar", "openface")

missing_pyafar_vs_rehg, missing_rehg_vs_pyafar = mismatch_report(merged_pyafar, merged_rehg, "pyafar", "rehg")

missing_openface_vs_rehg, missing_rehg_vs_openface = mismatch_report(merged_openface, merged_rehg, "openface", "rehg")


In [11]:
# Inner joins on the frame keys: a frame survives only if PyAFAR, OpenFace
# and Rehg all produced a row for it.
merged_inner = merged_pyafar
for feature_table in (merged_openface, merged_rehg):
    merged_inner = merged_inner.merge(feature_table, on=KEYS, how="inner")

print("Merged (inner, fully aligned) rows:", len(merged_inner))


Merged (inner, fully aligned) rows: 255975


In [12]:
# Persist the fully aligned feature table (row index is not written).
extracted_features_csv = paths.features_2 / "extracted_features.csv"
merged_inner.to_csv(extracted_features_csv, index=False)