# **Dataset Builder**

**NOTE: the videos and the inference CSV (created by the model once you run the code) will be stored in your personal Drive. This notebook is not compatible with shared drive paths.**

In [None]:
# Upper bound on the number of files accepted from the video folder; the
# file-listing cell below reports an error and clears its list when exceeded.
MAX_VIDEOS = 1
# Window size setting. NOTE(review): not referenced by any cell visible in this
# notebook - presumably consumed by a later windowing step; confirm.
WINDOW_SIZE = 10

## **Read videos from Drive**

In [None]:
# Mount Google Drive at /content/drive; every Drive path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**TODO:** Make sure that the folder has no more than 10 videos. If the notebook gives errors about limits, try a smaller number of videos. Adjust the allowed number of videos by changing the `MAX_VIDEOS` variable at the top of the notebook.

**TODO:** Make the following 3 folders in "My Drive" (in Google Drive):
1. Dataset Videos
2. Dataset CSV
3. Combined Datasets

**TODO:** Change the file path in the code cell below, according to which batch of videos you are running and how you organized your folder.

In [None]:
# Drive folder holding the batch of videos to process (edit per batch, see the TODO above).
folder_path = '/content/drive/MyDrive/[Personal] AI4Good Lab/Training'

In [None]:
import os

# Collect the full path of every regular file in `folder_path`. If more than
# MAX_VIDEOS files are found, report it and fall back to an empty list so the
# later cells have nothing to process.
try:
    all_files = os.listdir(folder_path)
    file_paths = [
        full_path
        for full_path in (os.path.join(folder_path, entry) for entry in all_files)
        if os.path.isfile(full_path)
    ]

    if not len(file_paths) <= MAX_VIDEOS:
        raise ValueError(f"Too many videos loaded: {len(file_paths)} exceeds the allowed limit of {MAX_VIDEOS}.")

except ValueError as err:
    print("Error:", err)
    file_paths = []  # nothing downstream should run on an oversized batch

file_paths

['/content/drive/MyDrive/[Personal] AI4Good Lab/Training/NP_unfocused_no_AUs_1_left.mp4']

In [None]:
# Folder where process_video() writes one inference CSV per processed video.
# NOTE: this rebinds `folder_path`, which until now held the video folder.
folder_path = '/content/drive/MyDrive/Dataset CSV'
all_inference_files = os.listdir(folder_path)
# Iterate the names found in *this* folder. The earlier version looped over
# `all_files` (the video folder listing), so existing CSVs were never matched.
inference_file_paths = [
    os.path.join(folder_path, f)
    for f in all_inference_files
    if os.path.isfile(os.path.join(folder_path, f))
]
inference_file_paths

[]

## Process Videos extracted from Drive

### Helper functions (No need to run again if already run once and the session has not expired)

In [None]:
def install_requirements():
  """Install the Python dependencies and download the OpenFace code bundle and weights.

  Steps, all executed through Colab shell commands:
    1. Write a requirements list to /content/OpenFace-3.0/requirements.txt and
       pip-install it (plus gdown).
    2. Download openface.zip from Google Drive with gdown, move it into
       /content/OpenFace-3.0/ and unzip it there.
    3. Patch the scipy import in STAR's fr_and_auc.py (simps -> simpson).
    4. Download the weights folder into /content/openface_weights.

  The requirements file is opened for writing inside /content/OpenFace-3.0, so
  that directory must already exist. The zip is downloaded to the current
  directory and then moved from /content, so the current directory is expected
  to be /content when this runs.
  """
  requirements = """
  torch>=2.0
  torchvision
  torchaudio
  opencv-python
  pillow
  numpy>=1.26.0,<2.1.0
  gdown
  huggingface-hub>=0.30.0,<1.0.0
  tensorboardX
  """.strip()

  with open("/content/OpenFace-3.0/requirements.txt", "w") as f:
      f.write(requirements)

  !pip install -r /content/OpenFace-3.0/requirements.txt
  !pip install -q gdown

  # Fetch the openface package archive by its Google Drive file id.
  !gdown --id 1DP4gsbRtA9fD4zYC6vzfj2sL1sLy3iVu -O openface.zip
  !mv /content/openface.zip /content/OpenFace-3.0/

  !unzip -q /content/OpenFace-3.0/openface.zip -d /content/OpenFace-3.0/

  # Rewrite the `simps` import to use `simpson` under the old name, then print
  # the patched line so the change can be checked in the cell output.
  !sed -i 's/from scipy.integrate import simps/from scipy.integrate import simpson as simps/' /content/OpenFace-3.0/openface/STAR/lib/metric/fr_and_auc.py
  !grep 'scipy.integrate' /content/OpenFace-3.0/openface/STAR/lib/metric/fr_and_auc.py

  !mkdir -p /content/openface_weights

  # Download the model weights folder that load_model() reads from.
  !gdown --folder https://drive.google.com/drive/folders/1aBEol-zG_blHSavKFVBH9dzc9U9eJ92p -O /content/openface_weights



In [None]:
def get_gitrepo():
  !rm -rf /content/OpenFace-3.0
  !git clone https://github.com/CMU-MultiComp-Lab/OpenFace-3.0.git
  # %cd OpenFace-3.0

  !rm -rf /content/OpenFace-3.0/STAR
  !rm -rf /content/OpenFace-3.0/Pytorch_Retinaface/

  !mkdir -p /content/OpenFace-3.0

  # Clone RetinaFace directly into parent folder
  !git clone https://github.com/biubug6/Pytorch_Retinaface.git /content/OpenFace-3.0/Pytorch_Retinaface

  # Repeat with STAR repo
  !mkdir -p /content/OpenFace-3.0
  !git clone https://github.com/ZhenglinZhou/STAR.git /content/OpenFace-3.0/STAR

  install_requirements()

In [None]:
def load_model():
    """Build the three OpenFace components from the weights in /content/openface_weights.

    Uses 'cuda' when PyTorch reports a CUDA device and 'cpu' otherwise. On CUDA
    the landmark detector is additionally given device_ids=[0].

    Returns:
        (face_detector, landmark_detector, multitask_model)
    """
    import torch
    from openface.face_detection import FaceDetector
    from openface.landmark_detection import LandmarkDetector
    from openface.multitask_model import MultitaskPredictor

    weights_dir = '/content/openface_weights'
    use_gpu = torch.cuda.is_available()
    device = 'cuda' if use_gpu else 'cpu'

    face_detector = FaceDetector(
        model_path=f'{weights_dir}/Alignment_RetinaFace.pth',
        device=device,
    )

    # Only the CUDA path passes an explicit device id list.
    landmark_kwargs = {
        'model_path': f'{weights_dir}/Landmark_98.pkl',
        'device': device,
    }
    if use_gpu:
        landmark_kwargs['device_ids'] = [0]
    landmark_detector = LandmarkDetector(**landmark_kwargs)

    multitask_model = MultitaskPredictor(
        model_path=f'{weights_dir}/MTL_backbone.pth',
        device=device,
    )

    return face_detector, landmark_detector, multitask_model

DEF PROCESS_VIDEO THAT INCLUDES FLIPPING FOR MIRRORED VIDEOS

In [None]:
def process_video(file_path, face_detector, landmark_detector, multitask_model, id, user_trigger, flip=False):
    """Run per-frame inference on one video and save the results as a CSV on Drive.

    Every frame is written to /content/frame_XXXXX.jpg, passed to the face
    detector, and - when a face is returned - to the landmark detector and the
    multitask model. One result row is stored per frame with a usable face.

    The CSV is written to "/content/drive/MyDrive/Dataset CSV/{id}.csv" and has
    two sections separated by a blank line: a small Attribute/Value table (FPS
    and trigger point) followed by the per-frame results.

    Args:
        file_path: Path of the video to process.
        face_detector: Object exposing get_face(image_path) -> (cropped_face, dets).
        landmark_detector: Object exposing detect_landmarks(frame, dets).
        multitask_model: Object exposing predict(cropped_face) ->
            (emotion_logits, gaze_output, au_output).
        id: Identifier stored in every row and used as the CSV file name.
        user_trigger: Trigger point stored in the CSV metadata.
        flip: When True, a horizontally flipped temporary copy of the video is
            created and processed instead of the original.

    Raises:
        Exception: If the (possibly flipped) video cannot be opened.
    """
    print("\n\n-------------------------------------------------------------------\n\n")
    print(f"Processing video: {file_path}")

    # Step 1: If flip is requested, create a temporary flipped video
    flipped_path = None
    if flip:
        flipped_path = f"/content/flipped_{os.path.basename(file_path)}"
        flip_video_horizontally(file_path, flipped_path)
        file_path = flipped_path  # use the flipped one from now on
        print(f"⚠️ Used flipped video: {file_path}")

    # Step 2: Continue with normal processing
    results = []
    frame_idx = 0
    cap = cv2.VideoCapture(file_path)
    try:
        if not cap.isOpened():
            raise Exception(f"Failed to open video: {file_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        trigger_point = user_trigger

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            timestamp = time.time()  # wall-clock time of processing, not video time
            frame_path = f"/content/frame_{frame_idx:05d}.jpg"
            # The face detector is called with a file path, so the frame goes to disk first.
            cv2.imwrite(frame_path, frame)

            try:
                cropped_face, dets = face_detector.get_face(frame_path)

                if cropped_face is not None and dets is not None:
                    landmarks = landmark_detector.detect_landmarks(frame, dets)
                    # Explicit emptiness test: plain truthiness is ambiguous for arrays.
                    has_landmarks = landmarks is not None and len(landmarks) > 0

                    try:
                        emotion_logits, gaze_output, au_output = multitask_model.predict(cropped_face)
                        emotion_idx = torch.argmax(emotion_logits, dim=1).item()

                        results.append({
                            'id': id,
                            'timestamp': timestamp,
                            'image_path': frame_path,
                            'face_id': 0,
                            'face_detection': dets[0].tolist(),
                            'landmarks': landmarks[0].tolist() if has_landmarks else None,
                            'emotion': emotion_idx,
                            'gaze_yaw': gaze_output[0][0].item(),
                            'gaze_pitch': gaze_output[0][1].item(),
                            'action_units': au_output.tolist(),
                            # Position of the frame in the video. Rows are only stored
                            # for frames with a usable face, so the row number alone
                            # does not identify the frame once any frame is skipped.
                            'frame_idx': frame_idx,
                        })

                    except RuntimeError as e:
                        # Any CUDA-related runtime error (out-of-memory included)
                        # stops the video; what was collected so far is still saved.
                        if 'cuda' in str(e).lower():
                            print(f"GPU error encountered on frame {frame_idx}: {e}")
                            torch.cuda.empty_cache()
                            break
                        raise

            except Exception as e:
                # Best effort: a failure on one frame must not abort the whole video.
                print(f"Skipping frame {frame_idx} due to error: {e}")

            frame_idx += 1
    finally:
        cap.release()
        # The flipped copy is only an intermediate; remove it so repeated runs
        # do not accumulate videos on the Colab disk.
        if flipped_path is not None:
            try:
                os.remove(flipped_path)
            except OSError:
                pass

    if results:
        metadata = pd.DataFrame({
            'Attribute': ['FPS', 'Trigger Point'],
            'Value': [fps, trigger_point]
        })

        df_results = pd.DataFrame(results)
        csv_path = f"/content/drive/MyDrive/Dataset CSV/{id}.csv"

        # Metadata table, blank line, then the per-frame table - the loading
        # cell relies on this layout (skiprows=1/nrows=2, then skiprows=3).
        with open(csv_path, 'w') as f:
            metadata.to_csv(f, index=False)
            f.write('\n')
            df_results.to_csv(f, index=False)

        print(f"Processed {frame_idx} frames.")
    else:
        print("No usable frames were processed. Skipping save.")


### Get Git Repo (No need to run again if already run once and the session has not expired)

In [None]:
# Clone OpenFace-3.0 and its sub-repositories, then install dependencies and weights.
get_gitrepo()

Cloning into 'OpenFace-3.0'...
remote: Enumerating objects: 215, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 215 (delta 29), reused 14 (delta 14), pack-reused 176 (from 1)[K
Receiving objects: 100% (215/215), 105.17 MiB | 15.10 MiB/s, done.
Resolving deltas: 100% (77/77), done.
Cloning into '/content/OpenFace-3.0/Pytorch_Retinaface'...
remote: Enumerating objects: 123, done.[K
remote: Total 123 (delta 0), reused 0 (delta 0), pack-reused 123 (from 1)[K
Receiving objects: 100% (123/123), 6.81 MiB | 3.92 MiB/s, done.
Resolving deltas: 100% (41/41), done.
Cloning into '/content/OpenFace-3.0/STAR'...
remote: Enumerating objects: 104, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 104 (delta 18), reused 13 (delta 13), pack-reused 73 (from 1)[K
Receiving objects: 100% (104/104), 995.30 KiB | 26.19 MiB/s, done.
Resolving deltas: 100% (

In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import csv
import time
import hashlib
import pandas as pd
import ast

In [None]:
# Stop here when PyTorch cannot see a CUDA device; otherwise confirm and go on.
if not torch.cuda.is_available():
    raise Exception("Not using GPU, if this is intended continue, otherwise switch!")
print("Using GPU, continue!")

Using GPU, continue!


### Load Model (No need to run again if already run once and the session has not expired)

In [None]:
%cd OpenFace-3.0/
from openface.face_detection import FaceDetector
from openface.landmark_detection import LandmarkDetector
from openface.multitask_model import MultitaskPredictor

face_detector, landmark_detector, multitask_model = load_model()

/content/OpenFace-3.0


  _C._set_default_tensor_type(t)
INFO:root:Loaded configure file alignment: 94184a6d-8b44-452a-a8b8-f3b204c83cfd
2025-06-19 21:20:01,718 INFO    : Loaded configure file alignment: 94184a6d-8b44-452a-a8b8-f3b204c83cfd
INFO:root:
type: alignment
id: 94184a6d-8b44-452a-a8b8-f3b204c83cfd
note: 
ckpt_dir: /work/jiewenh/openFace/OpenFace-3.0/STAR
image_dir: ./WFLW/WFLW_images
annot_dir: ./
loader_type: alignment
loss_func: STARLoss_v2
batch_size: 32
val_batch_size: 32
test_batch_size: 16
channels: 3
width: 256
height: 256
means: (127.5, 127.5, 127.5)
scale: 0.00784313725490196
display_iteration: 10
milestones: [200, 350, 450]
max_epoch: 500
net: stackedHGnet_v1
nstack: 4
optimizer: adam
learn_rate: 0.001
momentum: 0.01
weight_decay: 1e-05
nesterov: False
scheduler: MultiStepLR
gamma: 0.1
loss_weights: [0.125, 1.25, 1.25, 0.25, 2.5, 2.5, 0.5, 5.0, 5.0, 1.0, 10.0, 10.0]
criterions: ['STARLoss_v2', 'AWingLoss', 'AWingLoss', 'STARLoss_v2', 'AWingLoss', 'AWingLoss', 'STARLoss_v2', 'AWingLoss', 'A

Loading pretrained model from /content/openface_weights/Alignment_RetinaFace.pth
remove prefix 'module.'
Missing keys:0
Unused checkpoint keys:0
Used keys:300
Namespace(config_name='alignment', device_id='cuda:0')


  model = create_fn(


Loading multitask model from /content/openface_weights/MTL_backbone.pth...


## Run Model

In [None]:
# Ids that already have an output file: each entry of the inference folder
# listing with its directory part and extension stripped.
processed_ids = set(
    map(lambda entry: os.path.splitext(os.path.basename(entry))[0], all_inference_files)
)

processed_ids

set()

**NOTE: If your session expires while you are processing a set of videos, you can run this again with the same folder. The code will not reprocess the videos if they already have an inference CSV created!**

 CODE TO FLIP MIRRORED VIDEOS

In [None]:
# Add this function BEFORE processing loop
def flip_video_horizontally(input_path, output_path):
    """Write a horizontally mirrored copy of a video.

    The output keeps the input's frame size and FPS and is encoded with the
    'mp4v' fourcc.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the mirrored video to create.

    Raises:
        Exception: If the input cannot be opened or the output writer cannot
            be created. Without these checks a failure only surfaced later,
            when the missing or empty output file could not be opened.
    """
    cap = cv2.VideoCapture(input_path)
    out = None
    try:
        if not cap.isOpened():
            raise Exception(f"Failed to open video: {input_path}")

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise Exception(f"Failed to create flipped video: {output_path}")

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            out.write(cv2.flip(frame, 1))  # flipCode 1 = flip around the vertical axis
    finally:
        # Always release both handles, including when reading or writing fails.
        cap.release()
        if out is not None:
            out.release()


# **Entering the trigger points for each video**

## Input your trigger points here:

In [None]:
# Ask for one trigger point (in seconds; it is later compared against
# frame_index / fps) per video, in the same order as `file_paths`.
trigger_points = []

for file_path in file_paths:
    # Re-prompt until the entry parses as a finite number. An invalid entry
    # would otherwise be written into the CSV metadata and only fail later,
    # when the dataset is built.
    while True:
        trigger_point = input(f"Enter the trigger point for video {file_path}: ").strip()
        try:
            parsed_value = float(trigger_point)
            # NaN is the only value not equal to itself; also reject +/- infinity.
            is_valid = parsed_value == parsed_value and abs(parsed_value) != float('inf')
        except ValueError:
            is_valid = False
        if is_valid:
            break
        print(f"Invalid trigger point {trigger_point!r}: please enter a number of seconds, e.g. 3.5")
    trigger_points.append(trigger_point)

Enter the trigger point for video /content/drive/MyDrive/[Personal] AI4Good Lab/Training/NP_unfocused_no_AUs_1_left.mp4: 3.5


## Confirm that your trigger points are correct here:
If they are not correct:
1. Re-run the code cell above (where you have to input the trigger point for each video)
2. Re-run the code cell below to confirm that your trigger points are correct

In [None]:
# Echo each video next to the trigger point entered for it.
for i, video_path in enumerate(file_paths):
  print(f"Trigger point for video {video_path}: {trigger_points[i]}")

Trigger point for video /content/drive/MyDrive/[Personal] AI4Good Lab/Training/NP_unfocused_no_AUs_1_left.mp4: 3.5


# **Processing Each Video:**

In [None]:
# The two lists are paired by position, so they must describe the same batch.
# A mismatch means the video list changed after the trigger points were entered.
if len(trigger_points) != len(file_paths):
    raise ValueError(
        f"Got {len(trigger_points)} trigger points for {len(file_paths)} videos; "
        "re-run the trigger point cell before processing."
    )

for file_path, trigger_point in zip(file_paths, trigger_points):
    # Stable id derived from the full path of the video.
    base_id = hashlib.sha256(file_path.encode()).hexdigest()

    # NOTE(review): the test looks at the whole path, so a folder name that
    # contains "right" marks every video inside it as mirrored - confirm intended.
    is_mirrored = 'right' in file_path.lower()
    # Named `video_id` rather than `id` so the builtin id() is not shadowed
    # for the rest of the notebook session.
    video_id = f"{base_id}_right" if is_mirrored else base_id

    if video_id in processed_ids:
        print(f"Skipping already processed video: {file_path}")
        continue

    # Mirrored videos are flipped back before inference.
    process_video(file_path, face_detector, landmark_detector, multitask_model, video_id, trigger_point, flip=is_mirrored)




-------------------------------------------------------------------


Processing video: /content/drive/MyDrive/[Personal] AI4Good Lab/Training/NP_unfocused_no_AUs_1_left.mp4
Processing face: (np.int64(425), np.int64(160), np.int64(673), np.int64(526)), confidence: 0.9991305470466614


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Processing face: (np.int64(328), np.int64(449), np.int64(375), np.int64(498)), confidence: 0.04562646523118019
Processing face: (np.int64(0), np.int64(406), np.int64(10), np.int64(421)), confidence: 0.03617454320192337
Processing face: (np.int64(426), np.int64(160), np.int64(672), np.int64(525)), confidence: 0.9991661310195923
Processing face: (np.int64(0), np.int64(406), np.int64(10), np.int64(421)), confidence: 0.03376621752977371
Processing face: (np.int64(341), np.int64(455), np.int64(391), np.int64(517)), confidence: 0.029721705242991447
Processing face: (np.int64(907), np.int64(316), np.int64(1127), np.int64(565)), confidence: 0.024716302752494812
Processing face: (np.int64(847), np.int64(680), np.int64(901), np.int64(736)), confidence: 0.020852338522672653
Processing face: (np.int64(426), np.int64(161), np.int64(672), np.int64(525)), confidence: 0.9991753697395325
Processing face: (np.int64(0), np.int64(406), np.int64(10), np.int64(421)), confidence: 0.03107859194278717
Processi

## Read the inference files created for the videos

#### Make Dataset

In [None]:
def flatten_nested_list(s):
    """Parse a Python-literal string and return its items as one flat list.

    Nested lists are expanded depth-first, left to right. Items that are not
    lists (tuples included) are kept as they are. If the text cannot be parsed
    as a literal, an empty list is returned.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return []

    flat = []
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(parsed)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend into the sub-list; the outer iterator resumes afterwards.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current iterator is exhausted: return to its parent.
            pending.pop()
    return flat

In [None]:
# Column names for the action-unit values, in the order they are read from the
# flattened 'action_units' list (position k -> au_labels[k]).
au_labels = [
    "au1_innerbrow", "au2_outerbrow", "au4_browlower",
    "au6_cheekraise", "au9_nosewrinkle", "au12_lipcorner",
    "au25_lipspart", "au26_jawdrop"
]
# Columns, in order, of the frame returned by make_dataset().
FEATURES_LABEL = ['id', 'relative_time','left_eye_x', 'left_eye_y', 'right_eye_x', 'right_eye_y', 'face_width', 'gaze_pitch', 'gaze_yaw'] + au_labels + ['is_focused']

# Position inside the parsed 'face_detection' list of each eye coordinate.
_EYE_POSITIONS = {'left_eye_x': 5, 'left_eye_y': 6, 'right_eye_x': 7, 'right_eye_y': 8}

def make_dataset(df, fps, trigger_point):
  """Turn the per-frame inference table of one video into a feature table.

  Args:
    df: Per-frame results with at least the columns 'id', 'face_detection',
      'gaze_pitch', 'gaze_yaw' and 'action_units' ('face_detection' and
      'action_units' hold list literals as text). It is not modified.
    fps: Frames per second of the video; must be positive.
    trigger_point: Time in seconds (number or numeric string). Rows up to and
      including the one closest to it are labelled is_focused=True.

  Returns:
    A new DataFrame with exactly the FEATURES_LABEL columns.

  Raises:
    ValueError: If fps is not a positive number.
  """
  fps = float(fps)
  if not fps > 0:  # also rejects NaN
    raise ValueError(f"fps must be a positive number, got {fps}")
  trigger_point = float(trigger_point)

  # Work on a copy so helper columns never leak into the caller's frame.
  df = df.copy()

  # Prefer the real frame position when the CSV carries it: rows exist only
  # for frames with a usable face, so the row number under-counts time as soon
  # as a frame was skipped. Files without the column keep the row-based time.
  if 'frame_idx' in df.columns:
    df['relative_time'] = df['frame_idx'] / fps
  else:
    df['relative_time'] = df.index / fps

  # Row closest in time to the trigger point; everything up to it is "focused".
  closest_idx = (df['relative_time'] - trigger_point).abs().idxmin()
  df['is_focused'] = df.index <= closest_idx

  # Parse each detection once and reuse it for every derived column.
  detections = df['face_detection'].apply(ast.literal_eval)
  for column, position in _EYE_POSITIONS.items():
    df[column] = detections.apply(lambda det, k=position: det[k])
  df['face_width'] = detections.apply(lambda det: det[2] - det[0])

  au_values = df['action_units'].apply(flatten_nested_list)
  for position, label in enumerate(au_labels):
    df[label] = au_values.apply(lambda values, k=position: values[k])

  return df[FEATURES_LABEL]

In [None]:
# Re-scan the inference folder (it has new files after processing) and keep
# only the entries whose name ends in '.csv'.
folder_path = "/content/drive/MyDrive/Dataset CSV"
all_inference_files = os.listdir(folder_path)
inference_file_paths = list(
    map(
        lambda csv_name: os.path.join(folder_path, csv_name),
        filter(lambda entry: entry.endswith('.csv'), all_inference_files),
    )
)
print("Found files:", inference_file_paths)


Found files: ['/content/drive/MyDrive/Dataset CSV/7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b9062274c610af07498.csv']


In [None]:
# Build one feature table per inference CSV. Each file has a two-row metadata
# section (FPS, trigger point), a blank line, then the per-frame table. A file
# that cannot be read or parsed is reported and skipped.
df_list = []
for file_path in inference_file_paths:
    # Metadata: skip the "Attribute,Value" header line and read the two value rows.
    try:
        metadata_df = pd.read_csv(file_path, skiprows=1, nrows=2, encoding='utf-8', header=None)

        if len(metadata_df) < 2:
            raise ValueError("Metadata has fewer than 2 rows")
    except (UnicodeDecodeError, ValueError) as e:
        print(f"❌ Metadata read failed for {file_path}: {e}")
        continue

    try:
        fps = float(metadata_df.iloc[0, 1])
        trigger_point = float(metadata_df.iloc[1, 1])
    except Exception as e:
        print(f"❌ Metadata parse error for {file_path}: {e}")
        continue

    # Per-frame table: retry with a permissive encoding on a decode error. Any
    # other failure skips this file instead of aborting the whole loop.
    try:
        try:
            df = pd.read_csv(file_path, skiprows=3, encoding='utf-8', header=0)
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, skiprows=3, encoding='ISO-8859-1', header=0)
    except Exception as e:
        print(f"❌ Data read failed for {file_path}: {e}")
        continue

    try:
        df_features = make_dataset(df, fps, trigger_point)
        df_list.append(df_features)
    except Exception as e:
        print(f"❌ Feature extraction failed for {file_path}: {e}")
        continue

# Final check
print(f"✅ Processed {len(df_list)} videos.")
# Preview the first table; guarded so an empty result does not raise IndexError.
df_list[0].head() if df_list else None


✅ Processed 1 videos.


Unnamed: 0,id,relative_time,left_eye_x,left_eye_y,right_eye_x,right_eye_y,face_width,gaze_pitch,gaze_yaw,au1_innerbrow,au2_outerbrow,au4_browlower,au6_cheekraise,au9_nosewrinkle,au12_lipcorner,au25_lipspart,au26_jawdrop,is_focused
0,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.0,485.062103,307.088806,606.851013,310.311981,248.264221,-0.166144,-0.231094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.033333,484.695679,307.459015,605.949707,310.3862,246.508331,-0.174508,-0.220065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.066667,484.878296,307.691101,605.687378,310.583954,246.179291,-0.160547,-0.239625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.1,485.084808,309.069092,605.297913,311.70578,244.376831,-0.145357,-0.258751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.133333,484.641327,309.374542,605.026794,311.859314,243.76178,-0.140043,-0.253606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


DEBUGGING

In [None]:
# Debug: list the inference folder with file sizes.
!ls -lh "/content/drive/MyDrive/Dataset CSV"


total 964K
-rw------- 1 root root 964K Jun 19 18:01 7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b9062274c610af07498.csv


In [None]:
# Debug: show the first five lines of one specific inference CSV. The file name
# is hard-coded, so the command fails when that file is not in the folder.
!head -n 5 "/content/drive/MyDrive/Dataset CSV/8af7c011bcdcb1b711a6b1d3d3edbd4b12fc3ce4b524c0666e4a6e9e45c1dfb0_right.csv"


head: cannot open '/content/drive/MyDrive/Dataset CSV/8af7c011bcdcb1b711a6b1d3d3edbd4b12fc3ce4b524c0666e4a6e9e45c1dfb0_right.csv' for reading: No such file or directory


In [None]:
# Stack the per-video feature tables into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(df_list, ignore_index=True)
combined_df

Unnamed: 0,id,relative_time,left_eye_x,left_eye_y,right_eye_x,right_eye_y,face_width,gaze_pitch,gaze_yaw,au1_innerbrow,au2_outerbrow,au4_browlower,au6_cheekraise,au9_nosewrinkle,au12_lipcorner,au25_lipspart,au26_jawdrop,is_focused
0,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.000000,485.062103,307.088806,606.851013,310.311981,248.264221,-0.166144,-0.231094,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,True
1,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.033333,484.695679,307.459015,605.949707,310.386200,246.508331,-0.174508,-0.220065,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,True
2,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.066667,484.878296,307.691101,605.687378,310.583954,246.179291,-0.160547,-0.239625,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,True
3,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.100000,485.084808,309.069092,605.297913,311.705780,244.376831,-0.145357,-0.258751,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,True
4,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.133333,484.641327,309.374542,605.026794,311.859314,243.761780,-0.140043,-0.253606,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,7.266667,368.107605,329.454926,454.035461,331.693451,237.969025,0.053797,-0.212398,0.313570,0.209053,0.000000,0.000102,0.0,0.000000,0.013147,0.010420,False
219,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,7.300000,368.569336,329.222015,453.345551,331.742279,237.128479,0.018055,-0.210191,0.212448,0.139855,0.000000,0.000154,0.0,0.000000,0.014393,0.011650,False
220,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,7.333333,368.074005,329.760437,453.486694,332.356445,238.444733,0.047588,-0.201225,0.360681,0.273119,0.000000,0.000146,0.0,0.000468,0.015878,0.015509,False
221,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,7.366667,368.458954,329.288391,453.111237,332.112396,237.828064,0.038028,-0.210955,0.191103,0.160093,0.000800,0.000220,0.0,0.002542,0.019276,0.021444,False


TODO: Put in relevant path

In [None]:
# Save the combined feature table. NOTE: this writes into the same folder that
# is scanned for per-video inference CSVs, so on a later run this file is
# picked up by the '.csv' filter as well.
combined_df.to_csv("/content/drive/MyDrive/Dataset CSV/test_data.csv", index=False)

# Testing EDA

In [None]:
# Look at the first 100 rows of the combined table.
combined_df.head(100)

Unnamed: 0,id,relative_time,left_eye_x,left_eye_y,right_eye_x,right_eye_y,face_width,gaze_pitch,gaze_yaw,au1_innerbrow,au2_outerbrow,au4_browlower,au6_cheekraise,au9_nosewrinkle,au12_lipcorner,au25_lipspart,au26_jawdrop,is_focused
0,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.000000,485.062103,307.088806,606.851013,310.311981,248.264221,-0.166144,-0.231094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.033333,484.695679,307.459015,605.949707,310.386200,246.508331,-0.174508,-0.220065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.066667,484.878296,307.691101,605.687378,310.583954,246.179291,-0.160547,-0.239625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.100000,485.084808,309.069092,605.297913,311.705780,244.376831,-0.145357,-0.258751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,0.133333,484.641327,309.374542,605.026794,311.859314,243.761780,-0.140043,-0.253606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,3.166667,485.210754,307.202148,607.027771,310.532990,247.033966,-0.075144,-0.252884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
96,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,3.200000,485.192017,306.801239,606.356140,310.533203,247.042816,-0.086414,-0.236737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
97,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,3.233333,485.140533,306.361328,605.427734,310.661011,246.583679,-0.079631,-0.226843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
98,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,3.266667,485.388306,306.012726,606.096375,311.124573,246.370178,-0.095276,-0.229927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


# Testing mirrored videos

In [None]:
import pandas as pd

# Read a previously saved combined dataset. This rebinds `combined_df`,
# replacing the frame built earlier in the notebook.
combined_df = pd.read_csv("/content/drive/MyDrive/Combined Datasets/combined_dataset_AS_unfocused_no_aus_right.csv")

# Rows whose id contains "right" (case-insensitive) are flagged as mirrored.
combined_df['is_mirrored'] = combined_df['id'].apply(lambda video_id: 'right' in video_id.lower())

# Summary statistics of gaze_yaw for each group.
print("Original (Left) gaze_yaw stats:")
not_mirrored_rows = combined_df[~combined_df['is_mirrored']]
print(not_mirrored_rows['gaze_yaw'].describe())

print("\nMirrored (Right) gaze_yaw stats:")
mirrored_rows = combined_df[combined_df['is_mirrored']]
print(mirrored_rows['gaze_yaw'].describe())


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Combined Datasets/combined_dataset_AS_unfocused_no_aus_right.csv'

In [None]:
# Compare the mean left_eye_x of the two groups. Needs the 'is_mirrored' column
# added to `combined_df` by the previous cell.
print("Left eye x (mean) — Original:", combined_df[~combined_df['is_mirrored']]['left_eye_x'].mean())
print("Left eye x (mean) — Mirrored:", combined_df[combined_df['is_mirrored']]['left_eye_x'].mean())


Left eye x (mean) — Original: 910.2417380903021
Left eye x (mean) — Mirrored: 1020.4845182856056


ASK JAKE: PROBABLY HERE WE WILL NEED TO MERGE ALL OF OUR DATASETS TO CONTINUE WITH THE FOLLOWING: NORMALIZE, WINDOW SLIDING?

NEXT STEPS:

Confirm File Path
Ensure that the file combined_dataset.csv exists (CHECK FILENAME)

In [None]:
import os
# Show which combined dataset files exist, to check the file name used for loading.
os.listdir("/content/drive/MyDrive/Combined Datasets/")


['Combined Dataset HI.csv',
 'combined_dataset_normalized_HI.csv',
 'Combined Dataset AS.csv',
 'combined_dataset_normalized_AS.csv',
 'combined_dataset_normalized_JP.csv',
 'Copy of combined_dataset_normalized_all3.csv',
 'dataset_complied_v1.csv',
 'combined_dataset_allvideos.csv']

# Normalize Features

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Start from the combined dataset saved earlier in the notebook.
df = pd.read_csv("/content/drive/MyDrive/Dataset CSV/test_data.csv")

# Position and size columns to standardise; gaze and AU columns stay as they are.
features_to_normalize = [
    'left_eye_x', 'left_eye_y', 'right_eye_x', 'right_eye_y',
    'face_width'
]

# Fit a StandardScaler over all rows of the file together and print the
# per-column mean and variance it learned.
scaler = StandardScaler()
normalized_values = scaler.fit_transform(df[features_to_normalize])
print(scaler.mean_)
print(scaler.var_)

# Put the scaled values in "<column>_norm" columns next to the originals.
norm_column_names = [column + "_norm" for column in features_to_normalize]
normalized_df = pd.DataFrame(normalized_values, columns=norm_column_names)
original_rows = df.reset_index(drop=True)
df_normalized = pd.concat([original_rows, normalized_df], axis=1)

# Write the result back to Drive.
df_normalized.to_csv("/content/drive/MyDrive/Dataset CSV/test_data_normalized.csv", index=False)
print("✅ Normalized dataset saved.")


✅ Normalized dataset saved.


In [None]:
# Columns kept in the final dataset: the normalised eye/face columns, raw gaze,
# the first three action units, and the label. The other five AU columns and
# 'relative_time' are not included.
FEATURES = ['id', 'left_eye_x_norm', 'left_eye_y_norm', 'right_eye_x_norm', 'right_eye_y_norm', 'face_width_norm', 'gaze_pitch', 'gaze_yaw', 'au1_innerbrow', 'au2_outerbrow', 'au4_browlower', 'is_focused']

In [None]:
# Keep only the FEATURES columns of the normalised table and preview the result.
df_features = df_normalized[FEATURES]
df_features.head()

Unnamed: 0,id,left_eye_x_norm,left_eye_y_norm,right_eye_x_norm,right_eye_y_norm,face_width_norm,gaze_pitch,gaze_yaw,au1_innerbrow,au2_outerbrow,au4_browlower,is_focused
0,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,1.027562,-1.071076,1.034682,-1.047832,1.097949,-0.166144,-0.231094,0.0,0.0,0.0,True
1,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,1.020897,-1.036857,1.022605,-1.040975,0.691857,-0.174508,-0.220065,0.0,0.0,0.0,True
2,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,1.024219,-1.015405,1.01909,-1.022705,0.615759,-0.160547,-0.239625,0.0,0.0,0.0,True
3,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,1.027975,-0.888035,1.013871,-0.919062,0.198897,-0.145357,-0.258751,0.0,0.0,0.0,True
4,7aefab5ed5fdd5e8a617e7c7c8fddd582d2e9164a7711b...,1.019908,-0.859802,1.010238,-0.904878,0.056652,-0.140043,-0.253606,0.0,0.0,0.0,True


Save CSV file to Google Drive Folder

## **TO DO:** Replace output_path with where you want to save your modified file to

In [None]:
# Path to save the new file
# Destination of the final feature file (edit to where you want it saved).
output_path = '/content/drive/MyDrive/Dataset CSV/test_data_complied.csv'

# Write the selected feature columns without the DataFrame index.
df_features.to_csv(output_path, index=False)

print("File saved successfully to:", output_path)

File saved successfully to: /content/drive/MyDrive/Dataset CSV/test_data_complied.csv
