# Data Processing

In [8]:
import os
import sys
import torch
import cv2
import numpy as np
import pandas as pd
from ultralytics import YOLO
from facial_emotion_recognition import EmotionRecognition
import mediapipe as mp
from tqdm import tqdm
import logging
import pympi

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)]
)
device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info(f"Using device: {device}")

2025-11-11 15:39:36,905 [INFO] Using device: cuda


## Silesian Deception Dataset

### .eaf parsing and segmentation

In [9]:
def convert_timestamp_to_frame(timestamp_ms, fps):
    return int((timestamp_ms / 1000.0) * fps)

def segment_video(eaf_path, fps=100):
    eaf = pympi.Elan.Eaf(eaf_path)
    annotations = eaf.get_annotation_data_for_tier('Question')
    
    segments = []
    for i, (start, end, value) in enumerate(annotations):
        if value == 'Correct':
            segments.append((convert_timestamp_to_frame(start, fps), convert_timestamp_to_frame(end, fps), i not in [0, 1, 8]))
    
    return segments

### Face detection and crop (YOLO)

In [10]:
def detect_faces(model, frame):
    results = model(frame, verbose=False)
    if not results or results[0].boxes is None:
        return []
    return results[0].boxes.xyxy.int().tolist()

def face_crop(model, frame):
    boxes = detect_faces(model, frame)

    for _, box in enumerate(boxes):
        x1, y1, x2, y2 = map(int, box)
        face_crop = frame[y1:y2, x1:x2]
        if face_crop.size == 0:
            continue
        return face_crop
    
    return None

### Resize images to consistent size

In [11]:
def resize_frame(frame, size=(224, 224)):
    return cv2.resize(frame, size)

### Geometric face normalization with MediaPipe

In [12]:
def geometric_normalization(frame, face_mesh):
    LEFT_EYE_LANDMARKS = [33, 133]
    RIGHT_EYE_LANDMARKS = [362, 263]

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb_frame)

    if not results.multi_face_landmarks:
        return frame

    landmarks = results.multi_face_landmarks[0].landmark
    h, w, _ = frame.shape

    left_eye = np.array([[landmarks[i].x * w, landmarks[i].y * h] for i in LEFT_EYE_LANDMARKS]).mean(axis=0)
    right_eye = np.array([[landmarks[i].x * w, landmarks[i].y * h] for i in RIGHT_EYE_LANDMARKS]).mean(axis=0)

    dy = right_eye[1] - left_eye[1]
    dx = right_eye[0] - left_eye[0]
    angle = np.degrees(np.arctan2(dy, dx))

    center = tuple(map(float, np.mean([left_eye, right_eye], axis=0)))
    rot_mat = cv2.getRotationMatrix2D(center, angle, 1.0)
    aligned = cv2.warpAffine(frame, rot_mat, (w, h), flags=cv2.INTER_CUBIC)

    return aligned

### Emotion Detection

In [13]:
def get_emotion_probs(frame, emotion_detector):
    if frame.ndim == 3:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    tensor = emotion_detector.transform(frame).unsqueeze(0).to(emotion_detector.device)

    with torch.no_grad():
        output = emotion_detector.network(tensor)
        probs = torch.softmax(output, dim=1).cpu().numpy()[0]

    return {emotion_detector.emotions[i]: float(probs[i]) for i in range(len(probs))}

def detect_emotions(frame, emotion_detector):
    return get_emotion_probs(frame, emotion_detector)

### All together

In [14]:
def process_video(sample_id, video_path, face_detector, emotion_detector, face_mesh, frame_skip):
    results = []

    cap = cv2.VideoCapture(video_path)
    
    eaf_path = video_path.replace('.avi', '.eaf')
    segments = segment_video(eaf_path)
    if not segments:
        logging.warning(f"No segments found in {eaf_path}. Skipping video.")
        return results

    frame_idx = 0

    logging.info(f'Processing video: {video_path}')

    seg_idx = 0
    seg_start, seg_end, label = segments[seg_idx]

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_idx < seg_start:
            frame_idx += 1
            continue
        if seg_idx >= len(segments):
            break
        if frame_idx > seg_end:
            seg_idx += 1
            sample_id += 1
            if seg_idx >= len(segments):
                break
            seg_start, seg_end, label = segments[seg_idx]
            frame_idx += 1
            continue

        if frame_idx % frame_skip != 0:
            frame_idx += 1
            continue

        face = face_crop(face_detector, frame)
        if face is None:
            frame_idx += 1
            continue

        resized_face = resize_frame(face)
        normalized_face = geometric_normalization(resized_face, face_mesh)
        emotions = detect_emotions(normalized_face, emotion_detector)

        results.append({
            'id': sample_id,
            'frame': frame_idx,
            'deceptive': label,
            **emotions
        })

        frame_idx += 1

    cap.release()
    logging.info(f"âœ… Finished video: {video_path} ({len(results)} frames processed)")
    return sample_id, results


def process_dataset(root_dir='data/silesian_deception_dataset', out_path='processed_data/silesian_deception_dataset/emotions.csv', frame_skip=5, device=device):
    logging.info("ðŸš€ Starting dataset processing...")
    face_detector = YOLO('model_weights/yolov8n-face.pt').to(device)
    emotion_detector = EmotionRecognition(device='gpu' if device == 'cuda' else 'cpu')

    dataset = []
    sample_id = 0

    mp_face_mesh = mp.solutions.face_mesh
    with mp_face_mesh.FaceMesh(
        static_image_mode=True,
        refine_landmarks=True,
        max_num_faces=1
    ) as face_mesh:
        for folder in tqdm(os.listdir(root_dir), desc="Processing folders", file=sys.stdout):
            folder_path = os.path.join(root_dir, folder)
            if not os.path.isdir(folder_path):
                continue

            for file in tqdm(os.listdir(folder_path), desc=f"Processing videos in {folder}", leave=False, file=sys.stdout, dynamic_ncols=True):
                if not file.lower().endswith(".avi"):
                    continue

                video_path = os.path.join(folder_path, file)
                sample_id, video_results = process_video(sample_id, video_path, face_detector, emotion_detector, face_mesh, frame_skip)
                dataset.extend(video_results)
        
    df = pd.DataFrame(dataset)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, index=False)

    logging.info("ðŸŽ‰ All videos processed successfully!")

### Execution

In [15]:
process_dataset()

2025-11-11 15:39:36,987 [INFO] ðŸš€ Starting dataset processing...


I0000 00:00:1762871977.323078    3461 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1762871977.367169    4072 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 580.95.05), renderer: NVIDIA GeForce RTX 2060/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


[*] Accuracy: 0.9565809379727686
Processing folders:   0%|          | 0/3 [00:00<?, ?it/s]

W0000 00:00:1762871977.369464    4070 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


2025-11-11 15:39:37,380 [INFO] Processing video: data/silesian_deception_dataset/poli2Video/person1.avi


W0000 00:00:1762871977.378274    4068 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


2025-11-11 15:40:17,950 [INFO] âœ… Finished video: data/silesian_deception_dataset/poli2Video/person1.avi (1956 frames processed)
2025-11-11 15:40:17,964 [INFO] Processing video: data/silesian_deception_dataset/poli2Video/person10.avi
2025-11-11 15:40:52,070 [INFO] âœ… Finished video: data/silesian_deception_dataset/poli2Video/person10.avi (1693 frames processed)
2025-11-11 15:40:52,076 [INFO] Processing video: data/silesian_deception_dataset/poli2Video/person11.avi
2025-11-11 15:41:35,317 [INFO] âœ… Finished video: data/silesian_deception_dataset/poli2Video/person11.avi (2151 frames processed)
2025-11-11 15:41:35,330 [INFO] Processing video: data/silesian_deception_dataset/poli2Video/person12.avi
2025-11-11 15:42:18,213 [INFO] âœ… Finished video: data/silesian_deception_dataset/poli2Video/person12.avi (1974 frames processed)
2025-11-11 15:42:18,227 [INFO] Processing video: data/silesian_deception_dataset/poli2Video/person13.avi
2025-11-11 15:42:58,157 [INFO] âœ… Finished video: data/s

### Results

In [16]:
df = pd.read_csv('processed_data/silesian_deception_dataset/emotions.csv')
df.head()

Unnamed: 0,id,frame,deceptive,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral
0,0,835,False,0.114798,0.114798,0.114798,0.114798,0.114798,0.114798,0.311213
1,0,840,False,0.115756,0.115753,0.115753,0.115753,0.115753,0.115753,0.305478
2,0,845,False,0.114949,0.114949,0.114949,0.114949,0.114949,0.114949,0.310307
3,0,850,False,0.114824,0.114824,0.114824,0.114824,0.114824,0.114824,0.311055
4,0,855,False,0.115298,0.115296,0.115296,0.115296,0.115296,0.115296,0.308224
