# Data Processing

In [None]:
import os
import sys
import torch
import cv2
import numpy as np
import pandas as pd
from ultralytics import YOLO
from facial_emotion_recognition import EmotionRecognition
import mediapipe as mp
from tqdm import tqdm
import logging

# Send INFO-and-above log records to stdout so they appear in the cell output
# alongside the tqdm progress bars (which are also pointed at stdout below).
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Use the GPU when PyTorch reports one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
logging.info(f"Using device: {device}")

2025-11-07 12:19:34,553 [INFO] Using device: cuda


## Silesian Deception Dataset

### Face detection and crop (YOLO)

In [2]:
def detect_faces(model, frame):
    """Run the face detector on one frame and return its bounding boxes.

    Args:
        model: Callable detector; invoked as ``model(frame, verbose=False)``.
        frame: Image passed straight through to the detector.

    Returns:
        A list of ``[x1, y1, x2, y2]`` integer lists taken from the first
        result's ``boxes.xyxy``. An empty list is returned when the detector
        yields nothing or the first result carries no boxes.
    """
    predictions = model(frame, verbose=False)

    if predictions:
        first_boxes = predictions[0].boxes
        if first_boxes is not None:
            # Cast the coordinate tensor to integers and hand back plain Python lists.
            return first_boxes.xyxy.int().tolist()

    return []

def face_crop(model, frame):
    """Return the crop of the first detected face that has a non-empty area.

    Args:
        model: Face detector forwarded to ``detect_faces``.
        frame: Image array indexed as ``frame[y, x]``; its first two
            dimensions are used as height and width.

    Returns:
        The slice ``frame[y1:y2, x1:x2]`` for the first box (in the order
        ``detect_faces`` returns them) whose crop is non-empty, or ``None``
        when no box yields a usable crop.
    """
    frame_h, frame_w = frame.shape[:2]

    for box in detect_faces(model, frame):
        x1, y1, x2, y2 = map(int, box)

        # Clamp to the frame: a negative coordinate would otherwise be read by
        # NumPy slicing as "count from the end" and silently crop the wrong region.
        x1, x2 = (min(max(v, 0), frame_w) for v in (x1, x2))
        y1, y2 = (min(max(v, 0), frame_h) for v in (y1, y2))

        crop = frame[y1:y2, x1:x2]
        if crop.size:
            return crop

    return None

### Resize images to a consistent size

In [3]:
def resize_frame(frame, size=(224, 224)):
    """Resize ``frame`` to a fixed size with ``cv2.resize``.

    Args:
        frame: Image array to resize.
        size: Target size handed to ``cv2.resize`` as its second argument
            (OpenCV reads it as ``(width, height)``). Defaults to ``(224, 224)``.

    Returns:
        The resized image. The aspect ratio is not preserved; the input is
        stretched to exactly ``size``.
    """
    resized = cv2.resize(frame, size)
    return resized

### Geometric face normalization with MediaPipe

In [4]:
def geometric_normalization(frame, face_mesh):
    """Rotate a face image by the tilt of the line joining its two eye centres.

    Args:
        frame: 3-dimensional BGR image (it is converted with ``COLOR_BGR2RGB``
            and its shape is unpacked as ``(h, w, channels)``).
        face_mesh: Object exposing ``process(rgb_image)`` whose result has a
            ``multi_face_landmarks`` attribute with normalised ``x``/``y``
            landmark coordinates.

    Returns:
        The image rotated about the midpoint between the eye centres, same
        width and height as the input. If no landmarks are found, the input
        frame is returned unchanged.
    """
    # Landmark index pairs averaged into one centre per eye.
    # NOTE(review): presumably the Face Mesh eye-corner indices - confirm against the MediaPipe landmark map.
    LEFT_EYE_LANDMARKS = [33, 133]
    RIGHT_EYE_LANDMARKS = [362, 263]

    detection = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    if not detection.multi_face_landmarks:
        return frame

    points = detection.multi_face_landmarks[0].landmark
    h, w, _ = frame.shape

    def eye_center(indices):
        # Landmarks are normalised, so scale by width/height to get pixel coordinates.
        pixel_coords = [[points[i].x * w, points[i].y * h] for i in indices]
        return np.array(pixel_coords).mean(axis=0)

    left_eye = eye_center(LEFT_EYE_LANDMARKS)
    right_eye = eye_center(RIGHT_EYE_LANDMARKS)

    # Tilt of the eye line, in degrees.
    dx, dy = right_eye - left_eye
    angle = np.degrees(np.arctan2(dy, dx))

    # Rotate around the point halfway between the eyes, without scaling.
    midpoint = np.mean([left_eye, right_eye], axis=0)
    center = (float(midpoint[0]), float(midpoint[1]))
    rotation = cv2.getRotationMatrix2D(center, angle, 1.0)

    return cv2.warpAffine(frame, rotation, (w, h), flags=cv2.INTER_CUBIC)

### Emotion Detection

In [5]:
def get_emotion_probs(frame, emotion_detector):
    """Score one face image and return a softmax distribution over emotions.

    Args:
        frame: Face image array. A 3-dimensional input is converted from BGR
            to grayscale first; any other rank is used as-is.
        emotion_detector: Object providing ``transform`` (image -> tensor),
            ``device``, ``network`` (batched tensor -> scores) and
            ``emotions`` (index -> label mapping).

    Returns:
        Dict mapping each emotion label to a float, obtained by applying
        softmax over the network output for this single image.
    """
    is_color = frame.ndim == 3
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) if is_color else frame

    # Add a leading batch dimension and move the input next to the network.
    batch = emotion_detector.transform(gray)
    batch = batch.unsqueeze(0).to(emotion_detector.device)

    with torch.no_grad():
        raw_scores = emotion_detector.network(batch)
        # NOTE(review): the saved emotions.csv preview shows several classes tied
        # at the same value and the top class near e/(e+6) ~= 0.31, which suggests
        # the scores fed to this softmax are bounded to about [0, 1]. Confirm the
        # network's output layer before treating these as calibrated probabilities.
        scores = torch.softmax(raw_scores, dim=1).cpu().numpy()[0]

    labels = emotion_detector.emotions
    return {labels[idx]: float(score) for idx, score in enumerate(scores)}

def detect_emotions(frame, emotion_detector):
    """Return the per-emotion scores for ``frame``.

    Thin wrapper that forwards both arguments to ``get_emotion_probs`` and
    returns its result unchanged.
    """
    emotion_scores = get_emotion_probs(frame, emotion_detector)
    return emotion_scores

### All together

In [None]:
def process_video(video_path, face_detector, emotion_detector, face_mesh, frame_skip, label):
    """Extract per-frame emotion scores from one video.

    Every ``frame_skip``-th frame (indices 0, frame_skip, 2*frame_skip, ...)
    is cropped to the first detected face, resized, rotation-normalised and
    scored. Frames without a usable face are skipped.

    Args:
        video_path: Path of the video file to read.
        face_detector: Detector forwarded to ``face_crop``.
        emotion_detector: Model wrapper forwarded to ``detect_emotions``.
        face_mesh: Landmark model forwarded to ``geometric_normalization``.
        frame_skip: Sampling stride in frames; must be non-zero.
        label: Value stored under the ``'deceptive'`` key of every row.

    Returns:
        A list of dicts, one per processed frame, with keys ``'video'``
        (file name without extension), ``'frame_idx'``, ``'deceptive'`` and
        one key per emotion. Empty if the video cannot be opened or no face
        is found.
    """
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    results = []

    logging.info(f'Processing video: {video_name}')

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Make an unreadable file visible instead of reporting it as a
            # successfully processed video with zero frames.
            logging.warning(f'Could not open video: {video_path}')
            return results

        # Starts at -1 so the single increment below gives the first frame index 0.
        frame_idx = -1
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1

            if frame_idx % frame_skip != 0:
                continue

            face = face_crop(face_detector, frame)
            if face is None:
                continue

            resized_face = resize_frame(face)
            normalized_face = geometric_normalization(resized_face, face_mesh)
            emotions = detect_emotions(normalized_face, emotion_detector)

            results.append({
                'video': video_name,
                'frame_idx': frame_idx,
                'deceptive': label,
                **emotions
            })
    finally:
        # Release the capture handle even if a processing stage raises.
        cap.release()

    logging.info(f"✅ Finished video: {video_name} ({len(results)} frames processed)")
    return results


def process_dataset(root_dir='data/silesian_deception_dataset', out_path='processed_data/silesian_deception_dataset/emotions.csv', frame_skip=5, device=device, deception_folder='poli2video'):
    """Process every ``.avi`` video under ``root_dir`` and write one CSV of emotion scores.

    Each immediate sub-folder of ``root_dir`` is scanned for ``.avi`` files
    (extension matched case-insensitively). Every video is passed to
    ``process_video`` and the resulting rows are concatenated and written to
    ``out_path`` without the DataFrame index.

    Args:
        root_dir: Directory whose sub-folders contain the videos.
        out_path: Destination CSV path; its parent directory is created if needed.
        frame_skip: Sampling stride forwarded to ``process_video``.
        device: ``'cuda'`` or another value; ``'cuda'`` selects the GPU for the
            emotion model, anything else selects the CPU. The face detector is
            moved to ``device`` directly.
        deception_folder: Name of the sub-folder whose videos are labelled
            deceptive (compared case-insensitively); all others get ``False``.

    Returns:
        None. The result is the CSV file written to ``out_path``.
    """
    logging.info("🚀 Starting dataset processing...")
    face_detector = YOLO('model_weights/yolov8n-face.pt').to(device)
    emotion_detector = EmotionRecognition(device='gpu' if device == 'cuda' else 'cpu')

    dataset = []
    deception_name = deception_folder.lower()

    # Sort directory listings: os.listdir order is arbitrary, and sorting makes
    # the row order of the output CSV reproducible across runs and machines.
    folders = sorted(
        name for name in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, name))
    )

    mp_face_mesh = mp.solutions.face_mesh
    with mp_face_mesh.FaceMesh(
        static_image_mode=True,
        refine_landmarks=True,
        max_num_faces=1
    ) as face_mesh:
        for folder in tqdm(folders, desc="Processing folders", file=sys.stdout):
            folder_path = os.path.join(root_dir, folder)
            label = folder.lower() == deception_name

            # Filter before handing the list to tqdm so the bar counts videos only.
            videos = sorted(
                name for name in os.listdir(folder_path)
                if name.lower().endswith(".avi")
            )

            for file in tqdm(videos, desc=f"Processing videos in {folder}", leave=False, file=sys.stdout, dynamic_ncols=True):
                video_path = os.path.join(folder_path, file)
                video_results = process_video(video_path, face_detector, emotion_detector, face_mesh, frame_skip, label)
                dataset.extend(video_results)

    if not dataset:
        logging.warning(f"No frames were extracted from any video under {root_dir}")

    df = pd.DataFrame(dataset)

    # dirname is '' for a bare file name, and os.makedirs('') raises.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_path, index=False)

    logging.info("🎉 All videos processed successfully!")

### Execution

In [7]:
# Run the whole pipeline with process_dataset's default arguments: read videos from
# data/silesian_deception_dataset, sample every 5th frame, and write the rows to
# processed_data/silesian_deception_dataset/emotions.csv.
process_dataset()

2025-11-06 20:47:48,665 [INFO] 🚀 Starting dataset processing...


I0000 00:00:1762458468.962017   22954 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1762458469.004400   23005 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 580.95.05), renderer: NVIDIA GeForce RTX 2060/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


[*] Accuracy: 0.9565809379727686
Processing folders:   0%|          | 0/2 [00:00<?, ?it/s]

W0000 00:00:1762458469.006338   23004 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


2025-11-06 20:47:49,011 [INFO] Processing video: person1


W0000 00:00:1762458469.014857   23001 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


2025-11-06 20:48:35,863 [INFO] ✅ Finished video: person1 (2377 frames processed)
2025-11-06 20:48:35,866 [INFO] Processing video: person10
2025-11-06 20:49:14,843 [INFO] ✅ Finished video: person10 (1992 frames processed)
2025-11-06 20:49:14,846 [INFO] Processing video: person11
2025-11-06 20:50:04,046 [INFO] ✅ Finished video: person11 (2520 frames processed)
2025-11-06 20:50:04,050 [INFO] Processing video: person12
2025-11-06 20:51:05,876 [INFO] ✅ Finished video: person12 (3153 frames processed)
2025-11-06 20:51:05,880 [INFO] Processing video: person13
2025-11-06 20:51:51,317 [INFO] ✅ Finished video: person13 (2310 frames processed)
2025-11-06 20:51:51,320 [INFO] Processing video: person14
2025-11-06 20:52:33,225 [INFO] ✅ Finished video: person14 (2132 frames processed)
2025-11-06 20:52:33,235 [INFO] Processing video: person15
2025-11-06 20:53:26,175 [INFO] ✅ Finished video: person15 (2688 frames processed)
2025-11-06 20:53:26,186 [INFO] Processing video: person16
2025-11

### Results

In [3]:
# Load the CSV written by process_dataset() (default out_path) and preview the
# first rows: one row per sampled frame, with one column per emotion.
df = pd.read_csv('processed_data/silesian_deception_dataset/emotions.csv')
df.head()

Unnamed: 0,video,frame_idx,deceptive,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral
0,person1,0,True,0.114841,0.114841,0.114841,0.309457,0.114841,0.114841,0.116338
1,person1,5,True,0.115004,0.115004,0.115004,0.309557,0.115004,0.115004,0.115422
2,person1,10,True,0.114703,0.114703,0.114703,0.311774,0.114703,0.114703,0.114709
3,person1,15,True,0.114724,0.114724,0.114724,0.311548,0.114724,0.114724,0.114834
4,person1,20,True,0.114609,0.114609,0.114609,0.310822,0.114609,0.114609,0.116133
