In [None]:
import cv2
import numpy as np
import os
from PIL import Image
import torch
import json
import csv
from collections import defaultdict, deque
from facenet_pytorch import MTCNN
from deep_sort_realtime.deepsort_tracker import DeepSort
from insightface.app import FaceAnalysis
from transformers import ViTForImageClassification
import torchvision.transforms as transforms
from torchvision.transforms import functional as TF
from scipy.signal import butter, filtfilt
from scipy.ndimage import gaussian_filter1d
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [26]:
# === SETUP ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
video_path = r"E:\UET\HMI\Pj\Input\Trich doan 3 (ly truong - me mo).mp4"
model_path = r"E:\UET\HMI\Pj\models\vit_model.pth"

target_size = (224, 224)
num_classes = 12
IMG_SIZE = 224
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

emotion_labels = ['Genuine Disgust', 'Posed Disgust', 'Genuine Happiness', 'Posed Happiness',
            'Genuine Fear', 'Posed Fear', 'Genuine Anger', 'Posed Anger',
            'Genuine Surprise', 'Posed Surprise', 'Genuine Sadness', 'Posed Sadness']
num_classes = len(emotion_labels)

# === Identity Database Setup ===
identity_db = {}
track_to_person = {}
next_person_id = 0

In [27]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

In [None]:
# === FUNCTIONS ===
def resize_with_aspect_ratio_padding(image_bgr, target_size=IMG_SIZE):
    h, w = image_bgr.shape[:2]
    new_h = target_size
    new_w = int(w * target_size / h)

    img_resized = cv2.resize(image_bgr, (new_w, new_h))
    delta_w = target_size - new_w

    if delta_w > 0:
        left = delta_w // 2
        right = delta_w - left
        img_padded = cv2.copyMakeBorder(img_resized, 0, 0, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    else:
        start_w = (new_w - target_size) // 2
        img_padded = img_resized[:, start_w:start_w + target_size]

    return img_padded

def eulerian_magnify_video(face_frames, freq_min=0.7, freq_max=4.0, amplification=50, fs=30):
    def temporal_bandpass_filter(signal, freq_min, freq_max, fs):
        nyquist = 0.5 * fs
        low = freq_min / nyquist
        high = freq_max / nyquist
        b, a = butter(2, [low, high], btype='band')
        return filtfilt(b, a, signal, axis=0)

    face_stack = np.stack(face_frames).astype(np.float32)
    yuv_stack = np.array([cv2.cvtColor(f, cv2.COLOR_BGR2YUV) for f in face_stack])
    y, u, v = yuv_stack[..., 0], yuv_stack[..., 1], yuv_stack[..., 2]
    u_filtered = temporal_bandpass_filter(u, freq_min, freq_max, fs)
    v_filtered = temporal_bandpass_filter(v, freq_min, freq_max, fs)
    u_amplified = np.clip(u + amplification * u_filtered, 0, 255).astype(np.uint8)
    v_amplified = np.clip(v + amplification * v_filtered, 0, 255).astype(np.uint8)
    magnified = [cv2.cvtColor(cv2.merge([y[i].astype(np.uint8), u_amplified[i], v_amplified[i]]), cv2.COLOR_YUV2BGR) for i in range(len(face_frames))]
    return magnified

def calculate_optical_flow(prev_frame, next_frame):
    prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    next_gray = cv2.cvtColor(next_frame, cv2.COLOR_BGR2GRAY)
    flow = cv2.calcOpticalFlowFarneback(prev_gray, next_gray, None,
                                        0.5, 3, 15, 3, 5, 1.2, 0)
    return flow

def calculate_motion_intensity(flow):
    mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
    return np.sum(mag)

def detect_segment_start(flow_buffer, min_length=3, sigma=1, motion_threshold=800):
    if len(flow_buffer) < min_length:
        return False

    intensities = []
    for i in range(1, len(flow_buffer)):
        flow = calculate_optical_flow(flow_buffer[i-1], flow_buffer[i])
        intensity = calculate_motion_intensity(flow)
        intensities.append(intensity)

    smooth = gaussian_filter1d(intensities, sigma=sigma)
    gradients = np.gradient(smooth)

    increasing = np.sum(np.array(gradients[-min_length:]) > 0)
    avg_motion = np.mean(smooth[-min_length:])

    return increasing >= (min_length - 1) and avg_motion > motion_threshold

def detect_segment_end(current_segment, stable_count=3, sigma=1, motion_threshold=500):
    if len(current_segment) < stable_count + 1:
        return False

    intensities = []
    for i in range(1, len(current_segment)):
        flow = calculate_optical_flow(current_segment[i-1], current_segment[i])
        intensity = calculate_motion_intensity(flow)
        if intensity == 0:
            return False
        intensities.append(intensity)

    smooth = gaussian_filter1d(intensities, sigma=sigma)
    gradients = np.gradient(smooth)

    recent_grad = gradients[-stable_count:]
    recent_motion = smooth[-stable_count:]
    is_decreasing = np.sum(np.array(recent_grad) < 0) >= (stable_count - 1)
    is_low_motion = np.mean(recent_motion) < motion_threshold

    return is_decreasing or is_low_motion

def compute_iou(boxA, boxB):
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, yB - yA) * max(0, xB - xA)
    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    iou = interArea / float(boxAArea + boxBArea - interArea + 1e-6)
    return iou


def compute_cosine_sim(a, b):
    return cosine_similarity([a], [b])[0][0]

def assign_person_id(track, frame_id, threshold=0.65):
    global next_person_id

    emb = np.mean(track.features, axis=0)
    current_bbox = track.to_tlbr()
    best_id, best_sim = None, 0
    iou_threshold = 0.5
    sim_secondary_threshold = 0.6

    for pid, info in identity_db.items():
        sim = cosine_similarity(emb, info['embedding'])
        if sim > best_sim:
            best_sim = sim
            best_id = pid

    if best_sim >= threshold:
        identity_db[best_id]['embedding'] = 0.95 * identity_db[best_id]['embedding'] + 0.05 * emb
        identity_db[best_id]['last_seen'] = frame_id
        identity_db[best_id]['bbox'] = current_bbox
        return best_id

    # Gán person mới
    person_id = next_person_id
    next_person_id += 1
    print("Số lượng person ID:", next_person_id)
    identity_db[person_id] = {
        'embedding': emb,
        'first_seen': frame_id,
        'last_seen': frame_id,
        'bbox': current_bbox
    }
    return person_id

    
def align_face(frame, landmarks, output_size=(112, 112)):
    dst = np.array([
        [38.2946, 51.6963],
        [73.5318, 51.5014],
        [56.0252, 71.7366],
        [41.5493, 92.3655],
        [70.7299, 92.2041]
    ], dtype=np.float32)

    src = np.array(landmarks, dtype=np.float32)
    M = cv2.estimateAffinePartial2D(src, dst, method=cv2.LMEDS)[0]
    aligned = cv2.warpAffine(frame, M, output_size, borderValue=0.0)
    return aligned

In [None]:
# === LOAD MODELS ===
app = FaceAnalysis(name="buffalo_l") 
app.prepare(ctx_id=0) 
tracker = DeepSort(max_age= 30, n_init=3, nn_budget=100)

model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
model.classifier = torch.nn.Linear(in_features=768, out_features=num_classes, bias=True)
model.to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()



Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Admin/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Admin/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Admin/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Admin/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Admin/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.

  self.model.load_state_dict(torch.load(model_wts_path))
  model.load_state_dict(torch.load(model_path, map_location=device))


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [None]:
# === VIDEO PROCESSING ===
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
frame_id = 0
active_tracks = {}
flow_buffer = {}
all_segments = []


while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frame_id += 1
    faces = app.get(frame)

    detections = []
    for face in faces:
        x1, y1, x2, y2 = face.bbox.astype(int)
        conf = face.det_score
        if conf < 0.8 or x2 - x1 < 70 or y2 - y1 < 70:
            continue
        detections.append(([x1, y1, x2 - x1, y2 - y1], conf, face.embedding))
    tracks = tracker.update_tracks(detections, frame=frame)
    
    for track in tracks:
        if not track.is_confirmed() or track.time_since_update > 0:
            continue
        l, t, r, b = track.to_ltrb()
        l, t, r, b = int(l), int(t), int(r), int(b)  # thêm đoạn này
        h, w = frame.shape[:2]
        l = max(0, l)
        t = max(0, t)
        r = min(w, r)
        b = min(h, b)
        face_img = frame[t:b, l:r]
        face_img = resize_with_aspect_ratio_padding(face_img, target_size=IMG_SIZE)
        person_id = assign_person_id(track, face.bbox, frame_id)
        DEBUG_DIR = r"E:\UET\HMI\Pj\Debug"
        save_dir = os.path.join(DEBUG_DIR, f"person_{person_id}")
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, f"frame_{frame_id}.jpg")
        cv2.imwrite(save_path, face_img)
        
        if person_id in active_tracks:
            data = active_tracks[person_id]
            data['end-time'] = frame_id / fps
            data['end-frame'] = frame_id
            data['segment'].append(face_img)
            end_segment = detect_segment_end(data['segment'])
            if end_segment:
                all_segments.append(data)
                del active_tracks[person_id]
                if person_id in flow_buffer:
                    del flow_buffer[person_id]
        else:
            active_tracks[person_id] = {
                'start-time': None,
                'start-frame': frame_id,
                'end-time': None,
                'end-frame': None,
                'segment': [],
                'person_id': person_id
            }
            start_segment = False
            if len(flow_buffer.get(person_id, [])) >= 2:
                start_segment = detect_segment_start(flow_buffer[person_id])
            else:
                start_segment = False
            if start_segment:
                active_tracks[person_id]['segment'].append(face_img)                
                active_tracks[person_id]['start-time'] = frame_id / fps
                active_tracks[person_id]['start-frame'] = frame_id
                active_tracks[person_id]['end-time'] = frame_id / fps
                active_tracks[person_id]['end-frame'] = frame_id
            else:
                if flow_buffer.get(person_id) is None:
                    flow_buffer[person_id] = [face_img]
                else:
                    flow_buffer[person_id].append(face_img)     
cap.release()

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


IndexError: list index out of range

In [None]:
# Xử lý các đoạn video đã lưu
cap = cv2.VideoCapture(video_path)
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
video_name = os.path.splitext(os.path.basename(video_path))[0]
play_name, excerpt_name = video_name.split("(")[0].strip(), video_name.split("(")[-1].replace(")", "").strip()

output_dir = os.path.join("Output", video_name)
apex_folder = os.path.join(output_dir, "apex_images")
segment_folder = os.path.join(output_dir, "segment_images")
os.makedirs(apex_folder, exist_ok=True)
os.makedirs(segment_folder, exist_ok=True)
results = []
image_label_list = []
count = 0
for data in all_segments:
    segment = data['segment']
    if len(segment) < 2:
        continue
    
    # segment = eulerian_magnify_video(segment, fs=fps)
    onset_frame_id = data['start-frame']
    onset_time = data['start-time']

    apex_idx, highest = 0, -1
    for i, face in enumerate(segment):
        flow = calculate_optical_flow(segment[0], face)
        intensity = calculate_motion_intensity(flow)
        if intensity > highest:
            highest = intensity
            apex_idx = i
    
    apex_frame_id = onset_frame_id + apex_idx
    apex_time = apex_frame_id / fps
    offset_frame_id = data['end-frame']
    offset_time = data['end-time']
    apex_face = segment[apex_idx]

    apex_filename = f"{video_name}_{excerpt_name}_{data['person_id']}_apex_frame_{apex_frame_id}.jpg"
    apex_image_path = os.path.join(apex_folder, apex_filename)
    apex_face_pil = Image.fromarray(cv2.cvtColor(apex_face, cv2.COLOR_BGR2RGB))
    apex_face_pil.save(apex_image_path)
    
    
    try:
        input_tensor = transform(cv2.cvtColor(apex_face, cv2.COLOR_BGR2RGB)).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_tensor)
            pred_idx = torch.argmax(output.logits, dim=1).item()
            emotion = emotion_labels[pred_idx]
    except:
        emotion = "Unknown"
    
    image_folder = os.path.join(output_dir, "images")
    os.makedirs(image_folder, exist_ok=True)
    
    for idx, face in enumerate(segment):
        img_file_name = f"{video_name}_{excerpt_name}_{data['person_id']}_frame_{idx}_{count}.jpg"
        count += 1
        img_path = os.path.join(segment_folder, img_file_name)
        face_pil = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
        face_pil.save(img_path)

    
    
    results.append({
        'person_id': data['person_id'],
        'onset_frame_id': onset_frame_id,
        'onset_time': onset_time,
        'apex_frame_id': apex_frame_id,
        'apex_time': apex_time,
        'offset_frame_id': offset_frame_id,
        'offset_time': offset_time,
        'emotion': emotion
    })

In [None]:
# === SAVE FILES ===
with open(os.path.join(output_dir, f"{video_name}_labels.csv"), 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows([["filename", "label"]] + image_label_list)

# Save JSON
with open(os.path.join(output_dir, "emotion_results.json"), 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4)

# Save full CSV
with open(os.path.join(output_dir, "emotion_results.csv"), 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([
        'Play Name', 'Excerpt', 'Person ID',
        'Onset Frame ID', 
        'Apex Frame ID', 
        'Offset Frame ID',
        'Emotion'
    ])
    for r in results:
        img_file_name = f"{video_name}_{excerpt_name}_person{r['person_id']}_apex.jpg"
        writer.writerow([
            excerpt_name, play_name, r['person_id'],
            r['onset_frame_id'], 
            r['apex_frame_id'],
            r['offset_frame_id'],
            r['emotion'], img_file_name
        ])
