# Notebook to convert the videos into npy files which will be used for training the PyTorch Model

In [1]:
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple, Optional
import imageio
import dlib
from tqdm import tqdm
import warnings
import cv2
import time

# Silence every Python warning for the rest of the session so the long
# preprocessing run keeps its progress output readable.
warnings.filterwarnings('ignore')
# Plot styling for any figures drawn later in the notebook.
plt.style.use('seaborn-v0_8-dark-palette')
# Default figure size as (width, height), in inches.
plt.rcParams["figure.figsize"] = (9, 5)

In [7]:
# Earlier variant kept for reference. Note that range(1, 5) never reaches 21,
# so the `i != 21` filter in it has no effect.
# ALL_SPEAKER_IDS = [f's{i}_processed' for i in range(1, 5) if i != 21]
# Speaker folders to process in this run; each name is used as a sub-directory
# of both the raw-video directory and BASE_PROCESSED_PATH.
ALL_SPEAKER_IDS = ["s31_processed", "s32_processed", "s33_processed"]
# Root directory under which the per-speaker .npy outputs are written.
BASE_PROCESSED_PATH = './GRIDCorpus/processed_mouth_data/'
# Every saved clip is truncated or zero-padded to exactly this many frames.
FRAME_COUNT = 75
# Size each mouth crop is resized to before saving.
FRAME_HEIGHT = 50
FRAME_WIDTH = 100
# NOTE(review): not referenced by the code visible in this notebook, which
# hard-codes 3 where zero frames are built - confirm before changing it.
FRAME_CHANNELS = 3

In [8]:
# Display the speaker list to confirm which speakers this run will process.
ALL_SPEAKER_IDS

['s31_processed', 's32_processed', 's33_processed']

In [9]:
# --- Mouth Extraction (Mostly unchanged, ensure return type is numpy) ---
# Load dlib's face detector and 68-point landmark predictor once, at module
# level, so extract_mouth_region() can reuse them for every frame.
DLIB_LANDMARK_PREDICTOR = "shape_predictor_68_face_landmarks.dat"

# Check for the model file up front so a missing file gets the download hint
# instead of a generic dlib error.
if not os.path.exists(DLIB_LANDMARK_PREDICTOR):
    # Add download/unzip logic here if needed, e.g., using requests/bz2
    print(f"Error: dlib landmark predictor '{DLIB_LANDMARK_PREDICTOR}' not found.")
    print("Please download it from http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2")
    # Raise SystemExit directly with a non-zero status: the `exit()` helper is
    # injected by the `site` module for interactive use and, called without an
    # argument, reports success (status 0) even though setup failed.
    raise SystemExit(1)

try:
    detector = dlib.get_frontal_face_detector()
    predictor = dlib.shape_predictor(DLIB_LANDMARK_PREDICTOR)
except Exception as e:
    print(f"Error initializing dlib: {e}")
    print("Make sure dlib is installed correctly and the predictor file exists.")
    raise SystemExit(1) from e

def extract_mouth_region(frame: np.ndarray) -> Optional[np.ndarray]:
    """Crop the mouth area out of one video frame and resize it.

    The frame is converted to grayscale (it is treated as BGR), the first
    face found by the module-level ``detector`` is passed to ``predictor``,
    and landmarks 48-67 are boxed with ``cv2.boundingRect``. The box is then
    widened by fixed pixel offsets, clipped to the frame, and resized to
    ``(FRAME_WIDTH, FRAME_HEIGHT)``.

    Args:
        frame: A single colour frame as read by ``cv2.VideoCapture``.

    Returns:
        ``None`` when no face is detected. Otherwise the resized crop, or an
        all-zero ``uint8`` array of shape ``(FRAME_HEIGHT, FRAME_WIDTH, 3)``
        when the crop is empty or ``cv2.resize`` fails.
    """
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    detections = detector(grayscale)
    if len(detections) == 0:
        return None

    # Only the first detected face is used.
    shape = predictor(grayscale, detections[0])
    mouth_points = np.array(
        [(shape.part(idx).x, shape.part(idx).y) for idx in range(48, 68)]
    )
    left, top, box_w, box_h = cv2.boundingRect(mouth_points)

    # Fixed offsets carried over from the earlier TF version of this code
    # (per the original note): the window starts 15 px before the box origin
    # and ends 45 px past its far edge, then is clipped to the frame.
    frame_rows, frame_cols = frame.shape[0], frame.shape[1]
    row_lo = max(top + 15 - 30, 0)
    row_hi = min(frame_rows, top + 15 + box_h + 30)
    col_lo = max(left + 15 - 30, 0)
    col_hi = min(frame_cols, left + 15 + box_w + 30)

    mouth = frame[row_lo:row_hi, col_lo:col_hi]

    if mouth.size != 0:
        try:
            return cv2.resize(mouth, (FRAME_WIDTH, FRAME_HEIGHT))
        except cv2.error as e:
            print(f"Warning: cv2.resize error ({e}). Returning zero frame.")

    # Empty crop or failed resize: substitute a black frame of the target size.
    return np.zeros((FRAME_HEIGHT, FRAME_WIDTH, 3), dtype=np.uint8)

def process_and_save_video(video_path: str, output_dir: str) -> None:
    """Extract mouth crops from one video and save them as a ``.npy`` file.

    Every frame is passed through ``extract_mouth_region``; frames where no
    face is detected are replaced by an all-zero frame so the frame index is
    preserved. The stacked ``uint8`` array is truncated or zero-padded along
    the first axis to exactly ``FRAME_COUNT`` frames and written to
    ``<output_dir>/<video basename>_mouth.npy``.

    Args:
        video_path: Path of the video file to read.
        output_dir: Existing directory the ``.npy`` file is written into.

    Returns:
        Always ``None``. Failures (unopenable file, no frames, any exception)
        are reported with ``print`` and no file is written.
    """
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print(f"Error opening video file: {video_path}")
                return None
            frames = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                mouth_frame = extract_mouth_region(frame)
                if mouth_frame is None:
                    # If detection fails, append a zero frame
                    mouth_frame = np.zeros((FRAME_HEIGHT, FRAME_WIDTH, 3), dtype=np.uint8)
                frames.append(mouth_frame)
        finally:
            # Release the capture on every path, including when frame
            # extraction raises, so the video handle is never leaked.
            cap.release()

        if not frames:
            print(f"Warning: No frames extracted from {video_path}")
            return None

        # Ensure video has FRAME_COUNT frames (pad/truncate if needed)
        frames_np = np.array(frames, dtype=np.uint8)  # Keep as uint8 for now
        current_frame_count = frames_np.shape[0]
        if current_frame_count > FRAME_COUNT:
            frames_np = frames_np[:FRAME_COUNT, ...]
        elif current_frame_count < FRAME_COUNT:
            # Pad only the frame axis, appending black frames at the end.
            pad_width = ((0, FRAME_COUNT - current_frame_count), (0, 0), (0, 0), (0, 0))
            frames_np = np.pad(frames_np, pad_width, mode='constant', constant_values=0)

        video_id = os.path.splitext(os.path.basename(video_path))[0]
        output_path = os.path.join(output_dir, f"{video_id}_mouth.npy")
        np.save(output_path, frames_np)

    except Exception as e:
        # Best-effort batch job: report the failure and let the caller move on
        # to the next video.
        print(f"Error processing video {video_path}: {e}")
    return None


# --- Function to preprocess all videos (Run only once) ---
def _output_mtime_ns(path):
    """Return the file's modification time in ns, or None if it is not accessible."""
    try:
        return os.stat(path).st_mtime_ns
    except OSError:
        return None


def preprocess_all_videos(force_reprocess=False):
    """Convert every speaker's ``.mpg`` videos into mouth-crop ``.npy`` files.

    For each entry of ``ALL_SPEAKER_IDS`` the videos under
    ``./GRIDCorpus/data/<speaker_id>`` are passed to
    ``process_and_save_video``, writing into
    ``BASE_PROCESSED_PATH/<speaker_id>`` (created if missing).

    Args:
        force_reprocess: When False (default), videos whose output file
            already exists are skipped. When True, every video is processed
            again.

    Returns:
        None. Progress and a final summary are printed.
    """
    print("Starting video preprocessing...")
    processed_count = 0
    skipped_count = 0
    failed_count = 0
    for speaker_id in ALL_SPEAKER_IDS:
        input_videos_dir = os.path.join('./GRIDCorpus/data/', speaker_id)  # Assuming video folder structure
        output_preprocessed_dir = os.path.join(BASE_PROCESSED_PATH, speaker_id)
        os.makedirs(output_preprocessed_dir, exist_ok=True)

        print(f"Processing speaker: {speaker_id}")
        # glob returns files in arbitrary order; sort so runs are reproducible.
        video_files = sorted(glob.glob(os.path.join(input_videos_dir, '*.mpg')))  # Assuming .mpg format

        if not video_files:
            print(f"  Warning: No .mpg files found in {input_videos_dir}")
            continue

        for video_file in tqdm(video_files, desc=f"Speaker {speaker_id}", unit="video"):
            video_id = os.path.splitext(os.path.basename(video_file))[0]
            output_path = os.path.join(output_preprocessed_dir, f"{video_id}_mouth.npy")
            mtime_before = _output_mtime_ns(output_path)
            if not force_reprocess and mtime_before is not None:
                skipped_count += 1
                continue
            process_and_save_video(video_file, output_preprocessed_dir)
            # process_and_save_video reports errors by printing and always
            # returns None, so success is detected by whether the output file
            # was created (or rewritten, when forcing a reprocess).
            mtime_after = _output_mtime_ns(output_path)
            if mtime_after is not None and mtime_after != mtime_before:
                processed_count += 1
            else:
                failed_count += 1

    print(f"\nPreprocessing finished. Processed: {processed_count}, Skipped (already exists): {skipped_count}")
    if failed_count:
        print(f"Failed (no output written): {failed_count}")

# --- Run the preprocessing pass. With force_reprocess=False, videos whose
# --- .npy output already exists are skipped, so re-running this cell is cheap.
preprocess_all_videos(force_reprocess=False)
print("Preprocessing complete.")
# --- END PREPROCESSING CALL ---

Starting video preprocessing...
Processing speaker: s31_processed


Speaker s31_processed:  94%|█████████▍| 945/1000 [19:54<01:09,  1.27s/video][mpeg1video @ 0x571bcc0] ac-tex damaged at 22 17
Speaker s31_processed: 100%|██████████| 1000/1000 [21:03<00:00,  1.26s/video]


Processing speaker: s32_processed


Speaker s32_processed: 100%|██████████| 1000/1000 [20:16<00:00,  1.22s/video]


Processing speaker: s33_processed


Speaker s33_processed: 100%|██████████| 1000/1000 [20:04<00:00,  1.20s/video]


Preprocessing finished. Processed: 3000, Skipped (already exists): 0
Preprocessing complete.





In [6]:
# Sanity check: count the files in one speaker's output folder. The path is
# hard-coded to s23, which is not in ALL_SPEAKER_IDS as set in this notebook -
# presumably left over from an earlier run; verify before relying on it.
len(os.listdir("./GRIDCorpus/processed_mouth_data/s23_processed"))

1000