The first step is to organize all the clips into one folder per lexeme for
easy indexing.

In [1]:
import os
import shutil

input_dir = '../data/raw_clips'
output_dir = '../data/organized_clips'

os.makedirs(output_dir, exist_ok=True)

# Copy every raw clip into a sub-folder named after its lexeme.
# A clip name is expected to consist of exactly four underscore-separated
# fields with the lexeme last; anything else is reported and skipped.
for filename in os.listdir(input_dir):
    if not filename.endswith('.mp4'):
        continue

    # Extract lexeme. splitext strips only the trailing extension, whereas
    # str.replace would also delete '.mp4' appearing elsewhere in the name.
    stem = os.path.splitext(filename)[0]
    parts = stem.split('_')
    # An empty lexeme would make the target folder equal to output_dir itself,
    # so treat it like any other malformed name.
    if len(parts) != 4 or not parts[3]:
        print(f"Skipping: {filename}")
        continue

    lexeme = parts[3]

    # Make lexeme folder if needed
    lexeme_folder = os.path.join(output_dir, lexeme)
    os.makedirs(lexeme_folder, exist_ok=True)

    # Copy file (the raw clip is left in place)
    src = os.path.join(input_dir, filename)
    dst = os.path.join(lexeme_folder, filename)
    shutil.copy(src, dst)


Next, I run pose estimation on every clip with MediaPipe.

In [None]:
import cv2
import numpy as np
import mediapipe as mp

# Short alias for the MediaPipe Holistic solution module.
mp_holistic = mp.solutions.holistic

# Clips (one sub-folder per lexeme) are read from input_dir; one keypoint
# array per clip is written under output_dir.
input_dir = '../data/organized_clips'
output_dir = '../data/poses'
os.makedirs(output_dir, exist_ok=True)

def landmarks_to_xy(landmarks):
    """Convert a landmark container into a list of ``(x, y)`` tuples.

    Args:
        landmarks: Object exposing a ``landmark`` iterable whose items have
            ``x`` and ``y`` attributes, or a falsy value (e.g. ``None``) when
            nothing was detected.

    Returns:
        A list with one ``(x, y)`` tuple per landmark, or an empty list when
        ``landmarks`` is falsy.
    """
    if not landmarks:
        return []
    return [(point.x, point.y) for point in landmarks.landmark]

def _frame_keypoints(results):
    """Flatten one frame's Holistic results into a single float array.

    The rows are ordered pose, left hand, right hand. A part that was not
    detected is zero-filled (33 rows for the pose, 21 per hand) so every frame
    has the same layout. The dtype is forced to float64 so that a frame with
    no detections at all does not silently become an integer array.
    """
    pose = landmarks_to_xy(results.pose_landmarks) or [(0.0, 0.0)] * 33
    lh = landmarks_to_xy(results.left_hand_landmarks) or [(0.0, 0.0)] * 21
    rh = landmarks_to_xy(results.right_hand_landmarks) or [(0.0, 0.0)] * 21
    return np.array(pose + lh + rh, dtype=np.float64)


# Process each video
# NOTE(review): one Holistic instance with static_image_mode=False is reused
# for all clips, so tracking state may carry over from the end of one clip to
# the start of the next -- confirm this is acceptable.
with mp_holistic.Holistic(static_image_mode=False, model_complexity=2) as holistic:
    for root, _, files in os.walk(input_dir):
        for filename in files:
            if not filename.endswith('.mp4'):
                continue

            video_path = os.path.join(root, filename)

            # Preserve subfolder structure in output
            rel_dir = os.path.relpath(root, input_dir)
            out_dir = os.path.join(output_dir, rel_dir)
            os.makedirs(out_dir, exist_ok=True)

            # splitext swaps only the trailing extension.
            out_path = os.path.join(out_dir, os.path.splitext(filename)[0] + '.npy')

            cap = cv2.VideoCapture(video_path)
            sequence = []

            try:
                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret:
                        break

                    # Convert to RGB for MediaPipe (OpenCV decodes as BGR)
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = holistic.process(rgb)
                    sequence.append(_frame_keypoints(results))
            finally:
                # Release the capture even if decoding or inference raises.
                cap.release()

            if sequence:
                # Resulting array is (frames, landmarks, 2).
                np.save(out_path, np.stack(sequence))
                print(f"✅ Saved: {out_path}")
            else:
                # Missing detections are zero-padded above, so an empty
                # sequence means no frame could be decoded at all.
                print(f"⚠️ No frames read from: {video_path}")

Downloading model to c:\Users\philj\Projects\dgs\venv\Lib\site-packages\mediapipe/modules/pose_landmark/pose_landmark_heavy.tflite
✅ Saved: ../data/poses\ALLE\1180097_1a1_230.840_ALLE.npy
✅ Saved: ../data/poses\ALLE\1180097_1a1_290.480_ALLE.npy
✅ Saved: ../data/poses\ALLE\1180097_1a1_371.980_ALLE.npy
✅ Saved: ../data/poses\ALLE\1180097_1b1_406.060_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1a1_105.300_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1a1_247.980_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1a1_253.980_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1b1_1083.900_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1b1_1212.600_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1b1_194.980_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1b1_567.380_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1b1_575.340_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1b1_732.680_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1b1_735.380_ALLE.npy
✅ Saved: ../data/poses\ALLE\1204691_1b1_780.420_ALLE.npy
✅ Saved: ../

As a sanity check, I want to overlay the keypoints on the videos
so that I can see how they look.

In [None]:
# Input clips, the keypoint arrays extracted from them, and where the
# annotated videos are written.
clips_root = '../data/organized_clips'
pose_root = '../data/poses'
output_root = '../data/overlaid_videos'

# Edges of one hand, as index pairs into its 21 landmarks: every finger is a
# chain of four edges that starts at the wrist (index 0).
hand_skeleton = [
    edge
    for finger_base in (1, 5, 9, 13, 17)
    for edge in (
        (0, finger_base),
        (finger_base, finger_base + 1),
        (finger_base + 1, finger_base + 2),
        (finger_base + 2, finger_base + 3),
    )
]

# Row offsets of each hand inside a per-frame keypoint array laid out as
# 33 pose rows, then 21 left-hand rows, then 21 right-hand rows.
left_hand_offset = 33
right_hand_offset = 54

# Upper-body edges (MediaPipe pose indices: shoulders 11/12, elbows 13/14,
# wrists 15/16) followed by the left-hand and then the right-hand edges,
# shifted to their position in the combined array.
skeleton = (
    [(11, 13), (13, 15), (12, 14), (14, 16), (11, 12)]
    + [
        (start + offset, end + offset)
        for offset in (left_hand_offset, right_hand_offset)
        for start, end in hand_skeleton
    ]
)

def draw_pose(frame, keypoints):
    """Draw keypoints and skeleton edges onto ``frame`` in place.

    Args:
        frame: Image array with three dimensions (height, width, channels);
            it is modified directly.
        keypoints: 2-D array with one ``(x, y)`` row per landmark. The values
            are multiplied by the frame width and height, i.e. they are
            treated as normalized coordinates. The input array is not changed.

    Returns:
        None.
    """
    frame_h, frame_w, _channels = frame.shape

    # Work on a copy so the caller's pose data keeps its normalized values.
    pixels = keypoints.copy()
    pixels[:, 0] *= frame_w
    pixels[:, 1] *= frame_h

    # Integer pixel position of every landmark, indexed like `keypoints`.
    points = [(int(px), int(py)) for px, py in pixels]

    # Filled dot for every landmark.
    for point in points:
        cv2.circle(frame, point, 5, (0, 255, 0), -1)

    # One line segment per edge of the module-level `skeleton`.
    for start, end in skeleton:
        cv2.line(frame, points[start], points[end], (255, 0, 0), 2)

# Render every clip that has a matching pose file with its keypoints drawn on
# top, mirroring the per-gesture folder layout under output_root.
for gesture in os.listdir(clips_root):
    clip_folder = os.path.join(clips_root, gesture)
    pose_folder = os.path.join(pose_root, gesture)

    # Check first, create afterwards: otherwise stray files in clips_root and
    # gestures without poses would leave empty folders under output_root.
    if not os.path.isdir(clip_folder) or not os.path.isdir(pose_folder):
        continue

    output_folder = os.path.join(output_root, gesture)
    os.makedirs(output_folder, exist_ok=True)

    for video_file in os.listdir(clip_folder):
        video_path = os.path.join(clip_folder, video_file)
        name = os.path.splitext(video_file)[0]
        pose_file = os.path.join(pose_folder, name + '.npy')

        if not os.path.isfile(pose_file):
            print(f"No pose file for {video_file}")
            continue

        pose_data = np.load(pose_file)
        output_path = os.path.join(output_folder, video_file)

        cap = cv2.VideoCapture(video_path)
        out = None
        try:
            # Do not create an empty output file (and claim success) for a
            # video that cannot be opened.
            if not cap.isOpened():
                print(f"Could not open {video_path}")
                continue

            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

            # Pair each stored pose with the next decoded frame; stop at
            # whichever of the two runs out first.
            for keypoints in pose_data:
                ret, frame = cap.read()
                if not ret:
                    break

                draw_pose(frame, keypoints)
                out.write(frame)
        finally:
            # Always free the capture and finalize the writer, even when
            # drawing or decoding raises.
            cap.release()
            if out is not None:
                out.release()

        print(f"Saved overlaid video to {output_path}")

print("All done!")


Saved overlaid video to ../data/overlaid_videos\ALLE\1180097_1a1_230.840_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1180097_1a1_290.480_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1180097_1a1_371.980_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1180097_1b1_406.060_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1204691_1a1_105.300_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1204691_1a1_247.980_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1204691_1a1_253.980_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1204691_1b1_1083.900_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1204691_1b1_1212.600_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1204691_1b1_194.980_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1204691_1b1_567.380_ALLE.mp4
Saved overlaid video to ../data/overlaid_videos\ALLE\1204691_1b1_575.340_ALLE.mp4
Saved overlaid