Installs the required packages, imports the libraries, and mounts Google Drive so the model and videos can be read in Colab.

In [None]:
# Colab setup: upgrade pip first, then install the notebook's dependencies.
!pip install --upgrade pip
# MediaPipe is pinned to 0.10.9.
!pip install mediapipe==0.10.9 --quiet
# NOTE(review): these packages are unpinned, so the installed versions can
# change between runs — consider pinning them for reproducibility.
!pip install moviepy tensorflow pandas opencv-python --quiet

In [None]:
from google.colab import drive
import numpy as np
import cv2
import mediapipe as mp
import tensorflow as tf
import os
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from tensorflow.keras.models import load_model

# Mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive (the model file and the example video referenced
# later in this notebook) can be read.
drive.mount('/content/drive')

Set constants: landmark counts, features per frame, number of frames, model path, and class names.

In [None]:
# --- Landmark layout ---------------------------------------------------------
# Number of landmarks stored for each hand and for the body pose.
LEFT_HAND_LANDMARKS = 21
RIGHT_HAND_LANDMARKS = 21
POSE_LANDMARKS = 33

# Every landmark contributes two values (x and y) to a frame's feature vector.
FEATURES_PER_FRAME = 2 * sum(
    (LEFT_HAND_LANDMARKS, RIGHT_HAND_LANDMARKS, POSE_LANDMARKS)
)

# --- Sequence length ---------------------------------------------------------
# Number of frames used for classification.
MAX_FRAMES = 15

# --- Model -------------------------------------------------------------------
# Location of the trained model file on the mounted Google Drive.
MODEL_PATH = "/content/drive/MyDrive/ASL_project_GDrive/Models/Copy of asl_lstm_best_curr_epoch_noOther90.h5"

# Word labels, looked up by the index of the model's highest output.
# NOTE(review): presumably this order must match the label order used when the
# model was trained. The model file name contains "noOther" while this list
# includes 'other' — confirm the list matches the model's outputs.
CLASS_NAMES = [
    'book',
    'computer_bk',
    'drink',
    'i',
    'other',
    'read',
    'science',
    'study',
    'water',
]


# Predict and draw landmarks on video

Load MediaPipe models

In [None]:
# Download the MediaPipe model bundles into the working directory.
# -O sets the local file name; -q silences wget's output.
# Pose model: pose_landmarker_heavy (float16), saved as pose_landmarker.task.
!wget -O pose_landmarker.task -q https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_heavy/float16/1/pose_landmarker_heavy.task
# Hand model: hand_landmarker (float16), saved as hand_landmarker.task.
!wget -O hand_landmarker.task -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task

This code loads MediaPipe’s hand and pose landmark models from .task files, enabling detection of two hands and full-body pose landmarks.

In [None]:
# Step 1 - point each detector at its local .task model file.
base_options_hand = python.BaseOptions(
    model_asset_path='hand_landmarker.task',
)
base_options_pose = python.BaseOptions(
    model_asset_path='pose_landmarker.task',
)

# Step 2 - detector options.
# Hands: report up to two hands per image. Pose: library defaults.
options_hand = vision.HandLandmarkerOptions(
    base_options=base_options_hand,
    num_hands=2,
)
options_pose = vision.PoseLandmarkerOptions(
    base_options=base_options_pose,
)

# Step 3 - build the detectors used by the landmark-extraction function.
hand_landmarker = vision.HandLandmarker.create_from_options(options_hand)
pose_landmarker = vision.PoseLandmarker.create_from_options(options_pose)

Load model

In [None]:
# Load the trained Keras model from MODEL_PATH (an .h5 file on the mounted
# Google Drive). The prediction function below uses this `model` object.
model = load_model(MODEL_PATH)

Extracts up to MAX_FRAMES evenly spaced frames from a video and returns a matrix of (x, y) pose and hand landmarks for each frame.

In [None]:
def _landmarks_to_feature_row(hand_result, pose_result):
    """Flatten one frame's hand and pose detections into a feature row.

    The row has FEATURES_PER_FRAME entries laid out as interleaved (x, y)
    pairs: left-hand landmarks first, then right-hand landmarks, then pose
    landmarks. Slots for anything that was not detected stay 0.0.

    Args:
        hand_result: Result of ``hand_landmarker.detect`` for the frame.
        pose_result: Result of ``pose_landmarker.detect`` for the frame.

    Returns:
        A list of FEATURES_PER_FRAME floats.
    """
    row = [0.0] * FEATURES_PER_FRAME

    # Start offsets of the right-hand and pose sections inside the row.
    right_hand_base = LEFT_HAND_LANDMARKS * 2
    pose_base = (LEFT_HAND_LANDMARKS + RIGHT_HAND_LANDMARKS) * 2

    # Hand landmarks (if detected). A hand labelled "Left" goes into the first
    # section; any other label goes into the right-hand section. If two hands
    # get the same label, the later one overwrites the earlier one.
    if hand_result.hand_landmarks:
        for idx, hand in enumerate(hand_result.hand_landmarks):
            is_left = hand_result.handedness[idx][0].category_name == "Left"
            base = 0 if is_left else right_hand_base
            for lm_idx, lm in enumerate(hand):
                row[base + lm_idx * 2] = lm.x
                row[base + lm_idx * 2 + 1] = lm.y

    # Pose landmarks (if detected); only the first detected pose is used.
    if pose_result.pose_landmarks:
        for lm_idx, lm in enumerate(pose_result.pose_landmarks[0]):
            row[pose_base + lm_idx * 2] = lm.x
            row[pose_base + lm_idx * 2 + 1] = lm.y

    return row


def extract_landmark_matrix_full_video(video_path):
    """Sample evenly spaced frames from a video and extract their landmarks.

    At most MAX_FRAMES frames are sampled, evenly spaced over the frame count
    reported by OpenCV. Each sampled frame is run through the module-level
    ``hand_landmarker`` and ``pose_landmarker`` and flattened into one row.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A numpy array with ``min(total_frames, MAX_FRAMES)`` rows and
        FEATURES_PER_FRAME columns. If the video yields fewer readable frames
        than expected, the missing rows are filled with zeros.
        NOTE(review): a video with fewer than MAX_FRAMES frames produces fewer
        than MAX_FRAMES rows — confirm the model accepts shorter sequences.

    Raises:
        IOError: If the video cannot be opened.
        ValueError: If OpenCV reports no frames for the video.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: '{video_path}'")

    frames = []
    try:
        # Total number of frames reported for the video
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"total_frames: '{total_frames}'")
        if total_frames <= 0:
            raise ValueError(
                f"Video reports no frames ({total_frames}): '{video_path}'"
            )

        # Number of frames to extract (limited to MAX_FRAMES)
        n_frames = min(total_frames, MAX_FRAMES)
        print(f"n_frames: '{n_frames}'")

        # Evenly spaced frame indices, kept in a set for fast lookup
        selected_set = set(
            np.linspace(0, total_frames - 1, n_frames, dtype=int).tolist()
        )

        # Read frames sequentially and process only the selected ones
        current_frame = 0
        while len(frames) < n_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if current_frame in selected_set:
                print(f"current_frame: '{current_frame}'")

                # OpenCV decodes to BGR; the MediaPipe image is built as SRGB
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)

                hand_result = hand_landmarker.detect(mp_image)
                pose_result = pose_landmarker.detect(mp_image)
                frames.append(_landmarks_to_feature_row(hand_result, pose_result))

            current_frame += 1
    finally:
        # Always free the capture handle, even if detection raises
        cap.release()

    # Zero-pad if the video ended before all selected frames were read
    missing = n_frames - len(frames)
    frames.extend([0.0] * FEATURES_PER_FRAME for _ in range(missing))

    return np.array(frames)

Predicts the ASL word in a video by extracting landmarks, running the model, and returning the predicted class name.

In [None]:
def predict_video_full(video_path):
    """Classify the ASL word signed in a video.

    Extracts the landmark matrix for the video, feeds it to the module-level
    ``model`` as a batch of one, and maps the highest-scoring output index to
    its entry in CLASS_NAMES. Progress and the result are printed.

    Args:
        video_path: Path of the video file to classify.

    Returns:
        The predicted word (an entry of CLASS_NAMES).
    """
    print(f"Processing video: {video_path}")

    # (frames x features) matrix, with a leading batch axis of size 1
    landmarks = extract_landmark_matrix_full_video(video_path)
    batch = landmarks[np.newaxis, ...]

    # One row of class scores per batch item; take the best index of row 0
    scores = model.predict(batch)
    best_index = np.argmax(scores, axis=1)[0]
    word = CLASS_NAMES[best_index]

    print(f"Prediction: '{word}'")
    return word

Run prediction

In [None]:
# Path of the video to classify. It can be any video: choose one from the
# "video examples" folder or upload your own.
video_path = "/content/drive/MyDrive/ASL_Project_Mika/video_examples/book.mp4"
# Prints progress and the prediction; as the cell's last expression, the
# returned word is also shown as the cell output.
predict_video_full(video_path)