# Extracting landmarks from the video

ffmpeg is required. On Debian/Ubuntu, install it with `sudo apt install ffmpeg`.

In [None]:
!pip -q install mediapipe opencv-python moviepy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25h

First, download the video and extract its audio track to an MP3 file.

In [None]:
import os
import requests
from moviepy.editor import VideoFileClip

# URL of the video
video_url = "https://media.tagesschau.de/video/2024/1029/TV-20241029-2025-3400.webxxl.h264.mp4"

# Stream the download so the whole file is never held in memory. The timeout
# stops the cell from hanging forever on a stalled connection, and the `with`
# block releases the connection as soon as the body has been written to disk.
with requests.get(video_url, stream=True, timeout=30) as response:
    download_ok = response.status_code == 200

    if download_ok:
        # Try to extract the filename from the Content-Disposition header
        filename = None
        content_disposition = response.headers.get('Content-Disposition', '')
        if 'filename=' in content_disposition:
            # Keep only the value itself: drop any parameters that follow it
            # and the optional surrounding quotes.
            filename = content_disposition.split('filename=')[1].split(';')[0].strip().strip('"')
            # The header is controlled by the server, so strip any directory
            # components to make sure we only ever write into the working directory.
            filename = os.path.basename(filename)

        # Fallback to extracting the filename from the URL if the header is
        # not present (or yielded an empty name)
        if not filename:
            filename = os.path.basename(video_url)

        # Save the video file in 1 MiB chunks
        video_filepath = filename
        with open(video_filepath, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    file.write(chunk)
        print(f"Video downloaded successfully as {video_filepath}")
    else:
        print(f"Failed to download video. Status code: {response.status_code}")

if download_ok:
    # Extract audio and save as MP3
    audio_filename = os.path.splitext(filename)[0] + ".mp3"  # Change extension to .mp3
    video_clip = VideoFileClip(video_filepath)
    audio_clip = video_clip.audio  # None when the video has no audio track
    try:
        if audio_clip is None:
            print(f"No audio track found in {video_filepath}; skipping audio extraction")
        else:
            audio_clip.write_audiofile(audio_filename)
            print(f"Audio extracted and saved as {audio_filename}")
    finally:
        # Always release the ffmpeg readers, even if writing the MP3 failed
        if audio_clip is not None:
            audio_clip.close()
        video_clip.close()


  if event.key is 'enter':



Video downloaded successfully as TV-20241029-2025-3400.webxxl.h264.mp4
MoviePy - Writing audio in TV-20241029-2025-3400.webxxl.h264.mp3


                                                                       

MoviePy - Done.
Audio extracted and saved as TV-20241029-2025-3400.webxxl.h264.mp3




Google's MediaPipe Hands solution outputs a set of 21 3D hand landmarks for each detected hand in the input image or video. These landmarks represent key points on the hand, such as the tips of fingers, joints, and the wrist. Each landmark is identified by its index and provides three key pieces of data:

- x: The normalized x-coordinate of the landmark, relative to the width of the input image. The value is in the range [0, 1].
- y: The normalized y-coordinate of the landmark, relative to the height of the input image. The value is in the range [0, 1].
- z: The normalized z-coordinate of the landmark, which indicates the depth of the landmark relative to the wrist. A smaller value means the landmark is closer to the camera.

In [None]:
import cv2
import mediapipe as mp
import numpy as np
import json
import subprocess
import os
from mediapipe.framework.formats import landmark_pb2

def _run_ffmpeg(args):
    """Run ``ffmpeg -y <args>`` (``-y`` overwrites an existing output file).

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: the ``ffmpeg`` executable could not be started.
    """
    subprocess.run(['ffmpeg', '-y', *args], check=True)


def _remove_if_exists(path):
    """Delete ``path``; a file that was never created is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _annotate_frames(input_video_path, temp_video_path, landmarks_output_path):
    """Detect hands in the right half of every frame, draw them, and log them.

    Writes the annotated (silent) video to ``temp_video_path`` and one JSON
    object per frame to ``landmarks_output_path``.

    Returns:
        True on success, False (after printing the reason) if the input video
        or the landmarks file could not be opened.
    """
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        print(f"Error opening video file {input_video_path}")
        cap.release()
        return False

    out = None
    try:
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Only used for the progress message: the container's frame count may
        # differ from the number of frames that can actually be decoded.
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Column range of the right half of the frame
        right_half_width = width // 2
        start_x = width - right_half_width

        # Open the landmarks file first so a bad path fails before any video
        # output is produced.
        try:
            f_landmarks = open(landmarks_output_path, 'w')
        except Exception as e:
            print(f"Error opening landmarks file {landmarks_output_path}: {e}")
            return False

        with f_landmarks:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Use 'mp4v' codec for MP4 format
            out = cv2.VideoWriter(temp_video_path, fourcc, fps, (width, height))

            mp_drawing = mp.solutions.drawing_utils
            mp_hands = mp.solutions.hands

            # The drawing styles never change, so build them once
            landmark_spec = mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=2)
            connection_spec = mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2)

            f_landmarks.write('[')  # Start of JSON array
            try:
                with mp_hands.Hands(
                    static_image_mode=False,
                    max_num_hands=2,
                    min_detection_confidence=0.5,
                    min_tracking_confidence=0.5
                ) as hands:
                    frame_idx = 0
                    while True:
                        ret, frame_bgr = cap.read()
                        if not ret:
                            break

                        # Crop the right half and convert BGR -> RGB for MediaPipe
                        right_half_frame = frame_bgr[:, start_x:width]
                        image = cv2.cvtColor(right_half_frame, cv2.COLOR_BGR2RGB)
                        image.flags.writeable = False

                        # Detect hands in the cropped frame
                        results = hands.process(image)

                        # Data structure for the current frame
                        frame_landmarks = {
                            "frame": frame_idx,
                            "hands": []
                        }

                        if results.multi_hand_landmarks and results.multi_handedness:
                            for hand_landmarks, handedness in zip(results.multi_hand_landmarks,
                                                                  results.multi_handedness):
                                # Hand label as reported by MediaPipe
                                hand_label = handedness.classification[0].label

                                adjusted_landmarks = []
                                landmarks = []
                                for lm in hand_landmarks.landmark:
                                    # Map x from crop-relative to full-frame
                                    # normalized coordinates for drawing; the
                                    # crop spans the full height, so y is unchanged.
                                    adjusted_x = (lm.x * right_half_width + start_x) / width
                                    adjusted_landmarks.append(landmark_pb2.NormalizedLandmark(
                                        x=adjusted_x,
                                        y=lm.y,
                                        z=lm.z
                                    ))

                                    # The saved values are MediaPipe's raw output,
                                    # i.e. x is relative to the cropped right half,
                                    # not to the full frame.
                                    landmarks.append({
                                        "x": lm.x,
                                        "y": lm.y,
                                        "z": lm.z
                                    })

                                frame_landmarks["hands"].append({
                                    "label": hand_label,
                                    "landmarks": landmarks
                                })

                                # Draw the hand on the original (full) frame
                                mp_drawing.draw_landmarks(
                                    image=frame_bgr,
                                    landmark_list=landmark_pb2.NormalizedLandmarkList(
                                        landmark=adjusted_landmarks
                                    ),
                                    connections=mp_hands.HAND_CONNECTIONS,
                                    landmark_drawing_spec=landmark_spec,
                                    connection_drawing_spec=connection_spec
                                )

                        # Separate items with a leading comma on every frame but
                        # the first. This keeps the JSON valid regardless of how
                        # many frames are actually decoded.
                        if frame_idx > 0:
                            f_landmarks.write(',\n')
                        json.dump(frame_landmarks, f_landmarks)

                        # Write the processed frame to the output video
                        out.write(frame_bgr)

                        frame_idx += 1

                        # Optional: Display progress
                        if frame_idx % 100 == 0:
                            print(f"Processed {frame_idx}/{frame_count} frames.")
            finally:
                # Close the JSON array even if processing was interrupted, so
                # the frames written so far remain loadable.
                f_landmarks.write(']')
        return True
    finally:
        cap.release()
        if out is not None:
            out.release()


def process_video(input_video_path, output_video_path, landmarks_output_path):
    """Annotate a video with MediaPipe hand landmarks and save the landmarks.

    Hands are detected only in the right half of each frame. The detected
    landmarks are drawn onto the full frame, the original audio track is muxed
    back in with ffmpeg, and the per-frame landmarks are written as a JSON
    array of ``{"frame": int, "hands": [{"label": str, "landmarks": [...]}]}``
    objects. The saved ``x``/``y``/``z`` values are MediaPipe's raw output for
    the cropped right half (``x`` is relative to the crop, not the full frame).

    Two scratch files, ``temp_audio.aac`` and ``temp_video.mp4``, are created
    in the current working directory and removed afterwards.

    Args:
        input_video_path: Path of the video to process. Its audio stream is
            copied unchanged into ``temp_audio.aac``.
        output_video_path: Path of the annotated video (with audio) to write.
        landmarks_output_path: Path of the JSON landmarks file to write.

    Returns:
        None. Failures of ffmpeg or of opening the input or landmarks file are
        reported with ``print`` and end the function early.
    """
    temp_audio_path = 'temp_audio.aac'
    temp_video_path = 'temp_video.mp4'

    # Extract audio from input video using ffmpeg
    try:
        _run_ffmpeg([
            '-i', input_video_path,
            '-vn',  # No video
            '-acodec', 'copy',  # Copy the audio codec
            temp_audio_path
        ])
    except Exception as e:
        print(f"Error extracting audio from {input_video_path}: {e}")
        _remove_if_exists(temp_audio_path)  # drop a partially written file
        return

    if not _annotate_frames(input_video_path, temp_video_path, landmarks_output_path):
        _remove_if_exists(temp_audio_path)
        _remove_if_exists(temp_video_path)
        return

    # Combine audio and video using ffmpeg
    try:
        _run_ffmpeg([
            '-i', temp_video_path,
            '-i', temp_audio_path,
            '-c:v', 'copy',  # Copy the video stream without re-encoding
            '-c:a', 'aac',
            output_video_path
        ])
    except Exception as e:
        print(f"Error combining audio and video: {e}")
        # The scratch files are left in place (as before) so the merge can be
        # retried by hand without re-running the detection pass.
        return

    # Clean up temporary files
    _remove_if_exists(temp_audio_path)
    _remove_if_exists(temp_video_path)

    print(f"Landmarks have been saved to {landmarks_output_path}")
    print(f"Annotated video with audio has been saved to {output_video_path}")


In [None]:
import os
import time

# Videos to annotate; append further paths to this list as needed
input_videos = [
    'TV-20241029-2025-3400.webxxl.h264.mp4'
]

# Folder that receives the annotated videos and the landmark files
output_dir = 'processed_videos'
os.makedirs(output_dir, exist_ok=True)

for input_video in input_videos:
    # Derive both output paths from the input's file name without its extension
    base_name, _ = os.path.splitext(os.path.basename(input_video))
    output_video = os.path.join(output_dir, f"{base_name}_processed.mp4")
    landmarks_output = os.path.join(output_dir, f"{base_name}_landmarks.json")

    # Time the whole run for this video, including the status messages
    start_time = time.time()
    print(f"Processing {input_video}...")
    process_video(input_video, output_video, landmarks_output)
    print(f"Finished processing {input_video}")
    end_time = time.time()

    # Report the wall-clock duration
    elapsed_time = end_time - start_time
    print(f"Time taken for {input_video}: {elapsed_time:.2f} seconds\n")

Processing TV-20241029-2025-3400.webxxl.h264.mp4...
Processed 100/24616 frames.
Processed 200/24616 frames.
Processed 300/24616 frames.
Processed 400/24616 frames.
Processed 500/24616 frames.
Processed 600/24616 frames.
Processed 700/24616 frames.
Processed 800/24616 frames.
Processed 900/24616 frames.
Processed 1000/24616 frames.
Processed 1100/24616 frames.
Processed 1200/24616 frames.
Processed 1300/24616 frames.
Processed 1400/24616 frames.
Processed 1500/24616 frames.
Processed 1600/24616 frames.
Processed 1700/24616 frames.
Processed 1800/24616 frames.
Processed 1900/24616 frames.
Processed 2000/24616 frames.
Processed 2100/24616 frames.
Processed 2200/24616 frames.
Processed 2300/24616 frames.
Processed 2400/24616 frames.
Processed 2500/24616 frames.
Processed 2600/24616 frames.
Processed 2700/24616 frames.
Processed 2800/24616 frames.
Processed 2900/24616 frames.
Processed 3000/24616 frames.
Processed 3100/24616 frames.
Processed 3200/24616 frames.
Processed 3300/24616 frames.
