# INITIAL INSTALLATIONS

In [None]:
%pip install ultralytics
%pip install mediapipe
%pip install opencv-python
%pip install numpy
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/cu126
Note: you may need to restart the kernel to use updated packages.


## Only run this if you have a compatibe discrete GPU. Using torch can improve inference speed for YOLOV8

In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0)) 

True
NVIDIA GeForce RTX 4060 Laptop GPU


# YOLOV8 VS MEDIAPIPE

In [None]:
import cv2
import time
import numpy as np
import matplotlib.pyplot as plt
from ultralytics import YOLO
import mediapipe as mp

# we only care about following keypoints:
keypoint_names = [
    'left_shoulder', 'right_shoulder',
    'left_elbow', 'right_elbow',
    'left_wrist', 'right_wrist',
    'left_hip', 'right_hip',
    'left_knee', 'right_knee',
    'left_ankle', 'right_ankle'
]

# COCO (YOLOv8n-pose) indices
yolo_indices = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
# MediaPipe indices for the same joints:
mp_indices = [11,12,13,14,15,16,23,24,25,26,27,28] 

# Set up models
yolo_model = YOLO('yolov8n-pose.pt')
mp_pose = mp.solutions.pose.Pose(static_image_mode=False)

# metrics for comparision
yolo_times = []
mp_times = []
yolo_coords = []
mp_coords = []

#select video
videoSource = "videos\owen-front-right-smith.MOV"
cap = cv2.VideoCapture(videoSource)

frame_count = 0

while cap.isOpened(): # when video still plays
    ret, frame = cap.read()
    if not ret: break # if no more frame, exit

    # YOLOv8 inference
    t0 = time.time()
    yolo_result = yolo_model(frame)

    # only capture person with largest bounding box
    biggest_idx = None
    biggest_area = -1
    for idx, box in enumerate(yolo_result[0].boxes.xyxy):
        x1, y1, x2, y2 = box.cpu().numpy()
        area = (x2 - x1) * (y2 - y1)
        if area > biggest_area:
            biggest_area = area
            biggest_idx = idx

    kpts_yolo = None

    if biggest_idx is not None and yolo_result[0].keypoints.xy is not None and len(yolo_result[0].keypoints.xy) > biggest_idx:
        kp = yolo_result[0].keypoints.xy[biggest_idx]
        kpts_yolo = kp[yolo_indices].cpu().numpy()
    yolo_coords.append(kpts_yolo if kpts_yolo is not None else np.full((12,2), np.nan))
    yolo_times.append(time.time() - t0)

    # MediaPipe inference
    # MediaPipe Pose default only capture 1 closest pose
    t0 = time.time()
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    mp_result = mp_pose.process(rgb)
    mp_times.append(time.time() - t0)
    kpts_mp = None
    if mp_result.pose_landmarks:
        lm = mp_result.pose_landmarks.landmark
        kpts_mp = np.array([[lm[i].x, lm[i].y] for i in mp_indices])  # normalized [0,1]
        # Convert to pixel (optional)
        kpts_mp = kpts_mp * np.array([frame.shape[1], frame.shape[0]])  # (width, height)
    mp_coords.append(kpts_mp if kpts_mp is not None else np.full((12,2), np.nan))

    frame_count += 1

cap.release()

#Metrics and Graphs
yolo_avg_fps = 1.0 / np.mean(yolo_times)
mp_avg_fps = 1.0 / np.mean(mp_times)

plt.bar(['YOLOv8n-pose', 'MediaPipe'], [yolo_avg_fps, mp_avg_fps])
plt.ylabel('Average FPS (Frames Per Second)')
plt.title('Inference Speed Comparison')
plt.show()

# Overlay left wrist y position (as sample metric)
plt.plot([y[4][1] for y in yolo_coords], label='YOLOv8 Left Wrist')
plt.plot([m[4][1] for m in mp_coords], label='MediaPipe Left Wrist')
plt.xlabel('Frame')
plt.ylabel('Left Wrist Y Position (pixels)')
plt.legend()
plt.title('Left Wrist Vertical Movement Comparison')
plt.show()

# Visual overlay for 1 frame (picked frame number 50)
frame_id = 50
cap = cv2.VideoCapture(videoSource)
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
ret, frame = cap.read()
if ret:
    yolo_kp = yolo_coords[frame_id]
    mp_kp = mp_coords[frame_id]
    for i in range(12):
        x1, y1 = yolo_kp[i]
        x2, y2 = mp_kp[i]
        cv2.circle(frame, (int(x1), int(y1)), 4, (0,255,0), -1)   # YOLO: green
        cv2.circle(frame, (int(x2), int(y2)), 4, (0,0,255), -1)   # MediaPipe: red
    plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    plt.title('Overlay: YOLO (green), MediaPipe (red)')
    plt.axis('off')
    plt.show()
cap.release()


0: 640x384 1 person, 25.7ms
Speed: 10.6ms preprocess, 25.7ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 24.5ms
Speed: 3.1ms preprocess, 24.5ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 23.8ms
Speed: 2.7ms preprocess, 23.8ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 21.7ms
Speed: 2.1ms preprocess, 21.7ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 22.0ms
Speed: 2.7ms preprocess, 22.0ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 21.7ms
Speed: 2.2ms preprocess, 21.7ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 22.1ms
Speed: 2.6ms preprocess, 22.1ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 19.6ms
Speed: 2.0ms preprocess, 19.6ms inference, 1.6ms postprocess per image at shape (1, 3, 6

# YOLOV8

## YOLOV8 test on COCO Dataset


In [1]:
import cv2
from ultralytics import YOLO

model = YOLO('yolov8n-pose.pt')

# select video
video_path = 'videos\owen-side-left-bb.MOV'  
output_path = 'cocoYoloV8sTest.mp4'

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError(f"Cannot open video: {video_path}")

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

frame_count = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    results = model(frame)  # YOLOv8 pose inference
    annotated_frame = results[0].plot()
    out.write(annotated_frame)
    frame_count += 1
    if frame_count % 30 == 0:
        print(f'Processed {frame_count} frames...')

cap.release()
out.release()
print(f"Processing completed. Output saved as {output_path}")


0: 640x384 2 persons, 39.5ms
Speed: 3.9ms preprocess, 39.5ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 5.4ms
Speed: 1.5ms preprocess, 5.4ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 5.3ms
Speed: 1.3ms preprocess, 5.3ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 5.5ms
Speed: 1.3ms preprocess, 5.5ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 6.1ms
Speed: 1.2ms preprocess, 6.1ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 5.6ms
Speed: 1.3ms preprocess, 5.6ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 5.5ms
Speed: 1.3ms preprocess, 5.5ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 5.5ms
Speed: 1.5ms preprocess, 5.5ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384

## YOLOV8 test on Custom Dataset