<a href="https://colab.research.google.com/github/ShovalBenjer/phantom-reach/blob/main/Image_detection_Model_Comparasion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

| Model         | MPII PCKh (%) | COCO Keypoint AP (%) | Speed                 | Strengths                                     |
|---------------|---------------|----------------------|-----------------------|-----------------------------------------------|
| MediaPipe     | ~70–80*       | ~60*                 | Excellent (Real-time) | Lightweight, deployable on edge devices.      |
| ViTPose       | ~90+          | 78–82                | Moderate              | Transformer-based, high precision.            |
| HigherHRNet   | ~91           | 75–80                | Moderate              | Excellent for detailed single-person poses.   |
| Lite-HRNet    | ~85           | 65–70                | Good                  | Efficient and lightweight for real-time.      |
| PoseNet       | ~80           | ~60–65               | Good                  | Simple, good for single-person tasks.         |


# **Dependencies**

In [None]:
!pip install torch==1.13.0+cu116 torchvision==0.14.0+cu116 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu116
!pip install --upgrade pip
!pip install fiftyone
!pip install mediapipe
!pip install opencv-python-headless
!pip install matplotlib
!pip install tqdm
!pip install mmcv-full==1.7.0 -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.13.0/index.html
!pip install mmdet==2.28.1
!pip install mmpose==0.29.0

# **Imports**

In [None]:
# Import libraries
import os
import cv2
import numpy as np
import time
import torch
import fiftyone as fo
import fiftyone.utils.huggingface as fouh
import mediapipe as mp
from mmpose.apis import init_pose_model, inference_top_down_pose_model
from mmpose.datasets import DatasetInfo
from mmdet.apis import init_detector, inference_detector
from tqdm import tqdm
import requests
import mmcv
import mmdet
import mmpose

# **Load and Preprocess the Dataset - Filtering relevant pictures**

In [None]:
# Hugging Face Hub repo id of the MPII pose dataset. The same string is
# checked against the locally registered FiftyOne dataset names below
# (presumably the name FiftyOne assigns on load -- TODO confirm).
dataset_name = "Voxel51/MPII_Human_Pose_Dataset"

# Delete any already-registered dataset with this name before loading
if dataset_name in fo.list_datasets():
    fo.delete_dataset(dataset_name)

# Pull the dataset from the hub and report how many samples it holds
dataset = fouh.load_from_hub(dataset_name)
print(f"Total dataset size: {len(dataset)} samples.")

In [None]:
def filter_single_person_facing_camera(sample):
    """
    Decide whether a sample shows exactly one person in a frontal pose.

    A sample passes when it carries exactly one keypoint annotation with
    more than 13 points and, looking only at x coordinates:

      * points[13] (used as the left shoulder) and points[12] (used as the
        right shoulder) are more than 25 and less than 75 units apart,
      * points[13] lies strictly to the left of points[12], and
      * points[0] (used as the nose), when truthy, lies strictly between
        the two shoulders.

    NOTE(review): the 25/75 bounds presumably expect pixel-like units, and
    in the standard MPII joint order index 0 is the right ankle rather than
    the nose -- confirm both against the dataset before relying on this.

    Args:
        sample: Object exposing ``annopoints.keypoints``; each keypoint
            exposes ``points``, a sequence of indexable coordinate entries.

    Returns:
        bool: True if the sample meets the criteria, False otherwise.
    """
    annotations = sample.annopoints
    has_single_person = (
        annotations
        and annotations.keypoints
        and len(annotations.keypoints) == 1
    )
    if not has_single_person:
        return False

    joints = annotations.keypoints[0].points
    # Indices 12 and 13 must exist
    if len(joints) <= 13:
        return False

    left_x = joints[13][0]
    right_x = joints[12][0]
    first_joint = joints[0]

    # Horizontal shoulder span must fall inside the accepted window
    span = abs(left_x - right_x)
    if not 25 < span < 75:
        return False

    # Left shoulder has to appear left of the right shoulder in the image
    if not left_x < right_x:
        return False

    # When the first joint is present it must sit between the shoulders
    if first_joint and not left_x < first_joint[0] < right_x:
        return False

    return True

In [None]:
# Collect the ids of every sample accepted by the single-person,
# camera-facing heuristic.
# NOTE(review): `filtered_dataset` and `visualize_samples` are not defined in
# the cells shown here -- confirm an earlier cell provides them.
filtered_ids = [
    accepted.id
    for accepted in filter(filter_single_person_facing_camera, filtered_dataset)
]

# Narrow the dataset down to those ids and report how many remain
filtered_dataset_camera_facing = filtered_dataset.select(filtered_ids)
print(f"Filtered dataset size (single person facing camera): {len(filtered_dataset_camera_facing)}")

# Display up to ten of the surviving samples
visualize_samples(filtered_dataset_camera_facing, num_samples=10)

In [None]:
# Export the filtered dataset (with its keypoint annotations) to disk, unless
# the target directory is already there from an earlier run.
# NOTE(review): this exports `filtered_dataset`, not
# `filtered_dataset_camera_facing` -- confirm which subset the evaluation is
# meant to read.
filtered_dir = "filtered_mpii"
if not os.path.exists(filtered_dir):
    filtered_dataset.export(
        export_dir=filtered_dir,
        dataset_type=fo.types.FiftyOneDataset,
        label_field="annopoints",  # Ensure keypoints are included
    )
    print("Filtered dataset exported.")
else:
    # Say that nothing was written instead of claiming a fresh export
    print(f"{filtered_dir} already exists; skipping export.")

# **Model Classes and Evaluation Function**

In [None]:
def download_file(url, dest_path):
    """
    Download ``url`` to ``dest_path`` unless that file already exists.

    The payload is streamed to a temporary ``<dest_path>.part`` file and only
    moved into place once the transfer finished, so an interrupted or failed
    download never leaves a truncated file that a later call would mistake
    for a finished one.

    Args:
        url: Address to fetch; redirects are followed.
        dest_path: Local path the file is stored at.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (instead of silently saving the error page as the file).
        requests.RequestException: On connection problems or timeouts.
    """
    if os.path.exists(dest_path):
        print(f"{dest_path} already exists.")
        return

    print(f"Downloading {url} to {dest_path}...")
    partial_path = f"{dest_path}.part"
    try:
        # stream=True keeps large checkpoints out of memory; the timeout
        # stops a stalled connection from hanging the notebook forever.
        with requests.get(url, allow_redirects=True, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    out_file.write(chunk)
        os.replace(partial_path, dest_path)
    finally:
        # After a successful move this is a no-op; after a failure it
        # removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download completed.")

# MediaPipe Pose Model
class MediaPipePoseModel:
    """Thin wrapper around the MediaPipe Pose solution."""

    def __init__(self):
        # Keep a handle on the solution module and build one Pose instance
        # configured with static_image_mode=True.
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(static_image_mode=True)

    def predict(self, image):
        """
        Run pose estimation on one image.

        Args:
            image: Image array; it is converted with ``cv2.COLOR_BGR2RGB``
                before being handed to MediaPipe.

        Returns:
            np.ndarray: One ``[x, y, visibility]`` row per landmark, where
            the landmark's ``x`` and ``y`` are multiplied by the image width
            and height respectively; an empty array when no pose landmarks
            are reported.
        """
        rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        detection = self.pose.process(rgb_frame)
        if not detection.pose_landmarks:
            return np.array([])

        height, width = image.shape[0], image.shape[1]
        rows = [
            [landmark.x * width, landmark.y * height, landmark.visibility]
            for landmark in detection.pose_landmarks.landmark
        ]
        return np.array(rows)

# Lite-HRNet Model using MMPose
class LiteHRNetModel:
    """
    Top-down pose estimator: an MMDetection detector proposes person boxes
    and an MMPose Lite-HRNet model predicts keypoints inside them.

    Construction downloads the pose and detector config/checkpoint files into
    the working directory (files already present are reused) and loads both
    networks on CUDA when available, otherwise on CPU.

    NOTE(review): each config is fetched as a single file; confirm it does
    not rely on `_base_` configs that would also have to exist locally.
    """

    # Minimum detector score for a box to be forwarded to the pose model
    BBOX_SCORE_THR = 0.5

    def __init__(self):
        """
        Download all required files and initialise both networks.

        Raises:
            ValueError: If the pose config carries no ``dataset_info``.
        """
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Download the config and checkpoint files
        config_url = 'https://raw.githubusercontent.com/open-mmlab/mmpose/master/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/litehrnet/coco/litehrnet_30_coco_256x192.py'
        checkpoint_url = 'https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet_30_coco_256x192-4bdb48f9_20210423.pth'
        config_path = 'litehrnet_30_coco_256x192.py'
        checkpoint_path = 'litehrnet_30_coco_256x192.pth'
        download_file(config_url, config_path)
        download_file(checkpoint_url, checkpoint_path)

        # Initialize the pose model
        self.model = init_pose_model(config_path, checkpoint_path, device=device)
        dataset_info = self.model.cfg.data['test'].get('dataset_info', None)
        if dataset_info is None:
            raise ValueError("Dataset info is missing in the config file.")
        self.dataset_info = DatasetInfo(dataset_info)

        # Initialize a person detector (Faster R-CNN, per the files below)
        detector_config_url = 'https://raw.githubusercontent.com/open-mmlab/mmdetection/master/configs/faster_rcnn/faster_rcnn_r50_fpn_coco.py'
        detector_checkpoint_url = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
        detector_config_path = 'faster_rcnn_r50_fpn_coco.py'
        detector_checkpoint_path = 'faster_rcnn_r50_fpn_coco.pth'
        download_file(detector_config_url, detector_config_path)
        download_file(detector_checkpoint_url, detector_checkpoint_path)

        self.detector = init_detector(detector_config_path, detector_checkpoint_path, device=device)

    def predict(self, image):
        """
        Detect people in ``image`` and estimate the pose of the first one.

        Args:
            image: Image passed unchanged to the detector and the pose model.

        Returns:
            The ``'keypoints'`` entry of the first pose result, or an empty
            ``np.ndarray`` when no person box reaches ``BBOX_SCORE_THR`` or
            the pose model returns nothing.
        """
        # Detect people in the image
        mmdet_results = inference_detector(self.detector, image)

        # Keep only person class (class_id = 0). The whole row, including
        # the score at index 4, is forwarded: with ``bbox_thr`` set, MMPose
        # expects every box to carry its score as a fifth value.
        person_results = [
            {'bbox': bbox}
            for bbox in mmdet_results[0]
            if bbox[4] >= self.BBOX_SCORE_THR
        ]
        if not person_results:
            return np.array([])

        # Run pose estimation
        pose_results, _ = inference_top_down_pose_model(
            self.model,
            image,
            person_results,
            bbox_thr=self.BBOX_SCORE_THR,
            format='xyxy',
            dataset=self.model.cfg.data['test']['type'],
            dataset_info=self.dataset_info,
            return_heatmap=False,
            outputs=None)

        if not pose_results:
            return np.array([])

        # Assuming single person: only the first result is reported
        return pose_results[0]['keypoints']

# ViTPose Model using MMPose
class ViTPoseModel:
    """
    Top-down pose estimator: an MMDetection detector proposes person boxes
    and a ViTPose model (loaded through MMPose) predicts keypoints in them.

    Construction downloads the pose and detector config/checkpoint files into
    the working directory (files already present are reused) and loads both
    networks on CUDA when available, otherwise on CPU.

    NOTE(review): the pose config comes from the ViTPose repository; confirm
    the installed mmpose can build the model it describes, and that neither
    config relies on `_base_` files that would also have to exist locally.
    """

    # Minimum detector score for a box to be forwarded to the pose model
    BBOX_SCORE_THR = 0.5

    def __init__(self):
        """
        Download all required files and initialise both networks.

        Raises:
            ValueError: If the pose config carries no ``dataset_info``.
        """
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Download the config and checkpoint files
        config_url = 'https://raw.githubusercontent.com/ViTAE-Transformer/ViTPose/main/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitpose_base_coco_256x192.py'
        checkpoint_url = 'https://github.com/ViTAE-Transformer/ViTPose/releases/download/v0.0.1/vitpose-b-multi-coco.pth'
        config_path = 'vitpose_base_coco_256x192.py'
        checkpoint_path = 'vitpose_base_coco.pth'
        download_file(config_url, config_path)
        download_file(checkpoint_url, checkpoint_path)

        # Initialize the pose model
        self.model = init_pose_model(config_path, checkpoint_path, device=device)
        dataset_info = self.model.cfg.data['test'].get('dataset_info', None)
        if dataset_info is None:
            raise ValueError("Dataset info is missing in the config file.")
        self.dataset_info = DatasetInfo(dataset_info)

        # Initialize a person detector (Faster R-CNN, per the files below)
        detector_config_url = 'https://raw.githubusercontent.com/open-mmlab/mmdetection/master/configs/faster_rcnn/faster_rcnn_r50_fpn_coco.py'
        detector_checkpoint_url = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
        detector_config_path = 'faster_rcnn_r50_fpn_coco.py'
        detector_checkpoint_path = 'faster_rcnn_r50_fpn_coco.pth'
        download_file(detector_config_url, detector_config_path)
        download_file(detector_checkpoint_url, detector_checkpoint_path)

        self.detector = init_detector(detector_config_path, detector_checkpoint_path, device=device)

    def predict(self, image):
        """
        Detect people in ``image`` and estimate the pose of the first one.

        Args:
            image: Image passed unchanged to the detector and the pose model.

        Returns:
            The ``'keypoints'`` entry of the first pose result, or an empty
            ``np.ndarray`` when no person box reaches ``BBOX_SCORE_THR`` or
            the pose model returns nothing.
        """
        # Detect people in the image
        mmdet_results = inference_detector(self.detector, image)

        # Keep only person class (class_id = 0). The whole row, including
        # the score at index 4, is forwarded: with ``bbox_thr`` set, MMPose
        # expects every box to carry its score as a fifth value.
        person_results = [
            {'bbox': bbox}
            for bbox in mmdet_results[0]
            if bbox[4] >= self.BBOX_SCORE_THR
        ]
        if not person_results:
            return np.array([])

        # Run pose estimation
        pose_results, _ = inference_top_down_pose_model(
            self.model,
            image,
            person_results,
            bbox_thr=self.BBOX_SCORE_THR,
            format='xyxy',
            dataset=self.model.cfg.data['test']['type'],
            dataset_info=self.dataset_info,
            return_heatmap=False,
            outputs=None)

        if not pose_results:
            return np.array([])

        # Assuming single person: only the first result is reported
        return pose_results[0]['keypoints']

def compute_head_size(gt_keypoints):
    """
    Measure the head size as the distance between two annotated joints.

    Args:
        gt_keypoints: Iterable of rows ``(x, y, joint_id)``. The first row
            with id 9 is taken as the head top and the first row with id 8
            as the upper neck.

    Returns:
        The Euclidean distance between those two joints, or ``1.0`` when
        either of them is absent.
    """
    def first_row_with_id(joint_id):
        # Linear scan that stops at the first matching row
        for row in gt_keypoints:
            if row[2] == joint_id:
                return row
        return None

    top = first_row_with_id(9)
    neck = first_row_with_id(8)

    # Default value if head keypoints are missing
    if top is None or neck is None:
        return 1.0

    return np.linalg.norm([top[0] - neck[0], top[1] - neck[1]])

def pckh(predictions, ground_truths, head_sizes):
    """
    Computes the Percentage of Correct Keypoints normalized by head size (PCKh).

    A predicted joint counts as correct when its distance to the matching
    ground-truth joint, divided by the sample's head size, is below 0.5.

    Prediction row ``i`` is compared with the ground-truth joint whose id is
    ``i``. NOTE(review): this presumes every model emits keypoints in the
    ground truth's id order -- confirm for each model being evaluated.

    Args:
        predictions: Per-sample arrays whose rows start with ``(x, y)``; an
            empty array marks a sample without a prediction.
        ground_truths: Per-sample arrays with rows ``(x, y, joint_id)``.
        head_sizes: Per-sample normalisation lengths.

    Returns:
        Fraction of compared joints that are correct, or ``0`` when no joint
        could be compared. Samples with an empty prediction, an empty ground
        truth, or a head size that is not strictly positive are left out.
    """
    correct = 0
    total = 0

    for pred, gt, head_size in zip(predictions, ground_truths, head_sizes):
        if pred.shape[0] == 0 or gt.shape[0] == 0:
            continue
        # A zero, negative or NaN head size cannot normalise a distance;
        # dividing by it would silently mark every joint as wrong.
        if not head_size > 0:
            continue

        # Align keypoints by id
        gt_by_id = {int(kp[2]): kp[:2] for kp in gt}
        for joint_id, gt_xy in gt_by_id.items():
            # Only ids that index an existing prediction row are compared
            if not 0 <= joint_id < pred.shape[0]:
                continue
            error = np.linalg.norm(pred[joint_id][:2] - gt_xy) / head_size
            if error < 0.5:
                correct += 1
            total += 1

    return correct / total if total > 0 else 0

def _ground_truth_keypoints(sample, width, height):
    """Return the first annotated person's joints as rows ``(x_px, y_px, joint_id)``."""
    annotations = sample.annopoints
    if not annotations or not annotations.keypoints:
        return np.array([])

    rows = []
    for joint_id, point in enumerate(annotations.keypoints[0].points):
        x, y = point[0], point[1]
        # Joints without a usable coordinate are left out of the ground truth
        if x is None or y is None or not (np.isfinite(x) and np.isfinite(y)):
            continue
        rows.append([x * width, y * height, joint_id])
    return np.array(rows)


def evaluate_model(model, dataset_dir):
    """
    Run ``model`` over an exported FiftyOne dataset and report accuracy and speed.

    Ground truth is read as ``sample.annopoints.keypoints[0].points`` -- the
    same access pattern ``filter_single_person_facing_camera`` uses -- with
    the position of a point in that list taken as its joint id. Coordinates
    are scaled by the image width/height so they can be compared with the
    pixel-space predictions.
    NOTE(review): this assumes the stored points are relative [0, 1]
    coordinates (the FiftyOne keypoint convention) listed in MPII joint
    order -- confirm against the exported dataset.

    Args:
        model: Object with a ``predict(image)`` method returning an array of
            keypoint rows, or an empty array when nothing was found.
        dataset_dir: Directory holding a ``FiftyOneDataset`` export.

    Returns:
        dict: ``{"PCKh": <value from pckh()>, "FPS": <1 / mean predict time>}``.
        ``FPS`` is ``inf`` when the mean time is not positive, which includes
        the case where no sample could be evaluated.
    """
    predictions, ground_truths, inference_times, head_sizes = [], [], [], []

    # Load the dataset
    dataset = fo.Dataset.from_dir(
        dataset_dir=dataset_dir,
        dataset_type=fo.types.FiftyOneDataset,
    )

    for sample in tqdm(dataset):
        image = cv2.imread(sample.filepath)
        # cv2.imread signals an unreadable file by returning None
        if image is None:
            print(f"Skipping unreadable image: {sample.filepath}")
            continue

        height, width = image.shape[:2]
        gt_keypoints = _ground_truth_keypoints(sample, width, height)
        if gt_keypoints.shape[0] == 0:
            # Nothing to score against
            continue

        # Compute head size (distance between head top and upper neck)
        head_size = compute_head_size(gt_keypoints)

        # perf_counter is a monotonic clock intended for measuring intervals
        start_time = time.perf_counter()
        pred_keypoints = model.predict(image)
        elapsed = time.perf_counter() - start_time

        predictions.append(pred_keypoints)
        ground_truths.append(gt_keypoints)
        head_sizes.append(head_size)
        inference_times.append(elapsed)

    # Avoid np.mean([]) (NaN plus a RuntimeWarning) when nothing was evaluated
    avg_time = np.mean(inference_times) if inference_times else 0.0
    fps = 1 / avg_time if avg_time > 0 else float("inf")
    accuracy = pckh(predictions, ground_truths, head_sizes)

    return {"PCKh": accuracy, "FPS": fps}


In [None]:
if __name__ == "__main__":
    dataset_dir = "filtered_mpii"

    def _announce_and_build(message, factory):
        # Print the progress line, then construct the model
        print(message)
        return factory()

    def _announce_and_score(message, label, model):
        # Print the progress line, evaluate the model, then print its metrics
        print(message)
        metrics = evaluate_model(model, dataset_dir)
        print(label, metrics)
        return metrics

    # Initialize models (all three are built before any evaluation starts)
    mediapipe_model = _announce_and_build("Initializing MediaPipePoseModel...", MediaPipePoseModel)
    lite_hrnet_model = _announce_and_build("Initializing LiteHRNetModel...", LiteHRNetModel)
    vitpose_model = _announce_and_build("Initializing ViTPoseModel...", ViTPoseModel)

    # Evaluate each model, in the same order they were created
    mediapipe_metrics = _announce_and_score("Evaluating MediaPipe...", "MediaPipe Metrics:", mediapipe_model)
    lite_hrnet_metrics = _announce_and_score("Evaluating Lite-HRNet...", "Lite-HRNet Metrics:", lite_hrnet_model)
    vitpose_metrics = _announce_and_score("Evaluating ViTPose...", "ViTPose Metrics:", vitpose_model)
