In [2]:
# init mediapipe hand tracker (video version) // NOTE: model is pretty noisy, this might conflict with sam2 labeling
import mediapipe as mp

BaseOptions = mp.tasks.BaseOptions
HandLandmarker = mp.tasks.vision.HandLandmarker
HandLandmarkerOptions = mp.tasks.vision.HandLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Create a hand landmarker instance with the video mode:
num_hands = 2 # important assumption, carries into mediapipe and sam2
options = HandLandmarkerOptions(
    base_options=BaseOptions(model_asset_path='hand_landmarker.task'),
    num_hands=num_hands,
    min_hand_detection_confidence=0.5, # default
    min_hand_presence_confidence=0.5, # default
    min_tracking_confidence=0.5, # consider raising this
    running_mode=VisionRunningMode.VIDEO)

landmarker = HandLandmarker.create_from_options(options)

In [3]:
# visualizing annotated image
import cv2
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

MARGIN = 10  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green

def draw_landmarks_on_image(frame, result):
    hand_landmarks_list = result.hand_landmarks
    handedness_list = result.handedness

    # Loop through the detected hands to visualize.
    for idx in range(len(hand_landmarks_list)):
        hand_landmarks = hand_landmarks_list[idx]
        handedness = handedness_list[idx]


        # Draw the hand landmarks.
        hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
        hand_landmarks_proto.landmark.extend([
        landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
        ])
        
        solutions.drawing_utils.draw_landmarks(
            frame,
            hand_landmarks_proto,
            solutions.hands.HAND_CONNECTIONS,
            solutions.drawing_styles.get_default_hand_landmarks_style(),
            solutions.drawing_styles.get_default_hand_connections_style()
        )
    
    return frame

In [4]:
# build helper functions to convert mediapipe output to np.array (see sam2 demo)
import numpy as np

def detect(frame:np.ndarray, timestamp:int):
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
    result = landmarker.detect_for_video(mp_image, timestamp)
    return result
    # adds [[21 Landmarks], [21 Landmarks], ...]

In [5]:
# opencv
import cv2
import os
import time

cap = cv2.VideoCapture(os.path.join(os.getcwd(), 'test.mp4'))
num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# shape: (#landmarks per hand=21, #hands, #frames, #dimensions=3)
labels = np.zeros((num_frames, num_hands, 21, 3))

start_ms = int(time.time()*1000)
frame_idx = 0
while(cap.isOpened()):
    ret, frame = cap.read()
    if not ret:
        break

    cur_ms = int(time.time()*1000)
    # run function on test.mp4
    label = detect(frame, cur_ms)
    #label = format_detect(frame, time_idx)

    frame = draw_landmarks_on_image(frame, label)
    cv2.imshow('video', frame)

    formatted = [[(landmark.x, landmark.y, landmark.z) for landmark in hand] for hand in label.hand_landmarks]
    for i in range(len(formatted)):
        labels[frame_idx, i] = formatted[i]

    #labels[frame_idx] = label
    frame_idx += 1
    
    if cv2.waitKey(24) == ord('q'):
        break

#print(label)
#print(labels.shape)
print(f"finished playing after {int(cur_ms - start_ms)/1000} seconds")
cap.release()
cv2.destroyAllWindows()

finished playing after 11.19 seconds


In [10]:
# sam2 config (source: https://github.com/facebookresearch/sam2/blob/main/notebooks/video_predictor_example.ipynb)
import os
    # if using Apple MPS, fall back to CPU for unsupported ops
    #os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image

In [11]:
# select the device for computation
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"using device: {device}")

if device.type == "cuda":
    # use bfloat16 for the entire notebook
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif device.type == "mps":
    print(
        "\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA and might "
        "give numerically different outputs and sometimes degraded performance on MPS. "
        "See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion."
    )

using device: cpu


In [None]:
# using samv2 instead of sam2 bc i don't have a gpu on my laptop
# save this for later to do on pc

from sam2.build_sam import build_sam2
from sam2.utils.misc import variant_to_config_mapping
from sam2.sam2_video_predictor import SAM2VideoPredictor
from sam2.utils.visualization import show_masks

model = build_sam2(
    variant_to_config_mapping["large"],
    ckpt_path="./sam2/sam2_hiera_large.pt",
    device="cpu"
)
predictor = SAM2VideoPredictor(model)

TypeError: SAM2Base.__init__() missing 3 required positional arguments: 'image_encoder', 'memory_attention', and 'memory_encoder'

In [6]:
def show_mask(mask, ax, obj_id=None, random_color=False):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        cmap = plt.get_cmap("tab10")
        cmap_idx = 0 if obj_id is None else obj_id
        color = np.array([*cmap(cmap_idx)[:3], 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_points(coords, labels, ax, marker_size=200):
    pos_points = coords[labels==1]
    neg_points = coords[labels==0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)


def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))

In [7]:
# load video

In [None]:
inference_state = model.