##### Copyright 2023 The MediaPipe Authors. All Rights Reserved.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gesture Recognizer with MediaPipe Tasks

This notebook shows you how to use the MediaPipe Tasks Python API to recognize hand gestures in still images and, in the later sections, in a live webcam stream.

## Preparation

Let's start with installing MediaPipe.

In [None]:
!pip install -q mediapipe==0.10.0

Then download an off-the-shelf model. This model can recognize 7 hand gestures: 👍, 👎, ✌️, ☝️, ✊, 👋, 🤟

Check out the [MediaPipe documentation](https://developers.google.com/mediapipe/solutions/vision/gesture_recognizer#models) for more details about the model.

In [None]:
!wget -q https://storage.googleapis.com/mediapipe-models/gesture_recognizer/gesture_recognizer/float16/1/gesture_recognizer.task

## Visualization Utilities

In [None]:
#@markdown We implemented some functions to visualize the gesture recognition results. <br/> Run the following cell to activate the functions.
import math

from matplotlib import pyplot as plt
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2

# Switch off every spine, tick mark and tick label so that figures show
# nothing but the images themselves.
plt.rcParams.update({
    rc_key: False
    for rc_key in (
        'axes.spines.top',
        'axes.spines.right',
        'axes.spines.left',
        'axes.spines.bottom',
        'xtick.labelbottom',
        'xtick.bottom',
        'ytick.labelleft',
        'ytick.left',
        'xtick.labeltop',
        'xtick.top',
        'ytick.labelright',
        'ytick.right',
    )
})

# Short aliases for the MediaPipe modules used by the drawing code below.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles


def display_one_image(image, title, subplot, titlesize=16):
    """Draws `image` into one cell of the current figure's subplot grid.

    Args:
      image: Image handed to `plt.imshow`.
      title: Caption for the cell; no title is set when it has length zero.
      subplot: `(rows, cols, index)` triple selecting the grid cell.
      titlesize: Font size of the caption; also determines its padding.

    Returns:
      `(rows, cols, index + 1)`, i.e. the triple addressing the next cell.
    """
    plt.subplot(*subplot)
    plt.imshow(image)

    has_title = len(title) > 0
    if has_title:
        plt.title(
            title,
            fontsize=int(titlesize),
            color='black',
            fontdict={'verticalalignment':'center'},
            pad=int(titlesize/1.5),
        )

    n_rows, n_cols, position = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, position+1)


def display_batch_of_images_with_gestures_and_hand_landmarks(images, results):
    """Displays a batch of images with the gesture category and its score along with the hand landmarks.

    Args:
      images: Sequence of image objects exposing `numpy_view()`.
      results: Sequence of `(top_gesture, multi_hand_landmarks)` tuples, one
        per image. `top_gesture` must provide `category_name` and `score`;
        `multi_hand_landmarks` is an iterable of landmark lists whose items
        provide `x`, `y` and `z`.

    An empty batch is a no-op. Images that do not fit into the near-square
    grid are dropped.
    """
    if len(images) == 0:
        # With no images the grid would have zero rows and the column
        # computation below would divide by zero.
        return

    # Images and labels.
    frames = [image.numpy_view() for image in images]
    top_gestures = [top_gesture for (top_gesture, _) in results]
    multi_hand_landmarks_list = [multi_hand_landmarks for (_, multi_hand_landmarks) in results]

    # Auto-squaring: this will drop data that does not fit into square or square-ish rectangle.
    rows = int(math.sqrt(len(frames)))
    cols = len(frames) // rows

    # Size and spacing.
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols, FIGSIZE))

    # The title size depends only on the grid shape, so compute it once.
    dynamic_titlesize = FIGSIZE*SPACING/max(rows,cols) * 40 + 3

    # Display gestures and hand landmarks.
    for i, (frame, gesture) in enumerate(zip(frames[:rows*cols], top_gestures[:rows*cols])):
        title = f"{gesture.category_name} ({gesture.score:.2f})"
        annotated_image = frame.copy()

        for hand_landmarks in multi_hand_landmarks_list[i]:
            # draw_landmarks expects a NormalizedLandmarkList proto, so the
            # landmark objects are copied into one.
            hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
            hand_landmarks_proto.landmark.extend([
                landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z)
                for landmark in hand_landmarks
            ])

            mp_drawing.draw_landmarks(
                annotated_image,
                hand_landmarks_proto,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing_styles.get_default_hand_landmarks_style(),
                mp_drawing_styles.get_default_hand_connections_style())

        subplot = display_one_image(annotated_image, title, subplot, titlesize=dynamic_titlesize)

    # Layout.
    plt.tight_layout()
    plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

## Download test images

Let's grab some test images that we'll use later. The images ([1](https://pixabay.com/photos/idea-pointing-raise-hand-raise-3082824/), [2](https://pixabay.com/photos/thumbs-up-happy-positive-woman-2649310/), [3](https://pixabay.com/photos/epidemic-disease-coronavirus-5082474/), [4](https://pixabay.com/photos/thumbs-down-disapprove-gesture-6744094/)) are from Pixabay.

In [None]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import the submodule explicitly.
import urllib.request

IMAGE_FILENAMES = ['thumbs_down.jpg', 'victory.jpg', 'thumbs_up.jpg', 'pointing_up.jpg']

# Fetch every test image into the working directory under its own file name.
for name in IMAGE_FILENAMES:
  url = f'https://storage.googleapis.com/mediapipe-tasks/gesture_recognizer/{name}'
  urllib.request.urlretrieve(url, name)

Optionally, you can upload your own image. If you want to do so, uncomment and run the cell below.

In [None]:
# from google.colab import files
# uploaded = files.upload()

# for filename in uploaded:
#   content = uploaded[filename]
#   with open(filename, 'wb') as f:
#     f.write(content)
# IMAGE_FILENAMES = list(uploaded.keys())

# print('Uploaded files:', IMAGE_FILENAMES)

Then let's check out the images.

In [None]:
import cv2

import math

# Upper bounds, in pixels, for the preview size.
DESIRED_HEIGHT = 480
DESIRED_WIDTH = 480

def resize_and_show(image):
  """Shows `image` scaled to the preview size in a blocking window.

  Landscape images are scaled to DESIRED_WIDTH, all others to DESIRED_HEIGHT;
  the other side follows from the aspect ratio. The window named "test" stays
  open until a key is pressed and is then closed.
  """
  height, width = image.shape[:2]
  is_landscape = height < width
  if is_landscape:
    target_size = (DESIRED_WIDTH, math.floor(height/(width/DESIRED_WIDTH)))
  else:
    target_size = (math.floor(width/(height/DESIRED_HEIGHT)), DESIRED_HEIGHT)
  preview = cv2.resize(image, target_size)
  cv2.imshow("test", preview)
  # Block until any key is pressed, then tear the window down again.
  cv2.waitKey(0)
  cv2.destroyAllWindows()

# Preview the images.
images = {name: cv2.imread(name) for name in IMAGE_FILENAMES}
for name, image in images.items():
  print(name)
  if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so check before touching the image.
    print(f"  could not read '{name}', skipping preview")
    continue
  resize_and_show(image)

## Running inference and visualizing the results

Here are the steps to run gesture recognizer using MediaPipe.

Check out the [MediaPipe documentation](https://developers.google.com/mediapipe/solutions/vision/gesture_recognizer/python) to learn more about configuration options that this solution supports.

*Note: Gesture Recognizer also returns the hand landmark it detects from the image, together with other useful information such as whether the hand(s) detected are left hand or right hand.*

In [1]:
# STEP 1: Import the necessary modules.
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

# STEP 2: Create a GestureRecognizer object.
base_options = python.BaseOptions(model_asset_path='gesture_recognizer.task')
options = vision.GestureRecognizerOptions(base_options=base_options)

images = []
results = []
# The `with` block closes the recognizer once all images are processed.
with vision.GestureRecognizer.create_from_options(options) as recognizer:
  for image_file_name in IMAGE_FILENAMES:
    # STEP 3: Load the input image.
    image = mp.Image.create_from_file(image_file_name)

    # STEP 4: Recognize gestures in the input image.
    recognition_result = recognizer.recognize(image)

    # STEP 5: Process the result. In this case, visualize it.
    if not recognition_result.gestures:
      # Without a detected hand there is no top gesture to index into.
      print(f"No hand gesture detected in '{image_file_name}', skipping.")
      continue
    images.append(image)
    top_gesture = recognition_result.gestures[0][0]
    hand_landmarks = recognition_result.hand_landmarks
    results.append((top_gesture, hand_landmarks))

# Only lay out a figure when at least one image produced a result.
if images:
  display_batch_of_images_with_gestures_and_hand_landmarks(images, results)

2023-09-10 01:47:03.717100: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-10 01:47:03.737585: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-10 01:47:03.929409: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-10 01:47:03.931446: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
W20230910 01:47:05.854771 941189 gesture_recognizer_graph.cc:128] Hand Gesture Recognizer contains CPU only ops. 

NameError: name 'IMAGE_FILENAMES' is not defined

## Video stream hand detection using the image API

In [1]:
def cropToSmallestSide(image):
  """Returns the centered square crop whose side equals the shorter image side.

  Args:
    image: numpy.ndarray of shape (h, w) or (h, w, channels).

  Returns:
    A new array holding the central `min(h, w)` x `min(h, w)` region. When the
    difference between the two sides is odd, the extra pixel is dropped from
    the far (right/bottom) edge so that the result is always square.
  """
  img_h, img_w = image.shape[:2]
  frame_size = min(img_w, img_h)
  # frame_size never exceeds either side, so both offsets are non-negative.
  crop_w = (img_w - frame_size) // 2
  crop_h = (img_h - frame_size) // 2
  ### the .copy is important, as slicing up the image makes it unviable for mp.Image constructor
  return image[crop_h:crop_h + frame_size, crop_w:crop_w + frame_size].copy()

def zoom_at(img, zoom, coord=None):
    """
    Scales `img` by `zoom` and cuts out a window of the original size.

    The window is centered at "coord" (given in coordinates of the unscaled
    image) or, when no coord is given, at the image center. No boundary
    checking is done on the window.
    img: numpy.ndarray of shape (h,w,:)
    zoom: float
    coord: (float, float)
    """
    # Side lengths of the image after scaling.
    scaled_h, scaled_w, _ = (zoom * side for side in img.shape)

    # Window center, expressed in coordinates of the scaled image.
    if coord is None:
        cx, cy = scaled_w/2, scaled_h/2
    else:
        cx, cy = (zoom*c for c in coord)

    scaled = cv2.resize(img, (0, 0), fx=zoom, fy=zoom)

    # Half the window extent; the full window has the size of the input image.
    half_h = scaled_h/zoom * .5
    half_w = scaled_w/zoom * .5
    top, bottom = int(round(cy - half_h)), int(round(cy + half_h))
    left, right = int(round(cx - half_w)), int(round(cx + half_w))
    return scaled[top:bottom, left:right, :].copy()

In [2]:
# STEP 0: Import the necessary modules.
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from mediapipe.framework.formats import landmark_pb2
import cv2
import numpy as np

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

def display(image, top_gestures, hands_landmarks, handedness):
  """Shows `image` in the "Video" window with hand landmarks and gesture titles.

  Args:
    image: numpy image; it is copied, so the caller's array is not modified.
    top_gestures: One gesture per hand, each providing `category_name` and
      `score`.
    hands_landmarks: One landmark list per hand (items provide `x`, `y`, `z`),
      or None.
    handedness: One category list per hand whose first entry's
      `category_name` is compared against "Left" and "Right", or None.

  Nothing is drawn unless both `hands_landmarks` and `top_gestures` are
  non-empty; the image is shown either way.
  """
  annotated_image = image.copy()
  ### Display gestures and hand landmarks.
  if hands_landmarks and top_gestures:
    ### create skeleton and landmark
    for hand_landmarks in hands_landmarks:
      hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
      hand_landmarks_proto.landmark.extend([
        landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
      ])
      ### draw landmark
      mp_drawing.draw_landmarks(
        annotated_image,
        hand_landmarks_proto,
        mp_hands.HAND_CONNECTIONS,
        mp_drawing_styles.get_default_hand_landmarks_style(),
        mp_drawing_styles.get_default_hand_connections_style())
    ### draw titles once, after all skeletons, so that they stay on top
    if handedness:
      ### remember which category (left, right) is which index in the hand landmark list
      side_d = {c[0].category_name : i for i, c in enumerate(handedness)}
      font = cv2.FONT_HERSHEY_SIMPLEX
      for side, prefix, origin in (("Left", "L", (10,30)), ("Right", "R", (400,30))):
        hand_index = side_d.get(side)
        ### skip sides that were not detected or that have no matching gesture
        if hand_index is None or hand_index >= len(top_gestures):
          continue
        gesture = top_gestures[hand_index]
        title = f"{prefix}:{gesture.category_name} ({gesture.score:.2f})"
        cv2.putText(annotated_image, title, origin, font, 0.7, (0, 255, 0), 2, cv2.LINE_AA)
  cv2.imshow("Video", annotated_image)


# STEP 1: camera input
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
# STEP 2: Create a GestureRecognizer object.
base_options = python.BaseOptions(model_asset_path='gesture_recognizer.task')
options = vision.GestureRecognizerOptions(base_options=base_options,
                                          num_hands=2,
                                          min_hand_detection_confidence=0.1,
                                          min_hand_presence_confidence=0.1,
                                          min_tracking_confidence=0.1,
)
recognizer = vision.GestureRecognizer.create_from_options(options)

try:
  while cap.isOpened():
    # STEP 3: Load the input image.
    success, image = cap.read()
    if success:
      ### flip image, so that hand sides are correct
      image = cv2.flip(image, 1)
      ### OpenCV delivers frames in BGR channel order, while mp.ImageFormat.SRGB
      ### declares RGB, so convert before handing the frame to the recognizer
      rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
      mpImage = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_image) ### numpy image to mpImage
      # STEP 4: Recognize gestures in the input image.
      recognition_result = recognizer.recognize(mpImage)
      # STEP 5: Process the result. In this case, visualize it.
      top_gestures = []
      hands_landmarks = None
      if recognition_result.gestures and recognition_result.hand_landmarks:
        top_gestures = [top[0] for top in recognition_result.gestures]
        hands_landmarks = recognition_result.hand_landmarks
      ### draw on the BGR frame, because cv2.imshow expects BGR
      display(image, top_gestures, hands_landmarks, recognition_result.handedness)
    ### exit condition is random keypress; checked on every pass, also after a failed read
    if cv2.waitKey(1) != -1:
      break
finally:
  ### release the windows, the camera and the recognizer even if a frame raised
  cv2.destroyAllWindows()
  cap.release()
  recognizer.close()
print("Done")


2023-09-10 02:33:00.502557: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-10 02:33:00.523941: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-10 02:33:00.731926: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-10 02:33:00.734673: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
W20230910 02:33:02.758947 945647 gesture_recognizer_graph.cc:128] Hand Gesture Recognizer contains CPU only ops. 

Done


## Video stream hand detection using the live-stream API

In [55]:
# STEP 0: Import the necessary modules.
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from mediapipe.framework.formats import landmark_pb2
import cv2
import numpy as np

class GestureRecognizer:
  """Recognizes hand gestures on camera frames in live-stream mode.

  Frames are submitted with `update`; MediaPipe delivers results through the
  `process_frame` callback, and `update` annotates each frame with the most
  recent result that has arrived so far (which may stem from an earlier frame).
  """

  def __init__(self, num_hands=2):
    """Creates the live-stream recognizer.

    Args:
      num_hands: Maximum number of hands to detect.
    """
    self.mp_hands = mp.solutions.hands
    self.mp_drawing = mp.solutions.drawing_utils
    self.mp_drawing_styles = mp.solutions.drawing_styles
    ### frame counter, passed to recognize_async as the timestamp
    self.stamp = 0
    ### latest result delivered by the callback; set before the recognizer exists
    self.results = None
    # STEP 2: Create a GestureRecognizer object.
    VisionRunningMode = mp.tasks.vision.RunningMode
    base_options = python.BaseOptions(model_asset_path='gesture_recognizer.task')
    options = vision.GestureRecognizerOptions(base_options=base_options,
                                              running_mode=VisionRunningMode.LIVE_STREAM,
                                              num_hands=num_hands,
                                              min_hand_detection_confidence=0.1,
                                              min_hand_presence_confidence=0.8,
                                              min_tracking_confidence=0.1,
                                              result_callback=self.process_frame
    )
    self.recognizer = vision.GestureRecognizer.create_from_options(options)

  def process_frame(self, result: vision.GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    """Result callback: remembers the latest recognition result."""
    self.results = result

  def close(self):
    """Closes the underlying MediaPipe recognizer."""
    self.recognizer.close()

  def update(self, image):
    """Submits one BGR frame for recognition and returns it annotated.

    Args:
      image: Frame as delivered by OpenCV (BGR channel order).

    Returns:
      The mirrored, center-cropped frame (still BGR) with the latest available
      landmarks and gesture titles drawn on it.
    """
    ### flip image, so that hand sides are correct
    image = cv2.flip(image, 1)
    ### crop image to nearest side, so that it gets square and the focus is in the middle of the image
    cropped = cropToSmallestSide(image)
    ### OpenCV frames are BGR, while mp.ImageFormat.SRGB declares RGB channel order
    rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
    mpImage = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb) ### numpy image to mpImage
    # recognize gestures in the input image.
    self.recognizer.recognize_async(mpImage, self.stamp)
    self.stamp = self.stamp + 1
    ### read the shared result once: the callback may replace self.results at
    ### any time, and gestures/landmarks/handedness must come from one result
    results = self.results
    top_gestures = []
    hands_landmarks = None
    handedness = None
    if results:
      top_gestures = [top[0] for top in results.gestures]
      hands_landmarks = results.hand_landmarks
      handedness = results.handedness
    ### draw on the BGR crop, so the returned image can go straight to cv2.imshow
    return self.draw(cropped, top_gestures, hands_landmarks, handedness)

  def draw(self, image, top_gestures, hands_landmarks, handedness):
    """Returns a copy of `image` with hand landmarks and gesture titles drawn.

    Nothing is drawn unless both `hands_landmarks` and `top_gestures` are
    non-empty.
    """
    annotated_image = image.copy()
    ### Display gestures and hand landmarks.
    if hands_landmarks and top_gestures:
      ### create skeleton and landmark
      for hand_landmarks in hands_landmarks:
        hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
        hand_landmarks_proto.landmark.extend([
          landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
        ])
        ### draw landmark
        self.mp_drawing.draw_landmarks(
          annotated_image,
          hand_landmarks_proto,
          self.mp_hands.HAND_CONNECTIONS,
          self.mp_drawing_styles.get_default_hand_landmarks_style(),
          self.mp_drawing_styles.get_default_hand_connections_style())
      ### draw titles once, after all skeletons, so that they stay on top
      if handedness:
        ### remember which category (left, right) is which index in the hand landmark list
        side_d = {c[0].category_name : i for i, c in enumerate(handedness)}
        font = cv2.FONT_HERSHEY_SIMPLEX
        for side, prefix, origin in (("Left", "L", (10,30)), ("Right", "R", (400,30))):
          hand_index = side_d.get(side)
          ### skip sides that were not detected or that have no matching gesture
          if hand_index is None or hand_index >= len(top_gestures):
            continue
          gesture = top_gestures[hand_index]
          title = f"{prefix}:{gesture.category_name} ({gesture.score:.2f})"
          cv2.putText(annotated_image, title, origin, font, 0.7, (0, 255, 0), 2, cv2.LINE_AA)
    return annotated_image

###
### main
fps = 20
### instantiate gesture recognizer
r = GestureRecognizer()
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
cap.set(cv2.CAP_PROP_FPS, fps)
try:
  ### read images from camera and feed into recognizer class
  while cap.isOpened():
    success, image = cap.read()
    if success:
      resultImage = r.update(image)
      cv2.imshow("Video", resultImage)
    ### exit condition is random keypress
    if cv2.waitKey(1) != -1:
      break
finally:
  ### release the windows, the camera and the recognizer even if a frame raised
  cv2.destroyAllWindows()
  cap.release()
  r.recognizer.close()
print("Done")


W20230910 03:13:17.112682 945647 gesture_recognizer_graph.cc:128] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I20230910 03:13:17.114991 945647 hand_gesture_recognizer_graph.cc:249] Custom gesture classifier is not defined.


Done


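## Human pose detector

The cell below loads `pose_landmarker.task` from the working directory. This notebook does not download that file, so fetch a Pose Landmarker model bundle (see the MediaPipe Pose Landmarker documentation) and save it under that name before running the cell.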

In [4]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np
import cv2

class HumanPoseDetector:
  """Detects human poses on camera frames in live-stream mode.

  Frames are submitted with `update`; MediaPipe delivers results through the
  `process_frame` callback, and `update` works with the most recent result
  that has arrived so far (which may stem from an earlier frame).
  """

  def __init__(self):
    """Creates the live-stream pose landmarker from 'pose_landmarker.task'."""
    ### frame counter, passed to detect_async as the timestamp
    self.stamp = 0
    ### latest result delivered by the callback; set before the detector exists
    self.results = None
    ### create a PoseLandmarker object.
    VisionRunningMode = mp.tasks.vision.RunningMode
    base_options = python.BaseOptions(model_asset_path='pose_landmarker.task')
    options = vision.PoseLandmarkerOptions(
        base_options=base_options,
        running_mode=VisionRunningMode.LIVE_STREAM,
        output_segmentation_masks=True,
        result_callback=self.process_frame
    )
    self.detector = vision.PoseLandmarker.create_from_options(options)

  def process_frame(self, result: vision.PoseLandmarkerResult, output_image: mp.Image, timestamp_ms: int):
    """Result callback: remembers the latest detection result."""
    self.results = result

  def close(self):
    """Closes the underlying MediaPipe pose landmarker."""
    self.detector.close()

  def update(self, image):
    """Submits one BGR frame for detection.

    Args:
      image: Frame as delivered by OpenCV (BGR channel order).

    Returns:
      `(resultImage, maskImage)`: a copy of the frame with the latest
      available pose landmarks drawn on it, and the mask from `createMask`.
    """
    ### OpenCV frames are BGR, while mp.ImageFormat.SRGB declares RGB channel order
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    mpImage = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb) ### numpy image to mpImage
    # detect poses in the input image.
    self.detector.detect_async(mpImage, self.stamp)
    self.stamp = self.stamp + 1
    ### read the shared result once: the callback may replace self.results at
    ### any time, and the drawing and the mask must come from the same result
    results = self.results
    resultImage = self.draw(image, results)
    maskImage = self.createMask(image, results)
    return resultImage, maskImage

  def draw(self, image, detection_result):
    """Returns a copy of `image` with the landmarks of every detected pose."""
    annotated_image = image.copy()
    if detection_result:
      # Loop through the detected poses to visualize.
      for pose_landmarks in detection_result.pose_landmarks:
        # Draw the pose landmarks.
        pose_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
        pose_landmarks_proto.landmark.extend([
          landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in pose_landmarks
        ])
        solutions.drawing_utils.draw_landmarks(
          annotated_image,
          pose_landmarks_proto,
          solutions.pose.POSE_CONNECTIONS,
          solutions.drawing_styles.get_default_pose_landmarks_style())
    return annotated_image

  def createMask(self, image, detection_result):
    """Builds a single-channel uint8 mask for `image`.

    When `detection_result` carries at least one segmentation mask, the first
    one is converted to uint8 and dilated. Otherwise an all-255 mask with the
    height and width of `image` is returned.
    """
    ### getattr also covers detection_result being None (no result yet)
    masks = getattr(detection_result, "segmentation_masks", None)
    if masks:
      segmentation_mask = masks[0].numpy_view()
      visualized_mask = cv2.convertScaleAbs(segmentation_mask, alpha=1.00)
      ### inflate mask
      kernel = np.ones((25,25),np.uint8)
      return cv2.dilate(visualized_mask, kernel, iterations=5)
    ### no mask available: create a white image based on the input image
    img_h, img_w = image.shape[:2]
    return np.full((img_h, img_w), 255, dtype=np.uint8)
#############################

fps = 20
### instantiate pose detector
d = HumanPoseDetector()
cap = cv2.VideoCapture(0)
# cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
# cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
cap.set(cv2.CAP_PROP_FPS, fps)
try:
  ### read images from camera and feed into detector class
  while cap.isOpened():
    success, image = cap.read()
    if success:
      human, mask = d.update(image)
      result=human
      # result = cv2.bitwise_and(human, human, mask=mask)
      ### for some reason, some cv2 operations flip the colors, so we flip them back
      ### NOTE(review): this reverses the channel order right before cv2.imshow - confirm it is still wanted
      result  = result[:, :, ::-1]
      cv2.imshow("ResultMasked", result)
    ### exit condition is random keypress
    if cv2.waitKey(1) != -1:
      break
finally:
  ### release the windows, the camera and the detector even if a frame raised
  cv2.destroyAllWindows()
  cap.release()
  d.detector.close()
print("Done")


Done


## Combine pose detection with gesture detection

In [50]:
from mediapipe.python.solutions.drawing_utils import _normalized_to_pixel_coordinates
import cv2

def clamp(minimum, x, maximum):
    """Limits `x` to the range [minimum, maximum].

    The upper bound is applied first and the lower bound last, so `minimum`
    wins when the two bounds contradict each other.
    """
    capped = maximum if maximum < x else x
    return capped if capped > minimum else minimum

def showInMovedWindow(winname, img, x, y):
    """Shows `img` in the window `winname`, placed at screen position (x, y).

    The window is created on demand and moved on every call.
    """
    window_position = (x, y)
    cv2.namedWindow(winname)
    cv2.moveWindow(winname, *window_position)
    cv2.imshow(winname, img)

class HandRegion:
    """Square region around a hand, described by a center point and a radius.

    Attributes are documented here rather than on the assignments:
      center: (x, y) pixel position of the region center.
      r: Radius in pixels; the cropped square extends 1.75 * r to each side.
    """

    def __init__(self, center, r):
        self.center = center
        self.r = r

    def cropFrom(self, image):
        """Returns the part of `image` covered by this region.

        The square is cut off at the image borders, so the result can be
        smaller than 3.5 * r per side.

        Args:
            image: numpy.ndarray of shape (h, w) or (h, w, channels).

        Returns:
            A contiguous copy of the covered pixels; `image` is not modified.
        """
        x, y = self.center
        img_h, img_w = image.shape[:2]
        half_side = self.r * 1.75
        ### keep every edge inside the image
        w1 = max(0, min(int(x - half_side), img_w))
        w2 = max(0, min(int(x + half_side), img_w))
        h1 = max(0, min(int(y - half_side), img_h))
        h2 = max(0, min(int(y + half_side), img_h))
        ### copy only the cropped part; a bare slice would be a non-contiguous view
        return image[h1:h2, w1:w2].copy()

    def zoomInto(self, image):
        """Zooms into the center of `image` by a factor derived from the radius.

        The factor is sqrt(1 / r) * k with k = 10, limited to [1.5, 2.0]. It is
        therefore 1.5 for every radius above roughly 44 pixels.
        """
        ### adaptive zoom based on r with k = 10
        k = 10.0
        zoomfactor = clamp(1.5, (np.sqrt(1.0 / self.r) * k), 2.0)
        return zoom_at(image, zoomfactor)

class SpecialHandsOrientedHumanPoseDetector(HumanPoseDetector):
    """Pose detector that additionally locates a region around each hand.

    On top of the base class drawing, `draw` marks a bounding box around all
    visible pose landmarks and a circle around each hand, and reports the
    hands as `HandRegion` objects.
    """

    ### pose landmark indices, from:
    ### https://github.com/google/mediapipe/blob/master/docs/solutions/pose.md
    ### (15/16 = left/right wrist, 17/18 = left/right pinky, 19/20 = left/right index)
    LEFT_WRIST, RIGHT_WRIST = 15, 16
    LEFT_PINKY, RIGHT_PINKY = 17, 18
    ### the wrist-pinky distance times this safety factor gives the hand radius
    RADIUS_SAFETY_FACTOR = 4.5
    ### radius limits in pixels
    MIN_RADIUS, MAX_RADIUS = 100, 500

    def update(self, image):
        """Submits one BGR frame for detection.

        Returns:
            `(resultImage, maskImage, leftHand, rightHand)`: the annotated
            frame, the mask from `createMask`, and a `HandRegion` (or None)
            per hand, all based on the latest result that has arrived.
        """
        ### OpenCV frames are BGR, while mp.ImageFormat.SRGB declares RGB channel order
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        mpImage = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb) ### numpy image to mpImage
        # detect poses in the input image.
        self.detector.detect_async(mpImage, self.stamp)
        self.stamp = self.stamp + 1
        ### read the shared result once, the callback may replace it at any time
        results = self.results
        resultImage, leftHand, rightHand = self.draw(image, results)
        maskImage = self.createMask(image, results)
        return resultImage, maskImage, leftHand, rightHand

    def draw(self, image, detection_result):
        """Draws pose, bounding box and hand circles on a copy of `image`.

        Returns:
            `(annotated_image, leftHandRegion, rightHandRegion)`. A region is
            None when its wrist or pinky landmark has no pixel coordinate.
            With several poses, the last pose that yields a hand wins.
        """
        ### the base class draw already returns a fresh copy we can draw on
        hand_annotated_image = super().draw(image, detection_result)
        leftHandRegion = None
        rightHandRegion = None
        if not detection_result:
            return hand_annotated_image, leftHandRegion, rightHandRegion

        img_h, img_w = hand_annotated_image.shape[:2]
        for pose_landmarks in detection_result.pose_landmarks:
            ### entries are None for landmarks without a pixel coordinate
            coords = [_normalized_to_pixel_coordinates(l.x, l.y, img_w, img_h) for l in pose_landmarks]
            self._draw_bounding_box(hand_annotated_image, coords)

            left = self._hand_region(coords, self.LEFT_WRIST, self.LEFT_PINKY)
            if left is not None:
                cv2.circle(hand_annotated_image, left.center, left.r, (255,0,0), 2)
                leftHandRegion = left

            right = self._hand_region(coords, self.RIGHT_WRIST, self.RIGHT_PINKY)
            if right is not None:
                cv2.circle(hand_annotated_image, right.center, right.r, (0,255,0), 2)
                rightHandRegion = right

        return hand_annotated_image, leftHandRegion, rightHandRegion

    def _hand_region(self, coords, wrist_idx, pinky_idx):
        """Returns the HandRegion between wrist and pinky, or None if either is missing."""
        if max(wrist_idx, pinky_idx) >= len(coords):
            return None
        wrist, pinky = coords[wrist_idx], coords[pinky_idx]
        if wrist is None or pinky is None:
            return None
        ### center: point halfway between wrist and pinky
        center = np.mean([wrist, pinky], axis=0).astype(int)
        ### radius: scaled wrist-pinky distance, kept within the pixel limits
        distance = np.linalg.norm(np.array(wrist) - np.array(pinky))
        radius = clamp(self.MIN_RADIUS, int(distance * self.RADIUS_SAFETY_FACTOR), self.MAX_RADIUS)
        return HandRegion(center, radius)

    def _draw_bounding_box(self, image, coords):
        """Draws the minimum-area rectangle around all non-None coords onto `image`."""
        points = [c for c in coords if c is not None]
        if not points:
            return
        try:
            rect = cv2.minAreaRect(np.array(points, dtype=np.int32))
            box = np.intp(cv2.boxPoints(rect))
            cv2.drawContours(image, [box], 0, (0,0,255), 2)
        except cv2.error:
            ### the box is decoration only, so an OpenCV failure must not stop the frame
            pass


In [47]:
import cv2

fps = 20
### instantiate gesture recognizer
r = GestureRecognizer()
### instantiate normal human pose detector
# d = HumanPoseDetector()
### or instantiate special human pose detector which also draws circles around the hands
d = SpecialHandsOrientedHumanPoseDetector()
cap = cv2.VideoCapture(0)
# cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
# cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
cap.set(cv2.CAP_PROP_FPS, fps)
try:
  ### read images from camera and feed into detector and recognizer
  while cap.isOpened():
    success, image = cap.read()
    if success:
      ### crop image to nearest side, so that it gets square and the focus is in the middle of the image
      cropped = cropToSmallestSide(image)
      # zoomed = zoom_at(cropped, 1.7) #1.3
      # filtered = cv2.GaussianBlur(zoomed,(5,5),0)
      human, mask, leftHandRegion, rightHandRegion = d.update(cropped)
      ### use the masked image to mask out everything except for the human body
      # maskedHuman = cv2.bitwise_and(human, human, mask=mask)
      # resultImage = r.update(maskedHuman)
      ### or use the normal one without masking
      resultImage = r.update(human)
      ### show the zoomed hand crops
      if leftHandRegion is not None:
        leftHandImage = leftHandRegion.cropFrom(cropped)
        leftHandImage = leftHandRegion.zoomInto(leftHandImage)
        showInMovedWindow("LeftHandVideo",leftHandImage, 50, 10)
      if rightHandRegion is not None:
        rightHandImage = rightHandRegion.cropFrom(cropped)
        rightHandImage = rightHandRegion.zoomInto(rightHandImage)
        showInMovedWindow("RightHandVideo",rightHandImage, 1300, 10)
      ### draw combined annotated image
      showInMovedWindow("AnnotatedVideo", resultImage, 600, 10)
    ### exit condition is random keypress
    if cv2.waitKey(1) != -1:
      break
finally:
  ### release the windows, the camera and both models even if a frame raised
  cv2.destroyAllWindows()
  cap.release()
  r.recognizer.close()
  d.detector.close()
print("Done")

W20230910 03:00:07.092574 945647 gesture_recognizer_graph.cc:128] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I20230910 03:00:07.094765 945647 hand_gesture_recognizer_graph.cc:249] Custom gesture classifier is not defined.


Done


## Now use that combination to zoom in on the hands and apply the gesture classification to the hand crops only

In [56]:
import cv2

fps = 20
### instantiate one gesture recognizer per hand
r1 = GestureRecognizer(num_hands = 1)
r2 = GestureRecognizer(num_hands = 1)
### instantiate normal human pose detector
# d = HumanPoseDetector()
### or instantiate special human pose detector which also draws circles around the hands
d = SpecialHandsOrientedHumanPoseDetector()
cap = cv2.VideoCapture(0)
# cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
# cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
cap.set(cv2.CAP_PROP_FPS, fps)
try:
  ### read images from camera and feed into detector and recognizers
  while cap.isOpened():
    success, image = cap.read()
    if success:
      ### crop image to nearest side, so that it gets square and the focus is in the middle of the image
      cropped = cropToSmallestSide(image)
      # zoomed = zoom_at(cropped, 1.7) #1.3
      # filtered = cv2.GaussianBlur(zoomed,(5,5),0)
      human, mask, leftHandRegion, rightHandRegion = d.update(cropped)
      ### use the masked image to mask out everything except for the human body
      # maskedHuman = cv2.bitwise_and(human, human, mask=mask)
      # resultImage = r.update(maskedHuman)
      ### cut out the hands and feed the dedicated hand images into the gesture recognition;
      ### a hand without a region in this frame is skipped, so no undefined or
      ### stale crop from an earlier frame is ever classified
      for handRegion, handRecognizer, winname, window_x in (
          (leftHandRegion, r1, "LeftHandVideo", 50),
          (rightHandRegion, r2, "RightHandVideo", 1300),
      ):
        if handRegion is None:
          continue
        handImage = handRegion.cropFrom(cropped)
        if handImage.size == 0:
          ### an empty crop cannot be resized or classified
          continue
        handImage = handRegion.zoomInto(handImage)
        showInMovedWindow(winname, handRecognizer.update(handImage), window_x, 10)
      ### draw combined annotated image
      showInMovedWindow("AnnotatedVideo", human, 600, 10)
    ### exit condition is random keypress
    if cv2.waitKey(1) != -1:
      break
finally:
  ### release the windows, the camera and all models even if a frame raised
  cv2.destroyAllWindows()
  cap.release()
  r1.recognizer.close()
  r2.recognizer.close()
  d.detector.close()
print("Done")

W20230910 03:13:30.127545 945647 gesture_recognizer_graph.cc:128] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I20230910 03:13:30.130802 945647 hand_gesture_recognizer_graph.cc:249] Custom gesture classifier is not defined.
W20230910 03:13:30.170953 945647 gesture_recognizer_graph.cc:128] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I20230910 03:13:30.171610 945647 hand_gesture_recognizer_graph.cc:249] Custom gesture classifier is not defined.


Done
