In [1]:
import cv2
import csv
import numpy as np
import pandas as pd
import tensorflow as tf

from cv2.typing import MatLike

from non_maximum_suppression import non_max_suppression_fast

print(f'OpenCV: {cv2.__version__}')
print(f'CSV: {csv.__version__}')
print(f'Numpy: {np.__version__}')
print(f'Pandas: {pd.__version__}')
print(f'TensorFlow: {tf.__version__}')

OpenCV: 4.8.1
CSV: 1.0
Numpy: 1.26.2
Pandas: 2.1.1
TensorFlow: 2.15.0


In [2]:
WINDOW = 'Hand Tracking'
PALM_MODEL_PATH = '../models/palm_detection_full.tflite'
ANCHORS_PATH = '../MediaPipeSSDAnchors/anchors.csv'
DETECTION_THRESHOLD = 0.5
BOX_ENLARGE_FACTOR = 1.5
BOX_SHIFT_FACTOR = 0.2

In [3]:
csv_file = open(ANCHORS_PATH, 'r')
anchors = np.r_[
  [x for x in csv.reader(csv_file, quoting=csv.QUOTE_NONNUMERIC)]
]

print(f'Anchors shape: {anchors.shape}')
print(f'Anchors: {anchors[:5]}')

Anchors shape: (2016, 4)
Anchors: [[0.02083333 0.02083333 1.         1.        ]
 [0.02083333 0.02083333 1.         1.        ]
 [0.0625     0.02083333 1.         1.        ]
 [0.0625     0.02083333 1.         1.        ]
 [0.10416667 0.02083333 1.         1.        ]]


In [4]:
interpreter = tf.lite.Interpreter(model_path=PALM_MODEL_PATH)
interpreter.allocate_tensors()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [5]:
input_details = interpreter.get_input_details()

for detail in input_details:
    print(pd.DataFrame.from_dict(detail, orient='index'))

                                                                         0
name                                                               input_1
index                                                                    0
shape                                                     [1, 192, 192, 3]
shape_signature                                           [1, 192, 192, 3]
dtype                                              <class 'numpy.float32'>
quantization                                                      (0.0, 0)
quantization_parameters  {'scales': [], 'zero_points': [], 'quantized_d...
sparsity_parameters                                                     {}


In [6]:
IMG_INPUT_SIZE = tuple(input_details[0]['shape'][1:3])

print(f'Image input size: {IMG_INPUT_SIZE}')

Image input size: (192, 192)


In [7]:
output_details = interpreter.get_output_details()

for detail in output_details:
    print(pd.DataFrame.from_dict(detail, orient='index'))

                                                                         0
name                                                              Identity
index                                                                  279
shape                                                        [1, 2016, 18]
shape_signature                                              [1, 2016, 18]
dtype                                              <class 'numpy.float32'>
quantization                                                      (0.0, 0)
quantization_parameters  {'scales': [], 'zero_points': [], 'quantized_d...
sparsity_parameters                                                     {}
                                                                         0
name                                                            Identity_1
index                                                                  276
shape                                                         [1, 2016, 1]
shape_signature          

In [8]:
def preprocess_img(frame: MatLike):
    # Original Size
    shape = np.r_[frame.shape]

    # Calculate Padding
    pad = (shape.max() - shape[:2]).astype('uint32') // 2

    # Pad Image
    img_pad = np.pad(frame, ((pad[0], pad[0]), (pad[1], pad[1]), (0, 0)), 'constant')

    # Resize Image
    img_small = cv2.resize(img_pad, IMG_INPUT_SIZE)
    
    # Set as contiguous array (speed up)
    img_small = np.ascontiguousarray(img_small)

    img_norm = (img_small / 255).astype('float32')
    
    return img_pad, img_norm, pad

In [9]:
def run_inference(img_norm):
    # Add Batch dimension
    img_input = np.expand_dims(img_norm, axis=0)

    # Set input tensor
    interpreter.set_tensor(input_details[0]['index'], img_input)

    # Run Inference
    interpreter.invoke()

    # Get Output
    """
    output_reg shape is [number of anchors, 18]
    Second dimension 0 - 4 are bounding box offset, width and height: dx, dy, w ,h
    Second dimension 4 - 18 are 7 hand keypoint x and y coordinates: x1,y1,x2,y2,...x7,y7
    """
    output_reg = interpreter.get_tensor(output_details[0]['index'])[0]

    """
    output_clf shape is [number of anchors]
    it is the classification score if there is a hand for each anchor box
    """
    output_clf = interpreter.get_tensor(output_details[1]['index'])[0]

    return output_reg, output_clf

In [10]:
def process_output(output_reg, output_clf):
    def _sigm(x):
      return 1 / (1 + np.exp(-x) )
    
    def _get_triangle(self, kp0, kp2, dist=1):
      """get a triangle used to calculate Affine transformation matrix"""

      # 90° rotation matrix used to create the alignment trianlge
      R90 = np.r_[[[0,1],[-1,0]]]

      dir_v = kp2 - kp0
      dir_v /= np.linalg.norm(dir_v)

      dir_v_r = dir_v @ R90.T
      return np.float32([kp2, kp2+dir_v*dist, kp2 + dir_v_r*dist])

    # Get probabilities
    probabilities = _sigm(output_clf)
    detection_mask = (probabilities > DETECTION_THRESHOLD).flatten()
    candidate_detect = output_reg[detection_mask]
    candidate_anchors = anchors[detection_mask]
    probabilities = probabilities[detection_mask]

    if candidate_detect.shape[0] == 0:
      return None, None, None

    # Pick the best bounding box with non maximum suppression
    # the boxes must be moved by the corresponding anchor first
    moved_candidate_detect = candidate_detect.copy()
    moved_candidate_detect[:, :2] = candidate_detect[:, :2] + (candidate_anchors[:, :2] * IMG_INPUT_SIZE[0])
    box_ids = non_max_suppression_fast(moved_candidate_detect[:, :4], probabilities)

    # Pick the first detected hand. Could be adapted for multi hand recognition
    box_ids = box_ids[0]

    # bounding box offsets, width and height
    dx,dy,w,h = candidate_detect[box_ids, :4][0]
    # center_wo_offst = candidate_anchors[box_ids,:2] * IMG_INPUT_SIZE[0]

    # # 7 initial keypoints
    # keypoints = center_wo_offst + candidate_detect[box_ids,4:].reshape(-1,2)
    # side = max(w,h) * BOX_ENLARGE_FACTOR

    # # now we need to move and rotate the detected hand for it to occupy a
    # # IMG_INPUT_SIZE square
    # # line from wrist keypoint to middle finger keypoint
    # # should point straight up
    # # TODO: replace triangle with the bbox directly
    # source = _get_triangle(keypoints[0], keypoints[2], side)
    # source -= (keypoints[0] - keypoints[2]) * BOX_SHIFT_FACTOR

    # debug_info = {
    #   "detection_candidates": candidate_detect,
    #   "anchor_candidates": candidate_anchors,
    #   "selected_box_id": box_ids,
    # }

    # return source, keypoints, debug_info


In [11]:
cv2.startWindowThread()
cv2.namedWindow(WINDOW)
capture = cv2.VideoCapture(1)

while capture.isOpened():
  hasFrame, frame = capture.read()
  
  if hasFrame:
    img_pad, img_norm, pad = preprocess_img(frame)
    shape = np.r_[img_pad.shape]
    output_reg, output_clf = run_inference(img_norm)
    source, keypoints, debug_info = process_output(output_reg, output_clf)

    cv2.imshow(WINDOW, img_pad)
  
  if cv2.waitKey(10) & 0xFF == ord('q'):
    break

capture.release()
cv2.destroyAllWindows()

# Fix for macos bug when closing window
for i in range(4):
    cv2.waitKey(1)



[ 1.7048548 -3.7318597 21.25844   21.258087 ]
1.7048548


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.

: 