In [1]:
import cv2
import numpy as np
import pandas as pd
import mediapipe as mp
from utils import draw_landmarks, calculate_landmarks
import time
import itertools
import copy
import torch

In [2]:
# MediaPipe Hands tracker, created once and reused for every frame.
# It is configured to track a single hand (max_num_hands=1).
mp_hands_sol = mp.solutions.hands
mp_hands = mp_hands_sol.Hands(
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5,
)

# TorchScript gesture classifier. It is pinned to the CPU because its output is
# converted with `.numpy()` by classify_pose, which only works for CPU tensors;
# without map_location a checkpoint exported on a GPU machine would be restored
# onto that GPU (or fail to load where no GPU exists).
gesture_recognizer = torch.jit.load('gesture_recognizer.pt', map_location='cpu')
# Inference only: disable any training-time behaviour (e.g. dropout) the
# exported module may contain.
gesture_recognizer.eval()

In [3]:
def get_landmarks(image, keypoints):
    """Return the landmark list of the detected hand, or ``[]`` if there is none.

    Args:
        image: Frame the detection was run on; forwarded unchanged to
            ``calculate_landmarks``.
        keypoints: Detection result exposing ``multi_hand_landmarks``, which is
            ``None`` or a sequence with one entry per detected hand.

    Returns:
        Whatever ``calculate_landmarks`` produces for the last detected hand, or
        an empty list when no hand was detected.
    """
    detected_hands = keypoints.multi_hand_landmarks
    if not detected_hands:
        return []
    # Only one landmark list is returned; if several hands are present the last
    # one wins. Handedness is not needed for that, so it is not read at all.
    return calculate_landmarks(image, detected_hands[-1])

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [4]:
def preprocess_landmarks(landmark_list):
    """Turn landmarks into a flat, origin-relative, scale-normalised feature list.

    The first landmark is used as the origin: its x, y and z are subtracted from
    the first three components of every landmark. The points are then flattened
    into one list and divided by the largest absolute value in that list.

    Args:
        landmark_list: Sequence of points, each indexable with at least three
            components (x, y, z). The input is not modified.

    Returns:
        A flat list of floats. An empty input yields ``[]``. If every value is
        zero after re-centring (for example a single landmark), the zeros are
        returned as-is instead of dividing by zero.
    """
    if len(landmark_list) == 0:
        return []

    origin = landmark_list[0]
    base_x, base_y, base_z = origin[0], origin[1], origin[2]

    flat = []
    for point in landmark_list:
        flat.extend((point[0] - base_x, point[1] - base_y, point[2] - base_z))
        # Components beyond x, y, z are carried over unshifted.
        flat.extend(point[3:])

    max_value = max(map(abs, flat))
    if max_value == 0:
        # Nothing to scale; avoid a ZeroDivisionError.
        return [0.0] * len(flat)

    return [value / max_value for value in flat]

In [5]:
# Gesture labels; classify_pose indexes this list with the argmax of the model output.
# NOTE(review): the order presumably has to match the class indices the model was
# trained with — confirm against the training code before reordering.
poses = ['HOLD', 'GRAB', 'FIST', 'INDEX', 'PEACE', 'OK']

In [6]:
def classify_pose(landmarks):
    """Classify a hand pose from its landmarks.

    Args:
        landmarks: Landmark list as accepted by ``preprocess_landmarks``.

    Returns:
        The entry of ``poses`` selected by the argmax of the model output, or
        the string ``'none'`` when ``landmarks`` is empty.
    """
    if len(landmarks) == 0:
        return 'none'

    features = preprocess_landmarks(landmarks)
    # The model is fed one extra trailing feature, always 0 here.
    # NOTE(review): its meaning is not visible in this file — confirm against
    # the training code.
    features.append(0)
    model_input = torch.from_numpy(np.array(features))

    # Pure inference: skip autograd bookkeeping on the per-frame forward pass.
    with torch.no_grad():
        scores = gesture_recognizer(model_input).detach().numpy()

    return poses[np.argmax(scores)]

In [7]:
def capture_pose(length=5):
    """Show a live webcam preview annotated with the recognised hand pose.

    Frames are read from camera 0. For each frame the hand landmarks are
    detected, drawn, classified, and the label is overlaid before the frame is
    shown in a window named ``'frame'``. The loop ends when 'q' is pressed or a
    frame cannot be read.

    Args:
        length: Currently unused; kept so existing callers keep working.

    Returns:
        None.
    """
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 800)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 600)
    try:
        while True:
            ret, image = cap.read()
            # Stop before touching the frame: a failed read yields no usable image.
            if not ret:
                break

            # OpenCV delivers BGR frames while MediaPipe expects RGB input, so
            # convert a copy for detection and keep the BGR frame for display.
            hand_kpts = mp_hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            landmarks = get_landmarks(image, hand_kpts)
            image = draw_landmarks(image, landmarks)
            current_pose = classify_pose(landmarks)
            cv2.putText(image, f'Pose: {current_pose}', (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
            cv2.imshow('frame', image)

            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the preview window, even if
        # processing a frame raised.
        cap.release()
        cv2.destroyAllWindows()

In [8]:
# Start the live webcam demo; press 'q' in the preview window to stop.
capture_pose()