## Importing Libraries

In [1]:
import cv2
import json
import time
import numpy as np
import mediapipe as mp
import tensorflow as tf
from collections import deque
from concurrent.futures import ThreadPoolExecutor

tf.get_logger().setLevel('ERROR')




## MediaPipe Implementation

In [2]:
# Indices of the landmarks kept from each MediaPipe result.
# All 21 hand landmarks are used.
filtered_hand = [*range(21)]
# Subset of pose landmarks: indices 0, 2, 5 and the contiguous run 7..16.
# NOTE(review): per the MediaPipe Pose landmark map these are presumably the
# nose, eyes, ears, mouth corners, shoulders, elbows and wrists - confirm.
filtered_pose = [0, 2, 5, *range(7, 17)]

# Number of landmark rows contributed by one hand / by the pose subset.
HAND_NUM, POSE_NUM = len(filtered_hand), len(filtered_pose)

In [3]:
# Shared MediaPipe detectors, created once and reused for every frame.
hands = mp.solutions.hands.Hands()
pose = mp.solutions.pose.Pose()

def get_all_landmarks(frame):
    """Extract hand and pose landmarks from one frame into a single array.

    The hand and pose detectors run concurrently on two worker threads; each
    writes into its own, non-overlapping slice of the result array.

    Args:
        frame: Image passed unchanged to ``hands.process`` and ``pose.process``
            (presumably RGB, as the caller converts from BGR first - confirm).

    Returns:
        ``np.ndarray`` of shape ``(HAND_NUM * 2 + POSE_NUM, 3)`` holding
        ``(x, y, z)`` per landmark. Rows ``[0, HAND_NUM)`` are the hand whose
        handedness index is 0, rows ``[HAND_NUM, 2 * HAND_NUM)`` the other
        hand, and the remaining rows the pose landmarks selected by
        ``filtered_pose``. Anything that was not detected stays zero.

    Raises:
        Exception: Whatever ``hands.process`` / ``pose.process`` (or the
            landmark copy) raised inside a worker thread is re-raised here
            instead of being silently discarded.
    """
    all_landmarks = np.zeros((HAND_NUM * 2 + POSE_NUM, 3))

    def get_hands(frame):
        results_hands = hands.process(frame)
        if results_hands.multi_hand_landmarks:
            for i, hand_landmarks in enumerate(results_hands.multi_hand_landmarks):
                coords = np.array(
                    [(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark])
                if results_hands.multi_handedness[i].classification[0].index == 0:
                    all_landmarks[:HAND_NUM, :] = coords  # right
                else:
                    all_landmarks[HAND_NUM:HAND_NUM * 2, :] = coords  # left

    def get_pose(frame):
        results_pose = pose.process(frame)
        if results_pose.pose_landmarks:
            all_landmarks[HAND_NUM * 2:HAND_NUM * 2 + POSE_NUM, :] = np.array(
                [(lm.x, lm.y, lm.z) for lm in results_pose.pose_landmarks.landmark])[filtered_pose]

    with ThreadPoolExecutor(max_workers=2) as executor:
        futures = [
            executor.submit(get_hands, frame),
            executor.submit(get_pose, frame),
        ]

    # Leaving the ``with`` block waits for both tasks, but an exception raised
    # in a worker is only stored on its future. Calling ``result()`` re-raises
    # it so a failed detector is not mistaken for "nothing detected".
    for future in futures:
        future.result()

    return all_landmarks

In [4]:
def get_hands_landmarks(frame):
    """Return the landmarks of up to two hands detected in ``frame``.

    Args:
        frame: Image passed unchanged to the shared ``hands`` detector.

    Returns:
        ``np.ndarray`` of shape ``(HAND_NUM * 2, 3)`` with ``(x, y, z)`` per
        landmark. The hand whose handedness index is 0 fills the first
        ``HAND_NUM`` rows (marked "right" by the original author), any other
        hand fills the second ``HAND_NUM`` rows ("left"). Rows of a hand that
        was not detected remain zero.
    """
    detection = hands.process(frame)
    coords = np.zeros((HAND_NUM * 2, 3))

    detected_hands = detection.multi_hand_landmarks
    if not detected_hands:
        return coords

    for idx, one_hand in enumerate(detected_hands):
        handedness_index = detection.multi_handedness[idx].classification[0].index
        # Row offset of the slot this hand is written to.
        offset = 0 if handedness_index == 0 else HAND_NUM
        xyz = np.array([(point.x, point.y, point.z) for point in one_hand.landmark])
        coords[offset:offset + HAND_NUM, :] = xyz

    return coords

## Load Trained Model

In [5]:
# Gesture classifier: a TFLite interpreter plus its cached tensor descriptors.
gest = tf.lite.Interpreter(model_path="gesture_model.tflite")
gest.allocate_tensors()
gest_input, gest_output = gest.get_input_details(), gest.get_output_details()

# Class index (argmax of the model output) -> gesture name.
mapping = dict(enumerate([
    'closedfist',
    'four',
    'openpalm',
    'pointup',
    'three',
    'thumbsdown',
    'thumbsup',
    'victory',
]))

In [6]:
gloss_mapping_path = "590_gloss_mapping.json"
index_gloss_mapping_path = "590_index_gloss_mapping.json"
index_label_mapping_path = "590_index_label_mapping.json"


def _read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is opened in a context manager so the handle is closed as soon
    as parsing finishes (also on error), and it is decoded as UTF-8 rather
    than with the platform-default encoding.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Lookup tables used to turn model output indices into readable labels.
gloss_mapping = _read_json(gloss_mapping_path)
index_gloss_mapping = _read_json(index_gloss_mapping_path)
index_label_mapping = _read_json(index_label_mapping_path)

In [7]:
# Sign-recognition model: build the TFLite interpreter and allocate its
# tensors once, so later cells can feed inputs and read outputs directly.
model_path = "model.tflite"
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

In [8]:
def predict_sign(input_data):
    """Run the sign model on one landmark sequence.

    Args:
        input_data: Array-like for a single sample; a leading batch axis of
            size 1 is added and the data is cast to ``float32`` before it is
            handed to the interpreter.

    Returns:
        The interpreter's first output tensor for this sample.
    """
    batch = np.expand_dims(input_data, axis=0).astype(np.float32)

    input_index = interpreter.get_input_details()[0]['index']
    interpreter.set_tensor(input_index, batch)
    interpreter.invoke()

    output_index = interpreter.get_output_details()[0]['index']
    return interpreter.get_tensor(output_index)

In [9]:
# Tensor shapes of the sign model as plain Python ints, shown as the cell output.
input_shape = [int(dim) for dim in interpreter.get_input_details()[0]['shape']]
output_shape = [int(dim) for dim in interpreter.get_output_details()[0]['shape']]
input_shape, output_shape

([1, 120, 55, 3], [1, 590])

---

## 1. Test Live Feed (Sign & Gesture)

---

In [23]:
# Live webcam demo with two modes:
#   * 'gesture' - classify the static hand gesture in every frame;
#                 a 'victory' gesture switches to sign mode.
#   * 'sign'    - collect landmarks and predict a sign every TIME_PER_STEP
#                 seconds; a 'thumbsup' gesture switches back to gesture mode.
# Press 'q' in the preview window to stop.

MAX_FAILED_READS = 30            # consecutive failed grabs tolerated before giving up
GESTURE_INPUT_SHAPE = (-1, 42, 3)  # batch x (2 hands * 21 landmarks) x (x, y, z)
OVERLAY_STYLE = (cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 240, 0), 2, cv2.LINE_AA)


def classify_gesture(hand_landmarks):
    """Run the gesture model on two-hand landmarks and return the gesture name."""
    batch = np.array(hand_landmarks, dtype=np.float32).reshape(GESTURE_INPUT_SHAPE)
    gest.set_tensor(gest_input[0]['index'], batch)
    gest.invoke()
    scores = gest.get_tensor(gest_output[0]['index'])
    return mapping[int(np.argmax(scores))]


def draw_overlay(frame, kind, label, mode):
    """Draw the current mode and the last recognised label onto ``frame`` in place."""
    cv2.putText(frame, f'recognised {kind} is ({label})', (30, 60), *OVERLAY_STYLE)
    cv2.putText(frame, f'Mode : {mode}', (30, 30), *OVERLAY_STYLE)


def resample_step(frames, length):
    """Linearly resample a list of per-frame landmark arrays to ``length`` frames."""
    frames = np.array(frames)
    source_t = np.linspace(0, 1, frames.shape[0])
    target_t = np.linspace(0, 1, length)
    # Interpolate every landmark coordinate independently along the time axis.
    return np.apply_along_axis(
        lambda track: np.interp(target_t, source_t, track), axis=0, arr=frames)


mode = 'gesture'
text = ''
gesture = ''
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    raise RuntimeError('Could not open the default camera (device 0)')
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Rolling window fed to the sign model, pre-filled with all-zero frames.
sequence = deque(maxlen=input_shape[1])
for _ in range(input_shape[1]):
    sequence.append(np.zeros((input_shape[2], 3)))

step_length = 60                     # frames pushed into `sequence` per prediction
TIME_PER_STEP = step_length / 30.0   # seconds of capture per prediction
step_time = time.time()
step = []                            # landmarks collected since the last prediction
failed_reads = 0

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # Tolerate a few dropped frames, but do not spin forever on a dead camera.
            failed_reads += 1
            if failed_reads >= MAX_FAILED_READS:
                print('Camera stopped delivering frames; exiting.')
                break
            continue
        failed_reads = 0

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_rgb.flags.writeable = False

        # `elif` below: exactly one mode is processed per frame, so a switch
        # made in this frame only takes effect from the next frame on.
        if mode == 'sign':
            all_landmarks = get_all_landmarks(frame_rgb)
            gesture = classify_gesture(all_landmarks[:42, :])
            if gesture == 'thumbsup':
                mode = 'gesture'
                print(f'Mode Switched to {mode}')
            draw_overlay(frame, 'sign', text, mode)

            step.append(all_landmarks)

            if time.time() - step_time >= TIME_PER_STEP:
                sequence.extend(resample_step(step, step_length))
                prediction = predict_sign(np.array(sequence)).reshape(-1).argmax()
                sign = index_label_mapping[str(prediction)]
                print(f'Sign : {sign} and Gesture : {gesture}')
                text = sign
                step_time = time.time()
                step = []
        elif mode == 'gesture':
            gesture = classify_gesture(get_hands_landmarks(frame_rgb))
            text = gesture
            if gesture == 'victory':
                mode = 'sign'
                print(f'Mode Switched to {mode}')
                # Start sign capture from a clean slate; otherwise the stale
                # timer fires at once and a sign is predicted from leftovers.
                step = []
                step_time = time.time()
            draw_overlay(frame, 'gesture', text, mode)
            if mode == 'sign':
                # Do not present the switching gesture as a recognised sign.
                text = ''

        cv2.imshow("Test", frame)
        cv2.setWindowProperty("Test", cv2.WND_PROP_TOPMOST, 1)
        k = cv2.waitKey(1) & 0xFF  # mask: some platforms set higher bits
        if k == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

Mode Switched to sign
Sign : sorry and Gesture : victory
Mode Switched to gesture
Mode Switched to sign
Sign : cheap and Gesture : victory
Mode Switched to gesture
Mode Switched to sign
Sign : seven and Gesture : victory
Sign : pizza and Gesture : thumbsdown
Sign : three and Gesture : thumbsdown
Sign : None and Gesture : thumbsdown
Sign : None and Gesture : thumbsdown
Sign : None and Gesture : thumbsdown
Sign : earn and Gesture : thumbsdown
Sign : arrive and Gesture : openpalm
Sign : money and Gesture : thumbsdown
Sign : buy and Gesture : thumbsdown
Sign : earn and Gesture : thumbsdown
Sign : money and Gesture : openpalm
Sign : park and Gesture : thumbsdown
Sign : become and Gesture : thumbsdown
Sign : bother and Gesture : thumbsdown
Sign : school and Gesture : four
Sign : money and Gesture : thumbsdown
Sign : money and Gesture : thumbsdown
Sign : park and Gesture : openpalm
Sign : friday and Gesture : thumbsdown
Sign : outside and Gesture : thumbsdown
