# Dynamic Hand Gestures Recognition Inference

In [None]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow import keras
import time
import mediapipe as mp
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

In [None]:
from huggingface_hub import hf_hub_download

REPO_ID = "Seba213/rgb-dhgr-dataset"
DYNAMIC_MODEL = "saved_model.tflite"
model_path = hf_hub_download(repo_id=REPO_ID, filename=DYNAMIC_MODEL, repo_type="dataset")

Replace with your own path

In [None]:
# Path to the saved model
TFLITE_MODEL_PATH = model_path
# Path to the folder containing the captured frames
CAPTURED_FRAMES = ".../captured_frames"
# Path to the folder containing the cropped frames
CROPPED_FRAMES = ".../cropped_frames"
# Path to the folder containing the resized frames
RESIZED_FRAMES = ".../resized_frames"

Setting the classes labels map

In [None]:
label_mapping = {
    'scroll_down': 0,
    'scroll_left': 1,
    'scroll_right': 2,
    'scroll_up': 3,
    'zoom_in': 4,
    'zoom_out': 5
}

index_to_label = {v: k for k, v in label_mapping.items()}

feature_extractor = keras.applications.InceptionV3(
    weights='imagenet',
    include_top=False,
    pooling='avg'  
)

INCEPTION_SIZE = (75, 75)

In [25]:
def crop_image_sides(image, pixels=20):
    '''
    Crop the input image by a specified number of pixels on both the left and right sides.

    Args:
        image (np.array): The input image.
        pixels (int): The number of pixels to crop from each side.

    Returns:
        np.array: The cropped image.
    '''
    height, width, _ = image.shape
    cropped_image = image[:, pixels:width-pixels]
    return cropped_image

In [26]:
# Load the gesture recognition model as a TFLite model
interpreter = tf.lite.Interpreter(model_path=TFLITE_MODEL_PATH)
interpreter.allocate_tensors()
# Get input and output tensor details from the TFLite model
input_details = interpreter.get_input_details()    # Expected shape: [1, 10, 2048]
output_details = interpreter.get_output_details()  # Expected shape: [1, num_classes]

In [None]:
cap  = cv2.VideoCapture(0) 
# Setting the size of the frame
cap_width = 160
cap_height = 120

cap.set(cv2.CAP_PROP_FRAME_WIDTH, cap_width)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, cap_height)

if not cap.isOpened():
    raise RuntimeError("Could not open webcam.")

countdown_time = 5  
start_time = time.time()
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    frame = cv2.flip(frame, 1)  # Mirror the frame

    elapsed = time.time() - start_time
    remaining = int(countdown_time - elapsed) + 1  
    if remaining <= 0:
        break

    cv2.putText(frame, f"Recording starts in {remaining}...", (50, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3, cv2.LINE_AA)
    cv2.imshow("Webcam", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

print("Recording gesture...")

cv2.putText(frame, "Recording gesture...", (50, 50),
            cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3, cv2.LINE_AA)

frames_captured = 0
captured_frames = []  # List to store the raw frames

while frames_captured < 40:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        continue

    frame = cv2.flip(frame, 1)  # Mirror the frame

    cv2.imshow("Webcam", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

    captured_frames.append(frame.copy())
    cv2.imwrite(f"{CAPTURED_FRAMES}/frame_{frames_captured}.png", frame)

    frames_captured += 1

cap.release()
cv2.destroyAllWindows()

selected_frames = captured_frames[::4]  # Take every 4th frame

features_list = []

for idx, frame in enumerate(selected_frames):
    # Crop the frame to remove the sides
    cropped_image = crop_image_sides(frame, pixels=80)
    cv2.imwrite(f"{CROPPED_FRAMES}/frame_{idx}.png", cropped_image)
    # Resize frame for InceptionV3
    frame_resized = cv2.resize(cropped_image, INCEPTION_SIZE)
    cv2.imwrite(f"{RESIZED_FRAMES}/frame_{idx}.png", frame_resized)

    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)

    image_tensor = tf.convert_to_tensor(frame_rgb, dtype=tf.float32)
    print(f"Preprocessed frame shape: {image_tensor.shape}")

    preprocessed_frame = np.expand_dims(image_tensor, axis=0)

    features = feature_extractor.predict(preprocessed_frame)
    print(f"Extracted features shape: {features.shape}")

    features_list.append(features[0])

if not features_list:
    raise ValueError("No valid frames were processed.")

features_array = np.array(features_list, dtype='float32')
print("Captured features shape:", features_array.shape)

input_for_lstm = np.expand_dims(features_array, axis=0)

# Inference
input_data = input_for_lstm.astype(input_details[0]['dtype'])
interpreter.set_tensor(input_details[0]['index'], input_data)
interpreter.invoke()
output_data = interpreter.get_tensor(output_details[0]['index'])
predicted_probabilities = output_data[0]

print("Predicted probabilities:")
for i, prob in enumerate(predicted_probabilities):
    print(f"{index_to_label[i]}: {prob:.4f}")


Requested FPS: 5
Actual FPS: 30.0
Recording gesture...
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━