In [1]:
import cv2
import mediapipe as mp
import numpy as np

2025-06-11 10:59:10.787315: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-11 10:59:11.145049: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-11 10:59:11.323916: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-11 10:59:11.377336: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-11 10:59:11.709101: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Shortcuts to the MediaPipe Hands solution and its drawing helpers.
mp_hands = mp.solutions.hands
# Bind a local name instead of assigning an attribute on the imported
# `mediapipe` module (the original `mp.drawing = ...` mutated the package).
mp_drawing = mp.solutions.drawing_utils
# Shared detector instance used by process_image(): configured for still
# images, at most two hands per image, detection confidence threshold 0.5.
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)

I0000 00:00:1749632356.384037    8573 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1749632356.392254   10882 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: AMD Radeon 780M (radeonsi, gfx1103_r1, LLVM 19.1.1, DRM 3.61, 6.11.0-26-generic)


In [3]:
import numpy as np

def get_gesture(
    landmarks,
    *,
    fist_max_index_dist=0.25,
    palm_min_finger_dist=0.35,
    like_min_thumb_dist=0.4,
    ok_max_pinch_dist=0.2,
    call_min_dist=0.3,
):
    """Classify a hand pose from its landmarks with simple geometric rules.

    Args:
        landmarks: Indexable sequence of points exposing ``.x`` and ``.y``.
            Indices 0 (wrist), 4 (thumb tip), 9 (middle MCP) and the
            PIP/DIP/tip triples 6-8, 10-12, 14-16, 18-20 are read, so at
            least 21 entries are required (``IndexError`` otherwise).
        fist_max_index_dist: Upper bound on the normalized wrist-to-index-tip
            distance for the "fist" rule.
        palm_min_finger_dist: Lower bound that every normalized
            wrist-to-fingertip distance must exceed for the "palm" rule.
        like_min_thumb_dist: Lower bound on the normalized wrist-to-thumb-tip
            distance for the "like" rule.
        ok_max_pinch_dist: Upper bound on the normalized thumb-tip-to-index-tip
            distance for the "ok" rule.
        call_min_dist: Lower bound on the normalized wrist-to-thumb-tip and
            wrist-to-pinky-tip distances for the "call" rule.

    Returns:
        One of "fist", "palm", "like", "peace", "ok", "call" or "unknown".
        Rules are evaluated in that order and the first match wins.

    Notes:
        * All distances are divided by the wrist -> middle-MCP length, so the
          thresholds are expressed in multiples of that length, not in image
          coordinates.
        * NOTE(review): with that normalization the default
          ``fist_max_index_dist`` of 0.25 requires the index tip to be within
          a quarter of the wrist -> middle-MCP length from the wrist. The
          defaults are kept unchanged for backward compatibility; tune them
          through the keyword arguments.
        * "Extended"/"curled" only compare ``y`` values, i.e. they assume the
          fingers point towards smaller ``y`` (upwards in image coordinates).
    """

    def distance(a, b):
        # Euclidean distance in the (x, y) plane; any z component is ignored.
        return np.linalg.norm(np.array([a.x, a.y]) - np.array([b.x, b.y]))

    wrist = landmarks[0]
    thumb_tip = landmarks[4]
    middle_mcp = landmarks[9]

    # (tip, dip, pip) per finger.
    fingers = {
        "index": (landmarks[8], landmarks[7], landmarks[6]),
        "middle": (landmarks[12], landmarks[11], landmarks[10]),
        "ring": (landmarks[16], landmarks[15], landmarks[14]),
        "pinky": (landmarks[20], landmarks[19], landmarks[18]),
    }

    # Reference length used for normalization; the epsilon avoids a division
    # by zero when wrist and middle MCP coincide.
    ref_len = distance(wrist, middle_mcp) + 1e-6

    def norm_distance(a, b):
        return distance(a, b) / ref_len

    d_thumb = norm_distance(wrist, thumb_tip)
    tip_dist = {name: norm_distance(wrist, tip) for name, (tip, _dip, _pip) in fingers.items()}
    thumb_index_dist = norm_distance(thumb_tip, fingers["index"][0])

    # A finger is "extended" when its tip is above its PIP joint, and "curled"
    # when both tip and DIP are below the PIP joint (image y grows downwards).
    extended = {name: tip.y < pip.y for name, (tip, _dip, pip) in fingers.items()}
    curled = {
        name: tip.y > pip.y and dip.y > pip.y
        for name, (tip, dip, pip) in fingers.items()
    }
    all_curled = all(curled.values())
    all_extended = all(extended.values())

    # === Gesture logic (first matching rule wins) ===

    # Fist: all fingers curled and index tip close to the wrist.
    if all_curled and tip_dist["index"] < fist_max_index_dist:
        return "fist"

    # Palm: all fingers extended and every fingertip far enough from the wrist.
    if all_extended and min(tip_dist.values()) > palm_min_finger_dist:
        return "palm"

    # Like: thumb away from the wrist, all other fingers curled.
    if d_thumb > like_min_thumb_dist and all_curled:
        return "like"

    # Peace: index and middle extended, ring and pinky curled.
    if extended["index"] and extended["middle"] and curled["ring"] and curled["pinky"]:
        return "peace"

    # OK: thumb and index tips close together, middle finger extended.
    if thumb_index_dist < ok_max_pinch_dist and extended["middle"]:
        return "ok"

    # Call me: thumb and pinky away from the wrist, the other three curled.
    if (
        d_thumb > call_min_dist
        and tip_dist["pinky"] > call_min_dist
        and curled["index"]
        and curled["middle"]
        and curled["ring"]
    ):
        return "call"

    return "unknown"

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [4]:
def process_image(image_path):
    """Read an image from disk and return the gesture label detected in it.

    The image is run through the shared `hands` detector. For each detected
    hand, the hand region is cropped and resized, the detector is run again on
    that close-up, and the gesture is classified from the close-up landmarks
    when the second pass finds a hand, otherwise from the first-pass landmarks.

    Returns:
        "no hands" when nothing is detected; otherwise the label returned by
        get_gesture() for the LAST hand in the detection list (earlier hands
        are overwritten).

    Raises:
        ValueError: if the file cannot be read as an image.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError("Image not found or unable to read.")

    # OpenCV loads BGR; the detector is fed RGB.
    detection = hands.process(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    if not detection.multi_hand_landmarks:
        return "no hands"

    label = "no hands"
    for detected_hand in detection.multi_hand_landmarks:
        # Second pass on a zoomed-in crop of this hand.
        close_up = crop_and_resize_hand(bgr, detected_hand.landmark)
        second_pass = hands.process(cv2.cvtColor(close_up, cv2.COLOR_BGR2RGB))

        if second_pass.multi_hand_landmarks:
            chosen = second_pass.multi_hand_landmarks[0]
        else:
            # Fall back to the landmarks from the full image.
            chosen = detected_hand
        label = get_gesture(chosen.landmark)

    return label

In [5]:
def crop_and_resize_hand(image, landmarks, size=(256, 256), margin=40):
    """Crop the landmarks' bounding box (plus a margin) and resize it.

    Args:
        image: Array whose first two dimensions are (height, width).
        landmarks: Iterable of points with ``.x`` / ``.y`` expressed as
            fractions of the image width / height.
        size: Target size passed to ``cv2.resize``.
        margin: Padding in pixels added on every side of the bounding box.

    Returns:
        The resized crop, or the unmodified ``image`` when there are no
        landmarks or the clamped bounding box is empty.

    NOTE(review): the crop is resized to a fixed ``size`` regardless of its
    own aspect ratio, so non-square crops are stretched.
    """
    h, w = image.shape[:2]
    xs = [lm.x * w for lm in landmarks]
    ys = [lm.y * h for lm in landmarks]
    if not xs:
        return image  # nothing to crop around

    def clamp(value, upper):
        # Keep every bound inside [0, upper]. Without the lower clamp on the
        # max_* bounds, a hand lying entirely left of / above the image gives
        # a negative stop index, which NumPy interprets as "from the end" and
        # silently crops an unrelated region.
        return int(min(max(value, 0), upper))

    min_x = clamp(min(xs) - margin, w)
    max_x = clamp(max(xs) + margin, w)
    min_y = clamp(min(ys) - margin, h)
    max_y = clamp(max(ys) + margin, h)

    cropped = image[min_y:max_y, min_x:max_x]
    if cropped.size == 0:
        return image  # fallback if crop failed
    return cv2.resize(cropped, size)

In [6]:
# Root directory of the dataset; main() joins each class folder name onto it.
# NOTE(review): `__path__` is a dunder name that Python's import system gives
# special meaning on packages. A regular constant name (e.g. DATASET_DIR)
# would be clearer -- rename it together with its use in main().
__path__ = "dataset/hagrid-sample-500k-384p/hagrid_500k"

from tqdm import tqdm

def main(dataset_dir=None):
    """Evaluate process_image() on every class folder and print accuracies.

    For each ``train_val_<label>`` folder, every ``.jpg`` / ``.png`` file is
    classified and compared against ``<label>``. Per-class and overall counts
    and accuracies are printed.

    Args:
        dataset_dir: Directory containing the class folders. Defaults to the
            module-level ``__path__`` value.

    Raises:
        ValueError: propagated from process_image() when a file cannot be
            read as an image.
    """
    import os

    if dataset_dir is None:
        dataset_dir = __path__

    prefix = "train_val_"
    folders = ["train_val_call", "train_val_fist", "train_val_like", "train_val_ok", "train_val_palm", "train_val_peace"]

    def accuracy(hits, total):
        # An empty folder (or empty dataset) used to raise ZeroDivisionError.
        return f"{hits / total * 100:.2f}%" if total else "n/a"

    counter = 0
    correct = 0

    for folder in folders:
        # The expected label is the folder name without the common prefix.
        label = folder[len(prefix):]
        folder_path = os.path.join(dataset_dir, folder)
        image_files = [f for f in os.listdir(folder_path) if f.endswith((".jpg", ".png"))]

        counter_label = 0
        correct_label = 0
        for f in tqdm(image_files, desc=f"Processing {label}", unit="img"):
            gesture = process_image(os.path.join(folder_path, f))
            if gesture == label:
                correct_label += 1
            counter_label += 1

        counter += counter_label
        correct += correct_label

        print(f"\nTotal images for {label}: {counter_label}")
        print(f"Correct gestures for {label}: {correct_label}")
        print(f"Accuracy for {label}: {accuracy(correct_label, counter_label)}\n")

    print(f"Total images processed: {counter}")
    print(f"Correct gestures: {correct}")
    print(f"Accuracy: {accuracy(correct, counter)}")

if __name__ == "__main__":
    main()


W0000 00:00:1749632356.438124   10858 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1749632356.468076   10864 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
Processing call:   0%|          | 0/28193 [00:00<?, ?img/s]W0000 00:00:1749632356.527751   10871 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
Processing call: 100%|██████████| 28193/28193 [32:45<00:00, 14.34img/s]



Total images for call: 28193
Correct gestures for call: 3987
Accuracy for call: 14.14%



Processing fist: 100%|██████████| 27764/27764 [30:10<00:00, 15.33img/s]



Total images for fist: 27764
Correct gestures for fist: 2
Accuracy for fist: 0.01%



Processing like: 100%|██████████| 27721/27721 [32:02<00:00, 14.42img/s]



Total images for like: 27721
Correct gestures for like: 6293
Accuracy for like: 22.70%



Processing ok: 100%|██████████| 27999/27999 [30:08<00:00, 15.48img/s]



Total images for ok: 27999
Correct gestures for ok: 14969
Accuracy for ok: 53.46%



Processing palm: 100%|██████████| 28326/28326 [37:23<00:00, 12.63img/s] 



Total images for palm: 28326
Correct gestures for palm: 23666
Accuracy for palm: 83.55%



Processing peace: 100%|██████████| 28303/28303 [29:13<00:00, 16.14img/s]


Total images for peace: 28303
Correct gestures for peace: 22107
Accuracy for peace: 78.11%

Total images processed: 168306
Correct gestures: 71024
Accuracy: 42.20%



