In [1]:
# pip install mediapipe opencv-python numpy

In [2]:
# pip install scikit-learn tensorflow

<p>min_detection_confidence = 0.7</p>
<ul>
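  <li>This is the minimum score the detector must reach before a hand is reported. Separately, if the tracking confidence drops below <strong>0.5</strong> (<code>min_tracking_confidence</code>), MediaPipe will stop tracking that hand until it’s detected again.</li>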
  <li><span style="color:green;">Higher values (e.g., 0.8)</span> → more accuracy, but the hand may “disappear” more often if detection isn’t strong.</li>
  <li><span style="color:orange;">Lower values (e.g., 0.3)</span> → more tolerant, but increases the chance of false positives or “ghost” hands.</li>
</ul>

<h2>multi_hand_landmarks</h2>
<p>Contains the landmarks of every hand it found. Since we only allow 1 hand, this will hold <strong>0 or 1</strong> hands (it is <code>None</code>, not an empty list, when no hand is found).</p>

<h2>How MediaPipe Hand Landmarks Work</h2>
<p>MediaPipe gives us 21 points per hand, each with:</p>
<ul>
  <li><code>x</code> = horizontal position (0 = left, 1 = right)</li>
  <li><code>y</code> = vertical position (0 = top, 1 = bottom)</li>
  <li><code>z</code> = depth (not important for this part)</li>
</ul>

<p>The points are numbered in a fixed order (point 0 is the wrist, which is why the table below lists only 20 of the 21 points):</p>
<table border="1" cellpadding="5" style="border-collapse:collapse;">
  <tr>
    <th>Finger</th>
    <th>Indices</th>
    <th>Tip</th>
  </tr>
  <tr><td>Thumb</td><td>[1, 2, 3, 4]</td><td>4</td></tr>
  <tr><td>Index</td><td>[5, 6, 7, 8]</td><td>8</td></tr>
  <tr><td>Middle</td><td>[9, 10, 11, 12]</td><td>12</td></tr>
  <tr><td>Ring</td><td>[13, 14, 15, 16]</td><td>16</td></tr>
  <tr><td>Pinky</td><td>[17, 18, 19, 20]</td><td>20</td></tr>
</table>

<p><img src="img.ppm.png" alt="MediaPipe Hand Landmarks" style="max-width:100%;border:1px solid #ccc;border-radius:4px;"></p>


<h1><i>Code</i></h1>

In [5]:
import cv2
import mediapipe as mp #landmark detection
import math
import os # In order to turn off noisy log messages

# Set TensorFlow's native log threshold to "3" to cut down console noise.
# NOTE(review): this runs after `import mediapipe`, and the recorded cell output
# still shows INFO/WARNING lines -- confirm the variable has any effect here.
os.environ.update(TF_CPP_MIN_LOG_LEVEL="3")

# MediaPipe handles: the hands solution module and its drawing helpers.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# A single hand-tracking instance reused for every frame:
#   max_num_hands=1              -> at most one hand is reported per frame
#   min_detection_confidence=0.7 -> threshold for the initial detection
#   min_tracking_confidence=0.5  -> threshold for following an already detected hand
hands = mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.7, min_tracking_confidence=0.5)

# Open capture device 0 (the webcam).
cap = cv2.VideoCapture(0)

def distance(point1, point2):
    """Return the straight-line 2D distance between two points.

    Uses the Pythagorean formula sqrt((x1 - x2)^2 + (y1 - y2)^2), e.g. to
    measure how far apart the thumb tip and the index tip are.

    Args:
        point1: Object exposing numeric ``x`` and ``y`` attributes.
        point2: Object exposing numeric ``x`` and ``y`` attributes.

    Returns:
        The distance as a float. Only ``x`` and ``y`` are read; any other
        attribute of the points is ignored.
    """
    dx = point1.x - point2.x
    dy = point1.y - point2.y
    return math.sqrt(dx ** 2 + dy ** 2)

# Ordered log of gesture changes. A label is appended only when it differs from
# the previous frame's label, so holding one pose still records it just once.
gesture_history = []
prev_gesture = None  # label of the previous frame; None before the first frame

# Thumb-tip to index-tip distance (normalized image coordinates) below which
# the pose is labelled "OK".
OK_PINCH_THRESHOLD = 0.05
ESC_KEY = 27  # key code that ends the capture loop
HISTORY_ON_SCREEN = 5  # how many of the latest history entries are drawn


def _classify_gesture(lm):
    """Map one hand's landmarks to a gesture label using simple geometric rules.

    A finger counts as folded when its tip is lower on the image (larger ``y``)
    than the joint compared against it; the thumb is compared against
    landmark 3 and handled separately because it bends differently.

    Args:
        lm: Indexable sequence of the hand's landmarks; each item exposes
            ``x`` and ``y`` attributes (``y`` grows towards the bottom).

    Returns:
        "Thumbs Up", "Peace", "Fist", "Open Hand" or "OK" for the first rule
        that matches (checked in that order), or ``None`` if none matches.
    """
    thumb_tip = lm[4]
    index_tip = lm[8]
    middle_tip = lm[12]
    ring_tip = lm[16]
    pinky_tip = lm[20]

    index_folded = index_tip.y > lm[6].y
    middle_folded = middle_tip.y > lm[10].y
    ring_folded = ring_tip.y > lm[14].y
    pinky_folded = pinky_tip.y > lm[18].y
    thumb_up = thumb_tip.y < lm[3].y
    thumb_folded = thumb_tip.y > lm[3].y

    if thumb_up and index_folded and middle_folded and ring_folded and pinky_folded:
        return "Thumbs Up"
    if not index_folded and not middle_folded and ring_folded and pinky_folded:
        return "Peace"
    if index_folded and middle_folded and ring_folded and pinky_folded and thumb_folded:
        return "Fist"
    # NOTE(review): "Open Hand" requires the thumb tip NOT to be above landmark 3,
    # and the recorded history never contains it -- confirm this rule is intended.
    if not index_folded and not middle_folded and not ring_folded and not pinky_folded and not thumb_up:
        return "Open Hand"
    if distance(thumb_tip, index_tip) < OK_PINCH_THRESHOLD:
        return "OK"
    return None


# The loop runs until ESC is pressed or a frame cannot be read. It is wrapped in
# try/finally so the camera, the preview window and the MediaPipe instance are
# released even when the cell is interrupted or an exception is raised mid-loop.
try:
    while True:
        ret, frame = cap.read()  # grab one frame from the camera
        if not ret:
            print("Failed to grab frame")  # stop when no frame is delivered
            break

        frame = cv2.flip(frame, 1)  # mirror horizontally so it feels like a mirror
        # OpenCV delivers BGR by default while MediaPipe expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        # Default label; it is also kept when a hand is found but no rule matches.
        # NOTE(review): confirm "No Hand" is the intended label for that second case.
        gesture = "No Hand"

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw dots for the joints and the lines connecting them.
                mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                detected = _classify_gesture(hand_landmarks.landmark)
                if detected is not None:
                    gesture = detected

        # Record the gesture only if it differs from the previous frame's.
        if gesture != prev_gesture:
            gesture_history.append(gesture)
            prev_gesture = gesture

        # Current gesture at the top-left of the frame.
        cv2.putText(frame, f'Gesture: {gesture}', (10,50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)

        # Most recent history entries, one per row below the current gesture.
        for i, g in enumerate(gesture_history[-HISTORY_ON_SCREEN:]):
            cv2.putText(frame, g, (10, 100 + i*30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,200,255), 2)

        cv2.imshow("Hand Gesture Recognition", frame)  # separate preview window

        if cv2.waitKey(1) & 0xFF == ESC_KEY:  # ESC to quit
            break
finally:
    cap.release()  # turn off the camera
    cv2.destroyAllWindows()  # close the preview window
    hands.close()  # release MediaPipe resources

# Print full gesture history
print("Gesture History in Order:")
print(gesture_history)


I0000 00:00:1755110093.120357  646763 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1755110093.137765  646953 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755110093.144709  646953 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755110094.676286  646956 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


Gesture History in Order:
['No Hand', 'OK', 'No Hand', 'OK', 'No Hand', 'OK', 'No Hand', 'Thumbs Up', 'No Hand', 'Thumbs Up', 'No Hand', 'Thumbs Up', 'No Hand', 'Thumbs Up', 'No Hand', 'Thumbs Up', 'Fist', 'No Hand', 'Thumbs Up', 'No Hand', 'Thumbs Up', 'No Hand', 'Thumbs Up', 'No Hand', 'OK', 'Thumbs Up', 'Peace', 'No Hand', 'Thumbs Up', 'Fist', 'Thumbs Up', 'Fist', 'Thumbs Up', 'Fist', 'Thumbs Up', 'Fist', 'Thumbs Up', 'No Hand', 'Fist', 'Thumbs Up', 'No Hand', 'OK', 'No Hand', 'OK', 'No Hand', 'OK', 'No Hand', 'OK', 'No Hand', 'OK', 'No Hand', 'OK', 'No Hand', 'OK', 'No Hand', 'Thumbs Up', 'Peace', 'Thumbs Up', 'Fist', 'Thumbs Up', 'Fist', 'Thumbs Up', 'Fist', 'No Hand', 'Fist', 'Thumbs Up', 'No Hand', 'Peace', 'Thumbs Up', 'No Hand']
