In [31]:
import cv2
import mediapipe as mp
import numpy as np
import os
from collections import defaultdict

# --- Gesture sample collection ------------------------------------------------
# Streams frames from the default webcam, tracks hands with MediaPipe and, on a
# labelled key press (1-4), stores the flattened [x, y, z] landmark vector of the
# first detected hand together with its integer gesture label.
# Samples are appended to any previously saved landmarks.npy / labels.npy and
# written back when the loop ends (including on error or kernel interrupt).

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

gesture_map = {"left":0, "right":1, "up":2, "down":3}

# Keyboard key code -> gesture name used to label the current frame.
key_to_gesture = {
    ord('1'): "left",
    ord('2'): "right",
    ord('3'): "up",
    ord('4'): "down",
}

# Resume from an earlier session only when both files are present.
if os.path.exists("landmarks.npy") and os.path.exists("labels.npy"):
    data = list(np.load("landmarks.npy").tolist())
    labels = list(np.load("labels.npy").tolist())
else:
    data, labels = [], []

# Rebuild the per-gesture totals from the labels loaded above.
label_to_gesture = {value: name for name, value in gesture_map.items()}
counter = defaultdict(int)
for lbl in labels:
    name = label_to_gesture.get(lbl)
    if name is not None:
        counter[name] += 1

hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7)
cap = cv2.VideoCapture(0)

print("Press 1=Left, 2=Right, 3=Up, 4=Down, q=Quit")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the image so on-screen movement matches the user's movement.
        frame = cv2.flip(frame, 1)

        # MediaPipe is fed RGB; OpenCV delivers BGR.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        # Landmark vector of the first detected hand in this frame (if any).
        # Only this hand is recorded so that one key press yields one sample.
        sample = None
        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                if sample is None:
                    sample = [
                        coord
                        for lm in hand_landmarks.landmark
                        for coord in (lm.x, lm.y, lm.z)
                    ]
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Overlay the running sample count of every gesture.
        y0 = 30
        for i, g in enumerate(gesture_map.keys()):
            cv2.putText(frame, f"{g}: {counter[g]}", (10, y0 + i*30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow("Hand Tracking", frame)

        # Poll the keyboard exactly once per frame. Polling in several places
        # lets one call consume a key press that another call was waiting for
        # (e.g. 'q' being swallowed by the labelling poll).
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break

        gesture = key_to_gesture.get(key)
        if gesture is not None and sample is not None:
            data.append(sample)
            labels.append(gesture_map[gesture])
            counter[gesture] += 1
            print(f"Saved {gesture.upper()} swipe sample → Total:", counter[gesture])
finally:
    try:
        # Save collected data even when the loop was interrupted.
        np.save("landmarks.npy", np.array(data))
        np.save("labels.npy", np.array(labels))
        print("Data saved! Totals:", dict(counter))
    finally:
        # Always free the webcam, the MediaPipe graph and the preview window.
        cap.release()
        hands.close()
        cv2.destroyAllWindows()


Press 1=Left, 2=Right, 3=Up, 4=Down, q=Quit
Saved LEFT swipe sample → Total: 1
Saved LEFT swipe sample → Total: 2
Saved LEFT swipe sample → Total: 3
Saved LEFT swipe sample → Total: 4
Saved LEFT swipe sample → Total: 5
Saved LEFT swipe sample → Total: 6
Saved LEFT swipe sample → Total: 7
Saved LEFT swipe sample → Total: 8
Saved LEFT swipe sample → Total: 9
Saved LEFT swipe sample → Total: 10
Saved LEFT swipe sample → Total: 11
Saved LEFT swipe sample → Total: 12
Saved LEFT swipe sample → Total: 13
Saved LEFT swipe sample → Total: 14
Saved LEFT swipe sample → Total: 15
Saved LEFT swipe sample → Total: 16
Saved LEFT swipe sample → Total: 17
Saved LEFT swipe sample → Total: 18
Saved LEFT swipe sample → Total: 19
Saved LEFT swipe sample → Total: 20
Saved LEFT swipe sample → Total: 21
Saved LEFT swipe sample → Total: 22
Saved LEFT swipe sample → Total: 23
Saved LEFT swipe sample → Total: 24
Saved LEFT swipe sample → Total: 25
Saved LEFT swipe sample → Total: 26
Saved LEFT swipe sample → Tot

In [35]:
import numpy as np

# Sanity-check the saved dataset: array shapes and number of samples per label.
X, y = np.load("landmarks.npy"), np.load("labels.npy")

for array_name, array in (("X", X), ("y", y)):
    print(f"{array_name} shape:", array.shape)

# Count the samples carrying each of the four integer labels 0..3.
class_counts = {}
for class_id in range(4):
    class_counts[class_id] = (y == class_id).sum()
print("Class distribution:", class_counts)

X shape: (400, 63)
y shape: (400,)
Class distribution: {0: np.int64(100), 1: np.int64(100), 2: np.int64(100), 3: np.int64(100)}


In [39]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
   ---------------------------------------- 0.0/11.2 MB ? eta -:--:--
   ----------------------- ---------------- 6.6/11.2 MB 33.6 MB/s eta 0:00:01
   ---------------------------------------- 11.2/11.2 MB 33.2 MB/s eta 0:00:00
Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
   ---------------------------------------- 0.0/46.2 MB ? eta -:--:--
   -------- ------------------------------- 9.4/46.2 MB 41.9 MB/s eta 0:00:01
   ----------

In [43]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the recorded landmark vectors (X) and their integer gesture labels (y).
X, y = (np.load(path) for path in ("landmarks.npy", "labels.npy"))

print("Dataset shape: ", X.shape, y.shape)

# Hold out 20% of the samples for evaluation; stratify so that every gesture
# keeps the same share in both splits, and fix the seed for repeatability.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# MLP configuration: two hidden layers (128 and 64 units), ReLU, Adam.
mlp_params = dict(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)
mlp = MLPClassifier(**mlp_params)
mlp.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = mlp.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Persist the fitted classifier for later use.
joblib.dump(mlp, "gesture_mlp.pkl")
print("Model saved as gesture_mlp.pkl")

Dataset shape:  (400, 63) (400,)
Accuracy: 0.9875

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97        20
           1       1.00      1.00      1.00        20
           2       0.95      1.00      0.98        20
           3       1.00      1.00      1.00        20

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80

Model saved as gesture_mlp.pkl


## Implementation: real-time gesture recognition from the webcam

In [59]:
import cv2
import mediapipe as mp
import numpy as np
import joblib

# --- Live gesture recognition ---------------------------------------------------
# Streams frames from the default webcam, extracts the flattened [x, y, z]
# landmark vector of every detected hand and shows the gesture predicted by the
# saved classifier. Press 'q' in the preview window to stop.

# NOTE(review): joblib.load unpickles arbitrary Python objects — only load a
# model file that you produced yourself or otherwise trust.
model = joblib.load("gesture_mlp.pkl")

# Integer class label -> gesture name shown on screen.
gesture_map = {0:"left", 1:"right", 2: "up", 3:"down"}

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(min_detection_confidence = 0.7, min_tracking_confidence = 0.7)

cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the image so on-screen movement matches the user's movement.
        frame = cv2.flip(frame, 1)
        # MediaPipe is fed RGB; OpenCV delivers BGR.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        # multi_hand_landmarks is falsy when no hand was detected.
        detected_hands = result.multi_hand_landmarks or []
        for hand_index, hand_landmarks in enumerate(detected_hands):
            landmarks = [
                coord
                for lm in hand_landmarks.landmark
                for coord in (lm.x, lm.y, lm.z)
            ]

            # The classifier expects a 2-D array: one row per sample.
            features = np.array(landmarks).reshape(1,-1)

            pred = model.predict(features)[0]
            # Fall back to the raw label if the model returns an unmapped class.
            gesture_name = gesture_map.get(pred, str(pred))

            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Give every hand its own text row; drawing all labels at the same
            # position makes them overlap when more than one hand is visible.
            cv2.putText(frame, f"Gesture: {gesture_name}",
                        (10, 50 + 40 * hand_index), cv2.FONT_HERSHEY_SIMPLEX,
                        1, (0,255,0), 2, cv2.LINE_AA)

        cv2.imshow("Gesture Recognition", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the webcam, the MediaPipe graph and the preview window,
    # even when the loop ends with an exception or a kernel interrupt.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()