In [2]:
import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
from collections import deque

# -------------------------
# Configuration
# -------------------------
MODEL_PATH = "asl_final_model.h5"
IMG_SIZE = 128  # side length (pixels) of the square model input; must match training
# 26 letters followed by the three control classes. The position of each
# entry is used as the class index when decoding predictions later in this
# cell, so the order is assumed to match the model's output -- TODO confirm.
CLASSES = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ['del', 'nothing', 'space']

# -------------------------
# Classifier
# -------------------------
model = load_model(MODEL_PATH)

# -------------------------
# MediaPipe hand tracker
# -------------------------
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,       # video stream mode
    max_num_hands=1,               # at most one hand per frame
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)

# -------------------------
# Sentence state and camera
# -------------------------
sentence = ""
# One predicted label per processed frame; a letter is committed once the
# queue holds 15 entries.
letter_queue = deque(maxlen=15)
cap = cv2.VideoCapture(0)  # device index 0

print("Press 'q' to quit.")

# Main capture loop.
#
# Per frame: mirror the image, locate a hand with MediaPipe, crop a padded
# bounding box around it, classify the crop, and append the label to
# `letter_queue`. When the queue is full the most frequent label is committed
# to `sentence` and the queue is reset.
#
# The try/finally guarantees that the camera, the MediaPipe graph and the
# preview window are released even if the cell is interrupted or a frame
# raises; otherwise the webcam stays locked until the kernel restarts.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)  # horizontal mirror
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        if result.multi_hand_landmarks:
            h, w = frame.shape[:2]
            for hand_landmarks in result.multi_hand_landmarks:
                # Landmark coordinates are normalised; scale to pixels.
                x_coords = [lm.x for lm in hand_landmarks.landmark]
                y_coords = [lm.y for lm in hand_landmarks.landmark]

                # Pad the tight landmark box and clamp it to the frame.
                margin = 20
                x_min = max(int(min(x_coords) * w) - margin, 0)
                y_min = max(int(min(y_coords) * h) - margin, 0)
                x_max = min(int(max(x_coords) * w) + margin, w)
                y_max = min(int(max(y_coords) * h) + margin, h)

                label, conf = None, None
                hand_img = frame[y_min:y_max, x_min:x_max]
                if hand_img.size > 0:
                    # Classify BEFORE anything is drawn on `frame`, so the
                    # landmark overlay never ends up in the model input.
                    # NOTE(review): the crop is in OpenCV's BGR channel order;
                    # confirm that matches how the model was trained.
                    hand_img = cv2.resize(hand_img, (IMG_SIZE, IMG_SIZE))
                    hand_img = hand_img.astype("float32") / 255.0
                    hand_img = np.expand_dims(hand_img, axis=0)

                    pred = model.predict(hand_img, verbose=0)
                    label = CLASSES[int(np.argmax(pred))]
                    conf = float(np.max(pred))

                    letter_queue.append(label)

                    # Commit the most frequent label once the queue is full.
                    if len(letter_queue) == letter_queue.maxlen:
                        # Iterating the deque (not a set) makes ties resolve
                        # to the earliest-seen label on every run.
                        final_letter = max(letter_queue, key=letter_queue.count)
                        if final_letter == "space":
                            sentence += " "
                        elif final_letter == "del":
                            sentence = sentence[:-1]
                        elif final_letter != "nothing":
                            sentence += final_letter
                        letter_queue.clear()  # start collecting the next letter

                # Overlays are drawn only after the crop has been consumed.
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                if label is not None:
                    # Keep the caption on-screen when the hand touches the top edge.
                    text_y = max(y_min - 10, 25)
                    cv2.putText(frame, f"Letter: {label} ({conf:.2f})", (x_min, text_y),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Display sentence
        cv2.putText(frame, f"Sentence: {sentence}", (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 0, 0), 2)

        cv2.imshow("ASL Recognition", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


Press 'q' to quit.


In [4]:
import os
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
from collections import deque
import time

# ======================
# 1. LOAD MODEL
# ======================
MODEL_PATH = "asl_final_model.h5"
model = tf.keras.models.load_model(MODEL_PATH)

# Class labels are the sorted sub-folder names of the training dataset.
# Only directories are kept: a stray file in the dataset root (for example
# .DS_Store or a README) would otherwise be sorted into the list and shift
# every class index that follows it.
# NOTE(review): this assumes training derived its class indices from the same
# sorted folder names -- confirm against the training notebook.
dataset_path = "asl_alphabet_train"  # your dataset folder
class_labels = sorted(
    entry for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
)
print("✅ Classes:", class_labels)

IMG_SIZE = 128  # side length (pixels) of the square image fed to the model

# ======================
# 2. MEDIAPIPE SETUP
# ======================
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7
)

# ======================
# 3. SETTINGS
# ======================
frame_buffer = deque(maxlen=15)   # smooth prediction over the last 15 accepted frames
sentence = ""
CONF_THRESHOLD = 0.7              # predictions below this confidence are ignored

# ======================
# 4. PREDICTION FUNCTION
# ======================
def predict_hand(frame, bbox):
    """Classify the region of ``frame`` described by ``bbox``.

    Args:
        frame: Image array indexed as ``frame[row, col, ...]``.
        bbox: ``(x, y, w, h)`` tuple in pixels: left, top, width, height.

    Returns:
        ``(label, confidence)`` where ``label`` is taken from the module-level
        ``class_labels`` and ``confidence`` is the model score of that class
        as a float. Returns ``(None, None)`` when the box has a non-positive
        size or does not overlap the frame.
    """
    x, y, w, h = bbox
    if w <= 0 or h <= 0:
        return None, None

    # Clamp the box to the frame. Without this, a negative x or y would be
    # interpreted by NumPy as an index from the end and silently crop the
    # wrong region instead of the requested one.
    frame_h, frame_w = frame.shape[:2]
    x0, y0 = max(x, 0), max(y, 0)
    x1, y1 = min(x + w, frame_w), min(y + h, frame_h)
    if x1 <= x0 or y1 <= y0:
        return None, None

    hand_img = frame[y0:y1, x0:x1]
    if hand_img.size == 0:
        return None, None

    # NOTE(review): the crop keeps the caller's channel order (BGR when the
    # frame comes straight from OpenCV) -- confirm that matches training.
    hand_img = cv2.resize(hand_img, (IMG_SIZE, IMG_SIZE))
    hand_img = hand_img.astype("float32") / 255.0   # scale pixel values to [0, 1]
    hand_img = np.expand_dims(hand_img, axis=0)     # add the batch dimension

    preds = model.predict(hand_img, verbose=0)
    class_id = int(np.argmax(preds))
    confidence = float(preds[0][class_id])
    return class_labels[class_id], confidence

# ======================
# 5. WEBCAM LOOP
# ======================
cap = cv2.VideoCapture(0)
# perf_counter is monotonic and high-resolution, which suits frame timing.
prev_time = time.perf_counter()

print("Press 'q' to quit.")

# Main capture loop.
#
# Per frame: locate a hand with MediaPipe, classify a padded crop around it
# with predict_hand, and push sufficiently confident labels into
# `frame_buffer`. When the buffer is full the most frequent label is committed
# to `sentence` and the buffer is reset.
#
# The try/finally guarantees that the camera, the MediaPipe graph and the
# preview window are released even if the cell is interrupted or a frame
# raises; otherwise the webcam stays locked until the kernel restarts.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        if results.multi_hand_landmarks:
            h, w = frame.shape[:2]
            for hand_landmarks in results.multi_hand_landmarks:
                # Landmark coordinates are normalised; scale to pixels.
                xs = [lm.x for lm in hand_landmarks.landmark]
                ys = [lm.y for lm in hand_landmarks.landmark]

                # Pad the tight landmark box by 20 px and clamp to the frame.
                x_min = max(0, int(min(xs) * w) - 20)
                y_min = max(0, int(min(ys) * h) - 20)
                x_max = min(w, int(max(xs) * w) + 20)
                y_max = min(h, int(max(ys) * h) + 20)

                bbox = (x_min, y_min, x_max - x_min, y_max - y_min)

                label, conf = predict_hand(frame, bbox)

                if label is not None and conf >= CONF_THRESHOLD:
                    frame_buffer.append(label)

                    # Take majority vote when buffer is full
                    if len(frame_buffer) == frame_buffer.maxlen:
                        # Iterating the deque (not a set) makes ties resolve
                        # to the earliest-seen label on every run.
                        final_label = max(frame_buffer, key=frame_buffer.count)

                        if final_label == "space":
                            sentence += " "
                        elif final_label == "del":
                            sentence = sentence[:-1]
                        elif final_label != "nothing":
                            sentence += final_label

                        frame_buffer.clear()

                    # Overlays are drawn after prediction so they never
                    # appear in the crop passed to the model.
                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    # Keep the caption on-screen when the box touches the top edge.
                    text_y = max(y_min - 10, 25)
                    cv2.putText(frame, f"{label} ({conf:.2f})", (x_min, text_y),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show sentence
        cv2.putText(frame, f"Sentence: {sentence}", (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 0, 0), 2)

        # FPS counter. Two consecutive clock reads can be equal, so guard the
        # division instead of letting ZeroDivisionError kill the loop.
        curr_time = time.perf_counter()
        elapsed = curr_time - prev_time
        fps = 1 / elapsed if elapsed > 0 else 0.0
        prev_time = curr_time
        cv2.putText(frame, f"FPS: {int(fps)}", (10, 90),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)

        cv2.imshow("ASL Detection", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


✅ Classes: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']
Press 'q' to quit.
