In [1]:
!pip install mediapipe opencv-python scikit-learn




In [21]:
!pip install pyttsx3

Collecting pyttsx3
  Using cached pyttsx3-2.98-py3-none-any.whl.metadata (3.8 kB)
Collecting comtypes (from pyttsx3)
  Using cached comtypes-1.4.10-py3-none-any.whl.metadata (7.2 kB)
Collecting pypiwin32 (from pyttsx3)
  Using cached pypiwin32-223-py3-none-any.whl.metadata (236 bytes)
Using cached pyttsx3-2.98-py3-none-any.whl (34 kB)
Using cached comtypes-1.4.10-py3-none-any.whl (241 kB)
Using cached pypiwin32-223-py3-none-any.whl (1.7 kB)
Installing collected packages: pypiwin32, comtypes, pyttsx3
Successfully installed comtypes-1.4.10 pypiwin32-223 pyttsx3-2.98


In [17]:
import pickle

# Save the model
# NOTE: `model` is not created in this cell. It is the SVC fitted in the
# data-collection/training cell further down the notebook, so that cell must
# already have been run in the current kernel; on a fresh kernel this cell
# raises NameError.
with open('hand_sign_model2.pkl', 'wb') as f:
    # Serialise the fitted classifier to a file in the current working directory.
    pickle.dump(model, f)


In [3]:
import cv2
import numpy as np
import mediapipe as mp
import pyttsx3
import threading
import time
import queue
from sklearn.svm import SVC

# ---------------------- MediaPipe Setup ----------------------
# Short aliases for the MediaPipe hands solution and its drawing helpers.
mp_draw = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
# Detector used while recording samples; limited to one hand per frame.
hands = mp_hands.Hands(max_num_hands=1)

# ---------------------- Label List (Add "No Gesture") ----------------------
# Gesture classes, recorded in this order during data collection.
labels = [
    'No Gesture',
    'Peace',
    'hello',
    'chill',
    'nice',
]
# Number of landmark samples recorded for a label before auto-advancing.
samples_per_label = 150
# Parallel lists: all_landmarks[i] is the feature row labelled all_labels[i].
all_landmarks, all_labels = [], []

# ---------------------- Speech Engine ----------------------
class SpeechEngine:
    """Text-to-speech worker that speaks queued phrases on a background thread.

    Constructing an instance immediately starts a daemon thread running
    `run`. Callers hand phrases over with `speak`, which only enqueues the
    text, so the caller is never blocked while a phrase is being spoken.
    """

    def __init__(self):
        """Create the phrase queue and start the worker thread."""
        self.queue = queue.Queue()
        self.thread = threading.Thread(target=self.run, daemon=True)
        self.thread.start()

    def speak(self, text):
        """Enqueue `text` to be spoken by the worker thread."""
        self.queue.put(text)

    def run(self):
        """Worker loop: set up pyttsx3 once, then speak queued phrases forever.

        The engine is created inside this method, i.e. on the worker thread.
        The loop has no exit condition; it ends only when the process does
        (the thread is a daemon).
        """
        tts = pyttsx3.init()
        tts.setProperty('rate', 100)

        # Use the first installed voice whose name mentions "zira" or "female";
        # if there is none, the engine's default voice is left in place.
        chosen = next(
            (
                candidate
                for candidate in tts.getProperty('voices')
                if "zira" in candidate.name.lower() or "female" in candidate.name.lower()
            ),
            None,
        )
        if chosen is not None:
            tts.setProperty('voice', chosen.id)

        while True:
            # Blocks until a phrase is available.
            phrase = self.queue.get()
            print(f"🗣️ Speaking: {phrase}")
            tts.say(phrase)
            tts.runAndWait()

# Starts the background speech worker (the constructor launches its thread).
speech = SpeechEngine()

# ---------------------- Data Collection ----------------------
# Records flattened hand-landmark vectors from the webcam into `all_landmarks`
# and the label being recorded into `all_labels`, one entry per detected hand
# per frame, moving through `labels` in order.
cap = cv2.VideoCapture(0)
current_label_index = 0
sample_count = 0
collecting = True

print("👉 Starting data collection...")
print("Press 'n' to move to the next label.")
print("Press 'q' to quit.")

# NOTE(review): recording continues on the very next frame after a label
# advances, so frames captured while the user is still changing pose are
# stored under the new label. Consider pausing between labels.

# try/finally guarantees the webcam, the preview window and the MediaPipe
# graph are released even if the loop raises or the kernel is interrupted;
# otherwise the camera stays locked until the kernel is restarted.
try:
    while collecting:
        ret, frame = cap.read()
        if not ret:
            # No frame available: stop with whatever has been recorded so far.
            break

        # Mirror the preview and convert BGR -> RGB before passing it to MediaPipe.
        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        if results.multi_hand_landmarks:
            for handLms in results.multi_hand_landmarks:
                mp_draw.draw_landmarks(frame, handLms, mp_hands.HAND_CONNECTIONS)

                # Flatten the landmarks to [x0, y0, z0, x1, y1, z1, ...].
                landmarks = [
                    coord
                    for lm in handLms.landmark
                    for coord in (lm.x, lm.y, lm.z)
                ]
                # Only complete 63-value vectors are stored.
                if len(landmarks) == 63:
                    all_landmarks.append(landmarks)
                    all_labels.append(labels[current_label_index])
                    sample_count += 1

                cv2.putText(frame,
                            f"Collecting '{labels[current_label_index]}' - Sample {sample_count}/{samples_per_label}",
                            (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

                # Quota reached: advance to the next label, or finish after the last one.
                if sample_count >= samples_per_label:
                    current_label_index += 1
                    sample_count = 0
                    if current_label_index >= len(labels):
                        collecting = False
                    break

        cv2.imshow("Hand Sign Data Collection", frame)

        # Keep only the low byte of the key code, as the prediction cell does,
        # so the comparison with ord(...) also works when waitKey sets high bits.
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            collecting = False
            break
        elif key == ord('n'):
            # Manual skip to the next label, discarding the current quota count.
            current_label_index += 1
            sample_count = 0
            if current_label_index >= len(labels):
                collecting = False
finally:
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

print("✅ Data collection complete. Training model...")

# ---------------------- Train Model ----------------------
# One row of 63 landmark coordinates per sample, with its gesture label.
X = np.array(all_landmarks)
y = np.array(all_labels)

# A classifier needs samples from at least two gestures. Quitting early (or a
# camera that never delivered frames) leaves zero or one class, which would
# otherwise surface as an opaque error from inside SVC.fit.
if np.unique(y).size < 2:
    raise ValueError(
        f"Need samples for at least 2 gestures to train, got {np.unique(y).size} "
        f"({len(all_labels)} samples in total). Re-run data collection."
    )

# Enable probability for confidence filtering
model = SVC(kernel='linear', probability=True)
model.fit(X, y)
print("✅ Model training complete!")

👉 Starting data collection...
Press 'n' to move to the next label.
Press 'q' to quit.
✅ Data collection complete. Training model...
✅ Model training complete!


In [5]:
# Real-time prediction: classify the hand seen by the webcam with `model`,
# draw the label when the classifier is confident, and speak newly recognised
# gestures through `speech`.

# Track a single hand, like the detector used for data collection: the label
# overlay position and the `last_prediction` state below assume one gesture
# per frame.
hands = mp_hands.Hands(max_num_hands=1,
                       min_detection_confidence=0.5,
                       min_tracking_confidence=0.5)
cap = cv2.VideoCapture(0)
last_prediction = None           # label of the gesture most recently spoken
last_speech_time = time.time()   # wall-clock time of the last spoken phrase

print("🎥 Starting real-time prediction... Press 'q' to quit.")

# try/finally guarantees the webcam, the window and the MediaPipe graph are
# released even if the loop raises or the kernel is interrupted.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the preview and convert BGR -> RGB before passing it to MediaPipe.
        frame = cv2.flip(frame, 1)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()

                if landmarks.shape[0] == 63:
                    # Take the label from the same probability vector as the
                    # confidence. For an SVC with probability=True, predict()
                    # and argmax(predict_proba()) can disagree, so pairing
                    # predict() with max(proba) could threshold one class
                    # while displaying another. This also halves the
                    # per-frame inference work.
                    proba = model.predict_proba([landmarks])[0]
                    best_index = int(np.argmax(proba))
                    confidence = proba[best_index]
                    prediction = model.classes_[best_index]
                    gesture_text = str(prediction)

                    # Show prediction if confident
                    if confidence >= 0.8:  # confidence threshold
                        cv2.putText(frame, f"Gesture: {gesture_text}", (10, 30),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

                        # Speak only if it's a new, confident, intentional gesture
                        # and more than one second has passed since the last phrase.
                        if (gesture_text != last_prediction
                                and gesture_text != "No Gesture"
                                and time.time() - last_speech_time > 1):
                            if " " in gesture_text:
                                # Multi-word label: speak each word as its own sentence.
                                spoken_text = ". ".join(gesture_text.split()) + "."
                            else:
                                spoken_text = f"The gesture is {gesture_text}."

                            speech.speak(spoken_text)
                            # Store the plain label (not the spoken phrase) so the
                            # "new gesture" comparison above keeps working.
                            last_prediction = gesture_text
                            last_speech_time = time.time()

                    mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        cv2.imshow("Real-Time Gesture Prediction with Voice", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

🎥 Starting real-time prediction... Press 'q' to quit.
🗣️ Speaking: The gesture is nice.
🗣️ Speaking: The gesture is Peace.
🗣️ Speaking: The gesture is hello.
🗣️ Speaking: The gesture is Peace.
🗣️ Speaking: The gesture is hello.
🗣️ Speaking: The gesture is Peace.
🗣️ Speaking: The gesture is chill.
