# In [None]:
import cv2
import numpy as np
import tensorflow as tf
import pyttsx3
import mediapipe as mp
import time
import math

# Initialize MediaPipe Hands.
# `hands` is the shared detector used by detect_and_process_hand(); it is
# configured to report at most one hand (max_num_hands=1).
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

# Class names for hand gestures.
# predict_hand_gesture() indexes this list with the argmax of the model output,
# so the order presumably has to match the label order used in training — TODO confirm.
class_names = ['hello','i love you','man','namaste','no','okay','please','thank you','welcome','yes']

# Settings
imgsize = 300  # Side length (pixels) of the square white canvas the hand crop is placed on
offset = 20  # Padding (pixels) added around the hand bounding box before cropping
gesture_gap = 1.0  # Minimum time between two gesture predictions (in seconds)
pause_duration = 3.0  # Time since the last accepted gesture after which the sentence is spoken (in seconds)

# Initialize text-to-speech engine (used by speak_text)
engine = pyttsx3.init()

# Function to predict hand gesture from an image
def predict_hand_gesture(model, img, class_names, input_size=(64, 64)):
    """Classify a hand image and return the name of the most likely gesture.

    Args:
        model: Object exposing ``predict(batch)``; in this file it is the
            Keras model loaded with ``tf.keras.models.load_model``.
        img: Image array accepted by ``cv2.resize`` (the letterboxed hand
            image, or any other OpenCV image).
        class_names: Sequence of labels, indexed by the position of the
            highest score in the model output.
        input_size: ``(height, width)`` the model expects. Defaults to the
            64x64 size that was previously hard-coded.

    Returns:
        The entry of ``class_names`` selected by the argmax of the prediction.
    """
    img_height, img_width = input_size
    # cv2.resize takes dsize as (width, height), not (height, width).
    resized = cv2.resize(img, (img_width, img_height))
    # Add the batch dimension and divide by 255 to normalize pixel values.
    img_array = np.expand_dims(resized, axis=0) / 255.0

    prediction = model.predict(img_array)
    predicted_label = class_names[np.argmax(prediction)]
    return predicted_label

# Function to convert text to speech
def speak_text(text: str) -> None:
    """Speak ``text`` aloud through the module-level pyttsx3 ``engine``.

    The function returns only after ``engine.runAndWait()`` has returned, so
    the caller is blocked for the duration of that call.
    """
    engine.say(text)  # Hand the text to the engine
    engine.runAndWait()  # Run the engine until it returns

def _paste_on_white_square(crop, size):
    """Scale ``crop`` to fit a ``size`` x ``size`` white canvas, keeping its aspect ratio."""
    crop_h, crop_w = crop.shape[:2]
    canvas = np.ones((size, size, 3), np.uint8) * 255

    if crop_h / crop_w > 1:
        # Taller than wide: fill the full height and center horizontally.
        scale = size / crop_h
        # Clamp: float rounding must never produce a side larger than the canvas.
        new_w = min(size, math.ceil(scale * crop_w))
        resized = cv2.resize(crop, (new_w, size))
        left = math.ceil((size - new_w) / 2)
        canvas[:, left:left + new_w] = resized
    else:
        # Wider than tall (or square): fill the full width and center vertically.
        scale = size / crop_w
        # For a square crop ``scale * crop_h`` can land a hair above ``size``
        # after rounding; ceil would then overshoot the canvas by one pixel.
        new_h = min(size, math.ceil(scale * crop_h))
        resized = cv2.resize(crop, (size, new_h))
        top = math.ceil((size - new_h) / 2)
        canvas[top:top + new_h, :] = resized

    return canvas


# Function to detect hand, draw landmarks, crop, and resize the hand region
def detect_and_process_hand(frame):
    """Detect a hand in ``frame`` and return it centered on a square white image.

    The frame is annotated in place: the landmarks and the padded bounding box
    are drawn on it before the crop is taken, so the returned crop contains
    those drawings as well.

    Args:
        frame: BGR image as read from ``cv2.VideoCapture``; modified in place
            when a hand is found.

    Returns:
        ``(image, True)`` where ``image`` is an ``imgsize`` x ``imgsize`` white
        canvas holding the aspect-preserved hand crop, or ``(frame, False)``
        when no hand is detected or the padded bounding box has no area.
    """
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB (required by MediaPipe)
    results = hands.process(frame_rgb)  # Process the frame to detect hands

    if not results.multi_hand_landmarks:
        return frame, False  # Return the original frame if no hand is detected

    h, w, _ = frame.shape
    for hand_landmarks in results.multi_hand_landmarks:
        # Draw landmarks on the frame
        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Bounding box of the landmarks in pixel coordinates. The seeds keep
        # the box anchored to the frame even if landmarks fall outside it.
        x_min, x_max = w, 0
        y_min, y_max = h, 0
        for lm in hand_landmarks.landmark:
            x, y = int(lm.x * w), int(lm.y * h)
            x_min, x_max = min(x, x_min), max(x, x_max)
            y_min, y_max = min(y, y_min), max(y, y_max)

        # Expand the bounding box slightly for a better crop, clipped to the frame
        x_min = max(0, x_min - offset)
        y_min = max(0, y_min - offset)
        x_max = min(w, x_max + offset)
        y_max = min(h, y_max + offset)

        # A box without area cannot be cropped or resized (and would divide
        # by zero below); treat it like "no hand" for this detection.
        if x_max <= x_min or y_max <= y_min:
            continue

        # Draw the bounding box on the original frame
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        # Crop the hand region and letterbox it onto the square canvas
        cropped_hand = frame[y_min:y_max, x_min:x_max]
        return _paste_on_white_square(cropped_hand, imgsize), True

    return frame, False  # No usable hand region was found

# Function to capture video and process each frame
def process_video(model, class_names):
    """Run the webcam loop: detect gestures, build a sentence and speak it.

    Each frame is passed to ``detect_and_process_hand``. When a hand is found
    and more than ``gesture_gap`` seconds have passed since the last accepted
    gesture, the hand image is classified and the label is appended to the
    current sentence. Once more than ``pause_duration`` seconds pass without a
    new gesture, the collected labels are joined with spaces, spoken through
    ``speak_text`` and the sentence is cleared.

    The loop ends when a frame cannot be read or when 'q' is pressed in the
    preview window.

    Args:
        model: Model forwarded to ``predict_hand_gesture``.
        class_names: Labels forwarded to ``predict_hand_gesture``.

    Returns:
        None. Prints an error and returns immediately if the webcam cannot be
        opened.
    """
    cap = cv2.VideoCapture(0)  # Capture video from webcam

    if not cap.isOpened():
        print("Error: Could not open video stream.")
        return

    sentence = []
    # -inf lets the very first detected hand be accepted right away.
    last_gesture_time = float("-inf")

    try:
        while True:
            ret, frame = cap.read()  # Capture frame-by-frame
            if not ret:
                break

            # Detect hand and get the processed hand image
            processed_hand, hand_detected = detect_and_process_hand(frame)

            # Intervals are measured with the monotonic clock so that system
            # clock adjustments cannot stretch or shrink the gaps.
            if hand_detected and time.monotonic() - last_gesture_time > gesture_gap:
                # Predict gesture on the processed hand
                predicted_label = predict_hand_gesture(model, processed_hand, class_names)
                print(f'Predicted hand gesture: {predicted_label}')

                # Add to sentence list
                sentence.append(predicted_label)
                last_gesture_time = time.monotonic()  # Update the last gesture time

                # Display the frame with predicted label (optional)
                cv2.putText(frame, predicted_label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)

            # If no new gesture is detected after the pause duration, form the sentence and speak it
            if sentence and time.monotonic() - last_gesture_time > pause_duration:
                sentence_str = ' '.join(sentence)
                speak_text(sentence_str)
                print(f'Sentence spoken: {sentence_str}')

                sentence = []  # Clear the sentence after speaking

            # Display the frame with the landmarks and bounding box
            cv2.imshow('Hand Gesture Detection', frame)

            # Press 'q' to quit the video capture
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close windows even if the loop raised
        cap.release()
        cv2.destroyAllWindows()

# Example usage
# Load the trained Keras model from disk before passing it to process_video.
# NOTE(review): the path below is an absolute, machine-specific Windows
# location — adjust it for the environment this runs in.
model = tf.keras.models.load_model("C:\\Users\\kumar\\OneDrive\\Desktop\\model\\MODEL_3.keras")

# Run the webcam loop with the loaded model; it returns when the camera cannot
# be opened, a frame cannot be read, or 'q' is pressed.
process_video(model, class_names)
