# In [3]:
import cv2
from cvzone.HandTrackingModule import HandDetector
from cvzone.ClassificationModule import Classifier
import numpy as np
import math
import tensorflow as tf
import time
import os
import google.generativeai as genai
from gtts import gTTS 
import platform

# Load the model
# NOTE(review): `model` is not referenced anywhere else in the visible code, and
# the same file is passed to `Classifier` below — this load may be redundant;
# confirm before removing.
model = tf.keras.models.load_model("Model/keras_model.h5")

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Webcam (device 0), single-hand detector and gesture classifier used by the
# capture loop below.
cap = cv2.VideoCapture(0)
detector = HandDetector(maxHands=1)
classifier = Classifier("Model/keras_model.h5", "Model/labels.txt")
# Padding (pixels) added around the detected hand bounding box when cropping.
offset = 20
# Side length (pixels) of the square white canvas handed to the classifier.
imgSize = 300

# Display names looked up with the class index returned by the classifier.
# Presumably the order must match Model/labels.txt — TODO confirm.
labels = ["peace", "good luck", "again", "no", "help", "yes", "hello", "busy", "happy", "i", "you", "bye"]

# Initialize an empty list to store the predictions
predictions = []

# Set the duration and interval for capturing gestures
duration = 10  # Total duration in seconds
interval = 3  # Capture interval in seconds
  
# Start time (reference point for the elapsed-time checks in the capture loop)
start_time = time.time()

# Google Generative AI configuration.
# The API key is taken from the GOOGLE_API_KEY environment variable so that no
# secret has to be written into the source file. When the variable is unset
# the key falls back to the empty string, exactly as before.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# Sampling parameters passed to the generative model.
generation_config = {
    "temperature": 0.9,
    "top_p": 1,
    "top_k": 0,
    # "max_output_tokens": 20,
    "response_mime_type": "text/plain",
}

# Content-filter thresholds. Defined for convenience but currently NOT passed
# to the model (see the commented-out argument below).
safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
]

# Model handle used by get_meaningful_sentence().
genai_model = genai.GenerativeModel(
    model_name="gemini-1.0-pro",
    # safety_settings=safety_settings,
    generation_config=generation_config,
)

def get_meaningful_sentence(predictions):
    """Ask the generative model to build a sentence from gesture keywords.

    Args:
        predictions: The recognised keywords; the value is interpolated
            into the prompt as-is.

    Returns:
        The text of the model's reply with surrounding whitespace removed.
    """
    request = f"I am giving you the following keywords: {predictions}, frame a logical and sensible sentence used in every day life."
    # A fresh chat with empty history is opened for every request.
    reply = genai_model.start_chat(history=[]).send_message(request)
    sentence = reply.text
    return sentence.strip()

def text_to_speech(text):
    """Synthesise *text* as English speech and play it.

    The audio is written to ``output.mp3`` in the current working directory
    and then handed to a platform-specific shell command. On platforms other
    than Windows, macOS and Linux the file is saved but nothing is played.

    Args:
        text: The sentence to speak.
    """
    gTTS(text=text, lang='en').save("output.mp3")

    # Shell command used to play the file, keyed by platform.system().
    players = {
        "Windows": "start output.mp3",
        "Darwin": "afplay output.mp3",  # macOS
        "Linux": "mpg321 output.mp3",
    }
    command = players.get(platform.system())
    if command is not None:
        os.system(command)

# Main capture loop: for `duration` seconds, read webcam frames, classify the
# detected hand gesture, show an annotated preview and store one prediction
# per sampling second (every `interval` seconds).

# Whole second (since start_time) at which a prediction was last stored.
# Used to store at most one prediction per sampling second without pausing
# the loop, so the preview keeps updating.
last_capture_second = None

try:
    while True:
        # Check if the total duration has passed
        elapsed_time = time.time() - start_time
        if elapsed_time > duration:
            break

        success, img = cap.read()
        if not success or img is None:
            # No frame available (camera busy or disconnected): retry until
            # the duration expires instead of crashing on a missing image.
            time.sleep(0.01)
            continue

        imgOutput = img.copy()
        hands, img = detector.findHands(img)
        if hands:
            hand = hands[0]
            x, y, w, h = hand['bbox']

            # Padded bounding box, clamped to the frame on all four sides.
            frame_h, frame_w = img.shape[:2]
            top = max(0, y - offset)
            bottom = max(0, min(y + h + offset, frame_h))
            left = max(0, x - offset)
            right = max(0, min(x + w + offset, frame_w))
            imgCrop = img[top:bottom, left:right]

            # A bbox that is degenerate or lies outside the frame yields an
            # empty crop, which cannot be resized; skip such frames.
            if w > 0 and h > 0 and imgCrop.size > 0:
                imgWhite = np.ones((imgSize, imgSize, 3), np.uint8) * 255

                # Scale the crop so its longer side equals imgSize and centre
                # it on the white square canvas.
                if h / w > 1:
                    k = imgSize / h
                    wCal = math.ceil(k * w)
                    imgResize = cv2.resize(imgCrop, (wCal, imgSize))
                    wGap = math.ceil((imgSize - wCal) / 2)
                    imgWhite[:, wGap: wCal + wGap] = imgResize
                else:
                    k = imgSize / w
                    hCal = math.ceil(k * h)
                    imgResize = cv2.resize(imgCrop, (imgSize, hCal))
                    hGap = math.ceil((imgSize - hCal) / 2)
                    imgWhite[hGap: hCal + hGap, :] = imgResize

                prediction, index = classifier.getPrediction(imgWhite, draw=False)

                # Capture the prediction every `interval` seconds, at most
                # once within the same second.
                current_second = int(elapsed_time)
                if current_second % interval == 0 and current_second != last_capture_second:
                    predictions.append(labels[index])
                    last_capture_second = current_second

                # Label banner above the hand, label text, and the bbox.
                cv2.rectangle(imgOutput, (x - offset, y - offset - 70), (x - offset + 400, y - offset + 60 - 50), (0, 255, 0), cv2.FILLED)
                cv2.putText(imgOutput, labels[index], (x, y - 30), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 0), 2)
                cv2.rectangle(imgOutput, (x - offset, y - offset), (x + w + offset, y + h + offset), (0, 255, 0), 4)

        cv2.imshow('Image', imgOutput)
        cv2.waitKey(1)
finally:
    # Release the video capture object and close all OpenCV windows, even if
    # an error interrupted the loop.
    cap.release()
    cv2.destroyAllWindows()

# Use the predictions to generate a meaningful sentence and convert it to speech
print(predictions)
if predictions:
    text = get_meaningful_sentence(predictions)
    print(text)
    if text:
        text_to_speech(text)
    else:
        # An empty reply cannot be synthesised; skip playback.
        print("The model returned no text; nothing to speak.")
else:
    # No gesture was recognised during the capture window, so there are no
    # keywords to build a sentence from.
    print("No gestures were captured; nothing to speak.")





# Sample output from one run:
# ['i', 'bye', 'bye']
# "I bid you farewell, goodbye."
