In [1]:
from tensorflow import keras
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import time
from tensorflow.keras.preprocessing import image
import numpy as np
import tkinter as tk
from tkinter import Label,Frame
from PIL import Image, ImageTk
from keras.models import load_model
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D,BatchNormalization
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.regularizers import l2
from tkinter import Label
from textblob import TextBlob


In [2]:

# Create a new Sequential model
model = Sequential()

# Load pre-trained MobileNetV2 model without the top layers
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # Freeze the base model layers

# Add base model, Global Average Pooling, Dropout, and Dense layers
model.add(base_model)
model.add(GlobalAveragePooling2D())
# model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(26, activation='softmax',kernel_regularizer=l2(0.01)))

# Build the model explicitly with the correct input shape
model.build(input_shape=(None, 224, 224, 3))

# Load the weights from the file
model.load_weights('asl_cnn_model_final.keras')


In [65]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [66]:
model.summary()

In [67]:


# Initialize Mediapipe Hand module
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Load trained model
model = model

# Define function to preprocess the white_frame
def preprocess_image(img, target_size=(224, 224)):  # Adjust target_size as per your model input
    # Resize the img to the target size
    img_resized = cv2.resize(img, target_size)
    
    # Convert the resized img to an array
    img_array = image.img_to_array(img_resized)
    
    # Normalize the img array (scale pixel values to the range 0-1)
    img_array = img_array / 255.0
    
    # Expand dimensions to create a batch of size 1
    img_array = np.expand_dims(img_array, axis=0)
    
    return img_array

# Define function to make prediction
def predict_hand_sign(img):
    confidence_threshold=0.6
    # Preprocess the img
    preprocessed_img = preprocess_image(img)
    
    # Make prediction
    prediction = model.predict(preprocessed_img)
    
    # Get the predicted class index
    predicted_class_index = np.argmax(prediction)
    
    # Get the confidence of the predicted class (the highest probability)
    predicted_confidence = prediction[0, predicted_class_index]
    
    # Check if the confidence is above the threshold
    if predicted_confidence >= confidence_threshold:
        # Map the class index to the corresponding letter (A to Z)
        predicted_class = chr(ord('A') + predicted_class_index)
        
        # Return the predicted letter
        return predicted_class
    else:
        # If confidence is below threshold, return None to ignore the prediction
        return None

# Function to detect hand using Mediapipe Hand module and create skeleton frame
def detect_hand(frame):
    # Convert BGR image to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Process frame with Mediapipe Hand module
    results = hands.process(rgb_frame)
    
    # Create white background frame
    white_frame = np.ones_like(frame) * 255
    
    # Draw hand landmarks and connections on original frame
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Draw landmarks
            for landmark in hand_landmarks.landmark:
                # Get landmark position
                x, y = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])
                
                # Draw landmark on original frame
                cv2.circle(frame, (x, y), 4, (0, 0, 255), -1)
            
            # Draw connections between landmarks on original frame
            connections = mp_hands.HAND_CONNECTIONS
            for connection in connections:
                idx1, idx2 = connection
                x1, y1 = int(hand_landmarks.landmark[idx1].x * frame.shape[1]), int(hand_landmarks.landmark[idx1].y * frame.shape[0])
                x2, y2 = int(hand_landmarks.landmark[idx2].x * frame.shape[1]), int(hand_landmarks.landmark[idx2].y * frame.shape[0])
                cv2.line(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
    
     # Assuming results.multi_hand_landmarks is already defined as your hand landmarks
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # First draw all connections with green lines
            connections = mp_hands.HAND_CONNECTIONS
            for connection in connections:
                start_idx, end_idx = connection
                start_landmark = hand_landmarks.landmark[start_idx]
                end_landmark = hand_landmarks.landmark[end_idx]
                start_point = (int(start_landmark.x * white_frame.shape[1]), int(start_landmark.y * white_frame.shape[0]))
                end_point = (int(end_landmark.x * white_frame.shape[1]), int(end_landmark.y * white_frame.shape[0]))
                cv2.line(white_frame, start_point, end_point, (0, 255, 0), 2)

            # Then draw all landmarks with red outline circles on top of the green lines
            for landmark in hand_landmarks.landmark:
                # Get landmark position
                x, y = int(landmark.x * white_frame.shape[1]), int(landmark.y * white_frame.shape[0])

                # Draw landmark as a small red circle (outline only) on top of green lines
                cv2.circle(white_frame, (x, y), 1, (0, 0, 255), 1)  # Increase the thickness as needed

    return frame, white_frame







In [68]:

##Autocorrector
def autocorrect_word(word):
    # Create a TextBlob object
    blob = TextBlob(word)
    
    # Perform autocorrection
    corrected_word = blob.correct()
    
    return str(corrected_word)

In [69]:
# function for text to speech conversion
from tkinter import ttk
import pyttsx3

In [70]:
#text to speech
def text_to_speech(text):
    # Initialize the text-to-speech engine
    engine = pyttsx3.init()
    
    # Set properties such as rate and volume if desired
    engine.setProperty('rate', 150)  # Words per minute
    engine.setProperty('volume', 0.8)  # Volume (0.0 to 1.0)
    
    # Convert the text to speech
    engine.say(text)
    
    # Wait until the speech is finished
    engine.runAndWait()

In [76]:
time_delay=2.0
space_time_delay=4.0

class HandSignRecognitionApp:
    def __init__(self, root):
        self.root = root
        
        # Set the title and size of the main window
        self.root.title("Hand Sign Recognition")
        self.root.geometry("800x600")

        # Create a frame to organize the layout
        self.frame = Frame(self.root)
        self.frame.grid(row=0, column=0, sticky='nsew')

        # Create a label to display the webcam video feed
        self.video_label = Label(self.frame)
        self.video_label.grid(row=0, column=0, padx=10, pady=10, sticky='nsew')

        # Create a label to display the white frame with hand landmarks
        self.landmarks_label = Label(self.frame)
        self.landmarks_label.grid(row=0, column=1, padx=10, pady=10, sticky='nsew')

        # Configure row and column weights to allow resizing
        self.frame.columnconfigure(0, weight=1)
        self.frame.columnconfigure(1, weight=1)
        self.frame.rowconfigure(0, weight=1)

        # Create a label to display the predicted hand sign
        self.prediction_label = Label(self.root, font=('Arial', 20))
        self.prediction_label.grid(row=1, column=0, columnspan=2, pady=10)

        # Open a connection to the webcam (0 is the default camera)
        self.cap = cv2.VideoCapture(0)
        
        # Speak buttons
        self.speak_button = ttk.Button(root, text="Speak", command=self.speak_current_word)
        self.speak_button.grid(row=1, column=20)  # Adjust row and column as needed
        
        #
        self.current_word = ""
        self.last_pred_time = None

        # Start processing the video feed
        self.update()
        
    def speak_current_word(self):
        # Call the text_to_speech function with the current word
        text_to_speech(self.current_word)

    def update(self):
        # Capture a frame from the webcam
        ret, frame = self.cap.read()
        
        if ret:
            # Detect hand and create white frame
            _, white_frame = detect_hand(frame)
            
            # Make prediction on the white frame
            predicted_letter = predict_hand_sign(white_frame)
            
            
            if predicted_letter is not None:
                predicted_letter=predicted_letter.lower()
                current_time = time.time()
                # Calculate elapsed time since last accepted prediction
                if self.last_pred_time is not None:
                    elapsed_time = current_time - self.last_pred_time
                else:
                    elapsed_time = float('inf')  # Set to infinity if last_pred_time is None

                # Check if enough time has passed since the last accepted prediction
                if elapsed_time >= time_delay:
                    # Check if it's time to add a space and autocorrect the word
                    if elapsed_time >= space_time_delay:
                        # Split the current_word into words
                        words = self.current_word.split()
                        if words:
                            # Autocorrect the last word
                            last_word = words.pop()
                            corrected_word = autocorrect_word(last_word)
                            print(corrected_word)
                            # Reassemble the current_word with corrected last word and a space
                            self.current_word = ' '.join(words) + ' ' + corrected_word + ' '
                        else:
                            # Add space if no words
                            self.current_word += ' '
            
                    # Add the predicted letter to current_word
                    self.current_word += predicted_letter
                    print(f"Adding predicted letter: {predicted_letter}")
                
                    # Update the last prediction time
                    self.last_pred_time = current_time
            
            
            # Update the prediction label
            self.prediction_label.config(text=f'current word: {self.current_word}')

            # Convert the webcam video frame to a Tkinter-compatible format
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img_pil = Image.fromarray(frame_rgb)
            img_tk = ImageTk.PhotoImage(img_pil)
            
            # Update the video label with the new frame
            self.video_label.config(image=img_tk)
            self.video_label.image = img_tk
            
            # Convert the white frame to a Tkinter-compatible format
            white_frame_rgb = cv2.cvtColor(white_frame, cv2.COLOR_BGR2RGB)
            white_img_pil = Image.fromarray(white_frame_rgb)
            white_img_tk = ImageTk.PhotoImage(white_img_pil)
            
            # Update the landmarks label with the new white frame
            self.landmarks_label.config(image=white_img_tk)
            self.landmarks_label.image = white_img_tk
        
        # Schedule the update function to run again after a short delay
        self.root.after(30, self.update)

    def on_close(self):
        # Release the webcam when the application is closed
        self.cap.release()
        self.root.destroy()


# Create the main application window
root = tk.Tk()

# Create an instance of the application class
app = HandSignRecognitionApp(root)

# Set up a callback function to run when the window is closed
root.protocol("WM_DELETE_WINDOW", app.on_close)

# Start the application main loop
root.mainloop()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Adding predicted letter: k
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Adding predicted letter: z
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━