In [99]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Dropout
import os
import xml.etree.ElementTree as ET
from tensorflow.keras.applications import MobileNetV2


In [48]:
image_dir = 'data_set/images/'
annotation_dir = 'data_set/Annotations/'
classes = {"okay":0,"thank you":1,"hello":2}

In [64]:
def sparse_voc_annotations(image_dir, annotation_dir,classes):
    img_paths = []
    class_labels = []
    bboxes = []
    for xml_file in sorted(os.listdir(annotation_dir)):
        if not xml_file.endswith('.xml'):
             continue
            # print(f"Processing {xml_file}")
        tree = ET.parse(os.path.join(annotation_dir, xml_file))
        print(tree)
        root = tree.getroot()
        img_name = root.find('filename').text
        img_path = os.path.join(image_dir, img_name)
        size = root.find('size')
        width = int(size.find('width').text)
        height = int(size.find('height').text)

        objects = root.find('object')
        if object is not None:
            class_name = objects.find('name').text
            if class_name not in classes:
                continue
            class_id = classes[class_name]

            bndbox = objects.find('bndbox')
            xmin = float(bndbox.find('xmin').text)
            ymin = float(bndbox.find('ymin').text)
            xmax = float(bndbox.find('xmax').text)
            ymax = float(bndbox.find('ymax').text)
            img_paths.append(img_path)
            class_labels.append(class_id)
            bboxes.append([xmin, ymin, xmax, ymax])
    return img_paths, class_labels, bboxes

In [73]:
img_path,class_names,bbox =sparse_voc_annotations(image_dir, annotation_dir, classes)

<xml.etree.ElementTree.ElementTree object at 0x000001E4C6282C00>
<xml.etree.ElementTree.ElementTree object at 0x000001E4D489AC90>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C5E9B1A0>
<xml.etree.ElementTree.ElementTree object at 0x000001E4D489AC90>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C5E9B1A0>
<xml.etree.ElementTree.ElementTree object at 0x000001E4D489AC90>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C5E9B1A0>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C627EB10>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C5E9B1A0>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C627EB10>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C5E9B1A0>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C627EB10>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C5E9B1A0>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C627EB10>
<xml.etree.ElementTree.ElementTree object at 0x000001E4C5E9B1A0>
<xml.etree.ElementTree.El

In [66]:
img_path =tf.constant(img_path)
class_names = tf.constant(class_names,dtype=tf.int32)
bbox = tf.constant(bbox,dtype=tf.float32)

In [83]:
def load_and_preprocess_image(img_path, bbox,class_names):
    image = tf.io.read_file(img_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image.set_shape( [None, None, 3])
    image = tf.image.resize(image, [224, 224])
    image = tf.cast(image, tf.float32) / 255.0
    class_encoded =tf.one_hot(class_names, depth=len(classes), dtype=tf.int32)
    bbox = tf.convert_to_tensor(bbox, dtype=tf.float32)
    # class_names = tf.convert_to_tensor(class_names, dtype=tf.int32)
    return image,{'bbox': bbox, 'class_names': class_encoded}

dataset = tf.data.Dataset.from_tensor_slices((img_path, bbox, class_names))
dataset = dataset.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.batch(32).prefetch(tf.data.AUTOTUNE)
dataset = dataset.shuffle(buffer_size=70)
   

In [84]:
dataset_size =len(img_path)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)


In [110]:
base_model =MobileNetV2(input_shape=(224, 224, 3), include_top=False, weights='imagenet')
base_model.trainable = False
for layer in base_model.layers:
    layer.trainable = True
# for layer in base_model.layers[-20:]:
#     layer.trainable = True

# base_model.trainable = False
x= base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
output_class = tf.keras.layers.Dense(len(classes), activation='softmax', name='class_names')(x)
output_bbox = tf.keras.layers.Dense(4, activation='linear', name='bbox')(x)
model = tf.keras.Model(inputs=base_model.input, outputs=[output_class, output_bbox])
model.compile(optimizer='adam',
              loss={'class_names': 'categorical_crossentropy', 'bbox': 'mean_squared_error'},
              metrics={'class_names': 'accuracy', 'bbox': 'mse'})
model.summary()

In [None]:
model.fit(train_dataset,
 validation_data=val_dataset, 
 epochs=25,
 callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True,monitor='val_class_names_accuracy',mode='max'),
 tf.keras.callbacks.ModelCheckpoint('sign_language_model.keras', save_best_only=True, monitor='val_class_names_loss',mode='min')],
 verbose=1)

Epoch 1/25


In [106]:
model =tf.keras.models.load_model("sign_language_model.keras")

In [107]:
import cv2



In [108]:
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise Exception("Could not open webcam")

# Capture loop
while True:
    ret, frame = cap.read()
    if not ret or frame is None:
        print(" Frame capture failed.")
        continue

    resized_frame = cv2.resize(frame, (224, 224))
    input_frame = np.expand_dims(resized_frame, axis=0)
    input_frame = tf.cast(input_frame, tf.float32) / 255.0

    predictions = model.predict(input_frame)
    class_idx = np.argmax(predictions[0][0])
    bbox_pred = predictions[1][0]

    class_label = class_names[class_idx]
    cv2.putText(frame, f'Class: {class_label}', (10, 30), 
                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
    cv2.rectangle(frame, 
                  (int(bbox_pred[0]), int(bbox_pred[1])), 
                  (int(bbox_pred[2]), int(bbox_pred[3])), 
                  (0, 255, 0), 2)

    cv2.imshow('Sign Language Recognition', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 987ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 988ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 465ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0