Sign Language Image Classification with Vision Transformers

In [7]:
import sys

# Root of the project checkout; added to sys.path so that modules living in the
# project directory can be imported from this notebook.
PROJECT_ROOT = "/home/studio-lab-user/projects/sign-language-image-detection"

# Guarded so that re-running this cell does not keep appending duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [8]:
# Install required libraries for ViT and other dependencies
!pip install seaborn              
!pip install transformers        
!pip install tensorflow           
!pip install datasets             
!pip install matplotlib           
!pip install Pillow               
!pip install numpy                
!pip install tensorflow-hub      
!pip install scikit-learn       
!pip install tqdm               




Load dataset

In [12]:
from PIL import Image

# Sanity check on the raw data: open one sample from the "A" class folder.
sample_path = "/home/studio-lab-user/projects/sign-language-image-detection/sign_datasets/sign-language-gesture-images-dataset/Gesture Image Data/A/993.jpg"
img = Image.open(sample_path)

# PIL reports the size as (width, height)
width_height = img.size
print(f'Image dimensions: {width_height}')


Image dimensions: (50, 50)


In [None]:
import os

import tensorflow as tf  # was missing: every `tf.` reference below needs it
from transformers import ViTForImageClassification, ViTFeatureExtractor

# Dataset path: one sub-folder per class. Uses the same ".../projects/..." root
# as the sys.path entry and the sample image opened earlier in this notebook
# (the previous value omitted the "projects" segment).
dataset_path = "/home/studio-lab-user/projects/sign-language-image-detection/sign_datasets/sign-language-gesture-images-dataset/Gesture Image Data"

# Fixed seed so the shuffled file order is reproducible across kernel restarts.
SEED = 42

# Load dataset using Keras; class labels are inferred from the folder names.
dataset = tf.keras.preprocessing.image_dataset_from_directory(
    dataset_path,
    # Working resolution for the cached batches. This is NOT the ViT input
    # size: the sample image inspected earlier is 50x50, and the ViT checkpoint
    # used later has 224 in its name, so a further resize happens downstream.
    image_size=(96, 96),
    batch_size=64,
    shuffle=True,
    seed=SEED,
)

# Per-channel standardisation helper
def normalize_image(image, label):
    """Scale pixel values by 1/255, then standardise each channel.

    Args:
        image: Array/tensor whose last axis broadcasts against three
            per-channel values (assumes RGB, channels-last, 0-255 range -
            TODO confirm against the caller).
        label: Passed through untouched.

    Returns:
        A ``(normalized_image, label)`` tuple.

    NOTE(review): the constants below are not the standard ImageNet statistics
    (0.485/0.456/0.406 and 0.229/0.224/0.225); presumably they were computed on
    this dataset - confirm their origin. This helper is also not called in any
    of the cells visible here.
    """
    channel_mean = [0.52732987, 0.4507709, 0.41209071]
    channel_std = [0.19798545, 0.23537221, 0.26049182]

    scaled = image / 255.0
    centered = scaled - channel_mean
    return centered / channel_std, label

# Split the dataset into training, validation, and test sets.
#
# The dataset was built with shuffle=True, which reshuffles on every iteration.
# Applying take()/skip() directly to it therefore selects different images each
# time a split is iterated, so the three splits can share images. To get
# disjoint splits, freeze ONE ordering first: cache the batches in memory and
# run a single full pass so the cache is completely populated. Every later
# iteration replays exactly that order.
# (Memory use is the same as caching the three splits separately; the pass
# below is where the images are actually read and decoded.)
num_batches = len(dataset)
frozen_dataset = dataset.cache()
for _ in frozen_dataset:
    pass

# Sizes are counted in batches, not images.
train_size = int(0.7 * num_batches)  # 70% for training
val_size = int(0.1 * num_batches)    # 10% for validation
test_size = num_batches - train_size - val_size  # remaining ~20% for testing

# Create the training, validation, and test datasets from the frozen order
train_dataset = frozen_dataset.take(train_size)
val_dataset = frozen_dataset.skip(train_size).take(val_size)
test_dataset = frozen_dataset.skip(train_size + val_size)

# Prefetching for performance (the data itself is already cached above)
AUTOTUNE = tf.data.AUTOTUNE
train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)


In [None]:
import tensorflow as tf
from transformers import TFViTForImageClassification, ViTImageProcessor

# Checkpoint shared by the image processor and the model.
# The id used previously ('google/vit-tiny-patch16-224-in21k') does not appear
# among Google's published ViT checkpoints (base / large / huge), so loading it
# fails; ViT-Base is the smallest official in21k model.
# NOTE(review): confirm the checkpoint choice against the Hub and the available
# compute - ViT-Base is considerably larger than a "tiny" variant.
VIT_CHECKPOINT = 'google/vit-base-patch16-224-in21k'
VIT_IMAGE_SIZE = 224  # input side length, the "224" in the checkpoint name
VIT_NUM_CHANNELS = 3

# Resizes and normalizes images the way the checkpoint expects.
feature_extractor = ViTImageProcessor.from_pretrained(VIT_CHECKPOINT)


def preprocess_vit(image, label):
    """Run the ViT image processor on one image or on a whole batch.

    Called eagerly through ``tf.py_function``, so ``image`` is an eager tensor.

    Args:
        image: A single image (H, W, C) or a batch (N, H, W, C) with pixel
            values in the 0-255 range, as produced by the directory loader in
            the previous cell.
        label: Label(s) belonging to ``image``; returned unchanged.

    Returns:
        ``(pixel_values, label)`` where ``pixel_values`` is the processor
        output for every image passed in (previously only the first image of
        a batch was kept while all labels were passed through).
    """
    # Resizing in the loader leaves non-integer floats in [0, 255]; hand the
    # processor unambiguous uint8 pixels instead.
    pixels = tf.cast(
        tf.round(tf.clip_by_value(tf.cast(image, tf.float32), 0.0, 255.0)),
        tf.uint8,
    ).numpy()
    is_batch = pixels.ndim == 4

    encoded = feature_extractor(
        images=list(pixels) if is_batch else pixels,
        return_tensors="tf",
    )["pixel_values"]

    if not is_batch:
        # A single image still comes back with a leading batch axis of 1.
        encoded = encoded[0]
    return encoded, label


def _to_vit_inputs(images, labels):
    """Graph-side wrapper around ``preprocess_vit`` for ``Dataset.map``."""
    pixel_values, labels = tf.py_function(
        func=preprocess_vit, inp=[images, labels], Tout=[tf.float32, tf.int32]
    )
    # py_function discards static shapes; restore them so Keras can build the
    # model. The processor emits channels-first tensors.
    pixel_values.set_shape([None, VIT_NUM_CHANNELS, VIT_IMAGE_SIZE, VIT_IMAGE_SIZE])
    labels.set_shape([None])
    return pixel_values, labels


# Apply the preprocessing to the (batched) datasets, then prefetch.
# NOTE: this rebinds the split names, so re-run the previous cell before
# re-running this one.
train_dataset = train_dataset.map(_to_vit_inputs).prefetch(buffer_size=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(_to_vit_inputs).prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(_to_vit_inputs).prefetch(buffer_size=tf.data.AUTOTUNE)

# Load the pretrained ViT backbone with a fresh classification head, one output
# per class folder. The TF* class is required: the plain
# ViTForImageClassification is the PyTorch model and has no Keras fit().
# NOTE(review): confirm the installed transformers release still ships the
# TensorFlow classes and is compatible with the installed Keras version.
vit_model = TFViTForImageClassification.from_pretrained(
    VIT_CHECKPOINT,
    num_labels=len(dataset.class_names),
)

# Compile the model; the model outputs raw logits, hence from_logits=True.
vit_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model. No batch_size here: the datasets are already batched, and
# Keras rejects batch_size when given a tf.data.Dataset.
history = vit_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)

# Evaluate the model on the test set
test_loss, test_accuracy = vit_model.evaluate(test_dataset)
print(f'Test accuracy: {test_accuracy:.4f}')