# Sign Language Image Classification with Vision Transformers

In [7]:
import sys

# Root of the project checkout; added to the import path so that modules
# living in the project directory can be imported from this notebook.
PROJECT_ROOT = "/home/studio-lab-user/projects/sign-language-image-detection"

# Guarded so that re-running this cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [8]:
# Install required libraries for ViT and other dependencies
!pip install seaborn              
!pip install transformers        
!pip install tensorflow           
!pip install datasets             
!pip install matplotlib           
!pip install Pillow               
!pip install numpy                
!pip install tensorflow-hub      
!pip install scikit-learn       
!pip install tqdm               




## Import necessary packages

In [None]:
# Importing necessary libraries
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from PIL import Image
from sklearn.metrics import confusion_matrix, precision_recall_curve
from transformers import (
    TFViTForImageClassification,
    ViTFeatureExtractor,
    ViTForImageClassification,
)

## Load dataset

In [12]:
# Load an image to check that the dataset is downloaded correctly.
# `Image` is imported in the notebook's import cell.
sample_image_path = "/home/studio-lab-user/projects/sign-language-image-detection/sign_datasets/sign-language-gesture-images-dataset/Gesture Image Data/A/993.jpg"

# The context manager closes the underlying file once the size has been read.
with Image.open(sample_image_path) as img:
    # Print the image dimensions (width, height)
    print(f'Image dimensions: {img.size}')


Image dimensions: (50, 50)


## Dataset Preprocessing for Vision Transformer (ViT) Model

### **Load Dataset**
   - The dataset is loaded using Keras's `image_dataset_from_directory` function.
   - Images are resized to 224x224 to match the input size of the ViT model.

### **Model and Feature Extractor Loading**:
   - The Vision Transformer model `google/vit-tiny-patch16-224-in21k` is loaded from the `transformers` library.
   - The corresponding feature extractor is also loaded to preprocess images for the model.

### **Preprocessing**:
   - A `preprocess_vit` function applies the feature extractor transformations to normalize and prepare images for the ViT model.
   - The dataset is mapped to this preprocessing function with parallelization for improved performance.

### **Dataset Splitting**:
   - The dataset is split into training (70%), validation (10%), and testing (20%) subsets.


In [None]:

# Dataset path (same project root as the sys.path entry and the sample-image
# check earlier in the notebook).
dataset_path = "/home/studio-lab-user/projects/sign-language-image-detection/sign_datasets/sign-language-gesture-images-dataset/Gesture Image Data"

# Tunable settings for loading and splitting.
IMAGE_SIZE = (224, 224)   # Resize to 224x224 to match ViT input size
BATCH_SIZE = 64
SPLIT_SEED = 42           # must be identical for both subsets so they are complementary
HOLDOUT_FRACTION = 0.3    # 30% held out, later divided into validation (10%) and test (20%)
AUTOTUNE = tf.data.AUTOTUNE


def _load_subset(subset):
    """Load one side ("training" or "validation") of the seeded file-level split."""
    return tf.keras.preprocessing.image_dataset_from_directory(
        dataset_path,
        validation_split=HOLDOUT_FRACTION,
        subset=subset,
        seed=SPLIT_SEED,
        image_size=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        shuffle=True,
    )


# Load dataset using Keras. The split is made on the file list, so no training
# image can end up in the validation/test data. (Splitting one shuffled
# pipeline with take/skip is unsafe: Keras reshuffles on every iteration, so
# samples drift across the take/skip boundary.)
# `dataset` is the raw training subset; it is deliberately NOT overwritten by
# the mapped dataset so that `dataset.class_names` stays available later on.
dataset = _load_subset("training")
holdout_dataset = _load_subset("validation")

# Load the ViT feature extractor (used here for its normalization statistics).
# NOTE(review): confirm that this checkpoint id exists on the Hugging Face Hub.
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-tiny-patch16-224-in21k")
_pixel_mean = tf.constant(feature_extractor.image_mean, dtype=tf.float32)
_pixel_std = tf.constant(feature_extractor.image_std, dtype=tf.float32)


def preprocess_vit(image, label):
    """Normalize a batch of images for the ViT model using TensorFlow ops only.

    Reproduces the feature extractor's rescale + normalize steps without
    calling `.numpy()`, which is unavailable inside `Dataset.map` because the
    mapped function runs in graph mode.

    Args:
        image: batch of images, channels-last, with pixel values in [0, 255]
            (assumes the (batch, height, width, channels) layout produced by
            `image_dataset_from_directory`).
        label: labels for the batch; returned unchanged.

    Returns:
        Tuple of (pixel values in channels-first layout, label).
    """
    image = tf.cast(image, tf.float32) / 255.0
    image = (image - _pixel_mean) / _pixel_std
    # The Hugging Face ViT models take channels-first pixel values.
    image = tf.transpose(image, perm=[0, 3, 1, 2])
    return image, label


# Apply preprocessing to the datasets
train_dataset = dataset.map(preprocess_vit, num_parallel_calls=AUTOTUNE)
holdout_vit_dataset = holdout_dataset.map(preprocess_vit, num_parallel_calls=AUTOTUNE)

# Split sizes, counted in batches. One third of the 30% hold-out is used for
# validation (about 10% of the data), the remainder for testing (about 20%).
train_size = len(dataset)
holdout_size = len(holdout_dataset)
val_size = holdout_size // 3
test_size = holdout_size - val_size
total_size = train_size + holdout_size

# Validation and test both come from the hold-out subset. Because of the
# per-iteration local shuffle a few hold-out samples may appear in both, but
# neither of them overlaps with the training data.
val_dataset = holdout_vit_dataset.take(val_size)
test_dataset = holdout_vit_dataset.skip(val_size)

# Caching and prefetching for performance. cache() also freezes the content of
# each split after its first full pass.
# NOTE(review): cache() keeps the decoded 224x224 float tensors in memory;
# pass a filename to cache() or drop it if the dataset does not fit in RAM.
train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)

## Train and fine-tune a pretrained Vision Transformer (ViT) model 

- **Load a Pretrained ViT Model:** The pretrained ViT model (`vit-tiny-patch16-224-in21k`) is loaded from the Hugging Face Hub.

- **Fine-tune:** All layers of the ViT model are set to be trainable. This is done by iterating over the model's parameters and setting `requires_grad = True` for each layer.

- **Compile:** The model is compiled using the Adam optimizer with a learning rate of 1e-3. The loss function is `SparseCategoricalCrossentropy`.

- **Train:** The model is trained using the training dataset (`train_dataset`) and validated using the validation dataset (`val_dataset`) for 2 epochs.

In [None]:
# Resolve the class names. `class_names` only exists on the dataset object
# returned by `image_dataset_from_directory`; a dataset produced by `.map()`
# does not carry it, so fall back to the class sub-directories, sorted the
# same way Keras orders them (alphanumerically).
class_names = getattr(dataset, "class_names", None)
if class_names is None:
    class_names = sorted(
        entry.name
        for entry in os.scandir(dataset_path)
        if entry.is_dir() and not entry.name.startswith(".")
    )

# Load pretrained ViT model.
# The TensorFlow class is required here: the model is trained with the Keras
# compile()/fit() API below, which the PyTorch `ViTForImageClassification`
# class does not provide.
# NOTE(review): confirm that this checkpoint id exists on the Hub and ships
# TensorFlow weights (otherwise `from_pt=True` and an installed torch are needed).
vit_model = TFViTForImageClassification.from_pretrained(
    'google/vit-tiny-patch16-224-in21k',
    num_labels=len(class_names),
    id2label=dict(enumerate(class_names)),
    label2id={name: index for index, name in enumerate(class_names)},
)

# Fine-tuning: keep every layer trainable (nothing is frozen).
vit_model.trainable = True

# Compile the model. The model outputs raw logits, hence from_logits=True.
vit_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model
history = vit_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

## Evaluation on test set

In [None]:
# Score the trained model on the test split; evaluate() yields the loss
# followed by the compiled accuracy metric.
test_metrics = vit_model.evaluate(test_dataset)
test_loss, test_accuracy = test_metrics
print(f'Test accuracy: {test_accuracy:.4f}')

## Visualizations and Post processing

### Training history


In [None]:
# Visualize the training history to check for overfitting
def plot_history(history):
    """Plot accuracy and loss per epoch, side by side.

    Args:
        history: object with a `history` dict that maps metric names
            ('accuracy', 'loss' and optionally 'val_accuracy', 'val_loss')
            to per-epoch values, as returned by `fit()`.

    The validation curves are drawn only when present, so a history from a
    run without validation data no longer raises a KeyError.
    """
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
    fig, axes = plt.subplots(1, len(panels), figsize=(12, 6))

    for ax, (metric, title) in zip(axes, panels):
        # Training curve
        ax.plot(history.history[metric], label=f'Train {title}')

        # Validation curve (absent when fit() ran without validation data)
        val_key = f'val_{metric}'
        if val_key in history.history:
            ax.plot(history.history[val_key], label=f'Validation {title}')

        ax.set_title(f'{title} over Epochs')
        ax.set_xlabel('Epochs')
        ax.set_ylabel(title)
        ax.legend()

    fig.tight_layout()
    plt.show()

# Plot the training history
plot_history(history)

### Confusion Matrix



In [None]:
# Predict the labels on the test set
y_true = []
y_pred = []

for images, labels in test_dataset:
    logits = vit_model(images, training=False).logits
    y_true.extend(labels.numpy())
    # tf.argmax returns a tensor, so .numpy() is valid here (np.argmax already
    # returns an ndarray, which has no .numpy() method).
    y_pred.extend(tf.argmax(logits, axis=-1).numpy())

# Generate confusion matrix. When the class names are known, pin the label set
# so the matrix has one row/column per class even if a class is missing from
# the test split; otherwise let seaborn choose the tick labels.
class_names = getattr(dataset, "class_names", None)
if class_names is not None:
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
    tick_labels = class_names
else:
    cm = confusion_matrix(y_true, y_pred)
    tick_labels = "auto"

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


### Precision-Recall curve 

In [None]:
# Collect labels and class probabilities over the WHOLE test set in a single
# pass, so both arrays are aligned sample by sample (previously only the last
# batch left over from an earlier loop was scored).
batch_labels = []
batch_probs = []
for images, labels in test_dataset:
    logits = vit_model(images, training=False).logits
    batch_labels.append(labels.numpy())
    batch_probs.append(tf.nn.softmax(logits, axis=-1).numpy())

label_ids = np.concatenate(batch_labels, axis=0)
y_pred_prob = np.concatenate(batch_probs, axis=0)   # (n_samples, n_classes)

# precision_recall_curve only accepts binary targets: one-hot encode the
# integer labels so every (sample, class) pair becomes one binary decision.
num_classes = y_pred_prob.shape[1]
y_true_bin = np.eye(num_classes, dtype=int)[label_ids]

# Generate the micro-averaged precision-recall curve (all classes pooled)
precision, recall, _ = precision_recall_curve(y_true_bin.ravel(), y_pred_prob.ravel())

# Plot Precision-Recall curve
plt.figure(figsize=(10, 8))
plt.plot(recall, precision, color='b', lw=2)
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()
