In [8]:
# Core Python Libraries
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt

# MediaPipe for hand tracking
import mediapipe as mp

# PyTorch for deep learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Text-to-Speech
from gtts import gTTS

# Jupyter display for audio
import IPython.display as ipd

# Utilities
from tqdm import tqdm
import pandas as pd


In [9]:
def extract_hand_landmarks(image_path):
    """
    Extracts 21 hand landmarks (x, y, z) from an image using MediaPipe.
    Returns a flat numpy array of shape (63,) or None if no hand is detected.
    """
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1)
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading image: {image_path}")
        return None
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    hands.close()
    if results.multi_hand_landmarks:
        landmarks = []
        for lm in results.multi_hand_landmarks[0].landmark:
            landmarks.extend([lm.x, lm.y, lm.z])
        return np.array(landmarks)
    else:
        return None


In [15]:
sample_image_path = '../data/indian_sign_language/Indian/2/0.jpg'  # Update path as needed
landmarks = extract_hand_landmarks(sample_image_path)
print("Landmarks shape:", landmarks.shape if landmarks is not None else "No hand detected")
print("Landmarks:", landmarks)

Landmarks shape: (63,)
Landmarks: [ 4.64867830e-01  8.80217552e-01  1.02489582e-06  3.71870458e-01
  8.01113844e-01 -8.77911672e-02  3.57219130e-01  6.71855748e-01
 -1.33669019e-01  4.62136000e-01  5.76732516e-01 -1.76605433e-01
  5.60061991e-01  5.09061694e-01 -2.15091363e-01  3.70272011e-01
  4.61181670e-01 -6.88067228e-02  3.37009907e-01  2.83874810e-01
 -1.28876448e-01  3.21013629e-01  1.70809790e-01 -1.70739815e-01
  3.10772389e-01  7.96734393e-02 -1.99820369e-01  4.76568371e-01
  4.64770943e-01 -7.29502887e-02  5.14914870e-01  2.69875973e-01
 -1.33014694e-01  5.32648325e-01  1.52956322e-01 -1.80291295e-01
  5.48952460e-01  5.55894375e-02 -2.09491700e-01  5.57952702e-01
  5.20417094e-01 -8.78244787e-02  6.14293694e-01  4.01765853e-01
 -1.78209692e-01  5.65443397e-01  4.80156362e-01 -2.05650017e-01
  5.29094040e-01  5.44319272e-01 -2.02927113e-01  6.19916260e-01
  6.02885008e-01 -1.08538456e-01  6.37440145e-01  5.20595968e-01
 -1.84245676e-01  5.83595812e-01  5.74259400e-01 -1.9314

In [10]:
import glob

def process_dataset(dataset_root):
    """
    Processes all images in the dataset, extracts landmarks, and saves features and labels as .npy files.
    """
    features = []
    labels = []
    class_names = sorted(os.listdir(dataset_root))
    class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}
    print("Class mapping:", class_to_idx)
    
    for cls_name in class_names:
        cls_folder = os.path.join(dataset_root, cls_name)
        image_paths = glob.glob(os.path.join(cls_folder, "*.jpg"))
        print(f"Processing {cls_name} ({len(image_paths)} images)")
        success_count = 0
        fail_count = 0
        for img_path in tqdm(image_paths):
            lm = extract_hand_landmarks(img_path)
            if lm is not None:
                features.append(lm)
                labels.append(class_to_idx[cls_name])
                success_count += 1
            else:
                fail_count += 1
                if fail_count < 5:  # Print first few failures
                    print(f"Failed to detect hand in: {img_path}")
        print(f"Success: {success_count}, Failed: {fail_count}")
    
    features = np.array(features)
    labels = np.array(labels)
    print(f"Total samples: {features.shape[0]}")
    # Save features and labels
    np.save(os.path.join(dataset_root, "features.npy"), features)
    np.save(os.path.join(dataset_root, "labels.npy"), labels)
    print("Saved features and labels as .npy files.")
    return class_to_idx
    # Add this to your process_dataset function to debug






In [None]:
# Example usage for Indian Sign Language dataset: Already done
dataset_root = "../data/indian_sign_language/Indian"  # Adjust path if needed
class_to_idx = process_dataset(dataset_root)

In [11]:
# Process the gestures_dataset folder
gestures_dataset_root = "../data/gestures_dataset"  # Adjust path if needed
print("Processing custom gestures dataset...")
gestures_class_to_idx = process_dataset(gestures_dataset_root)


Processing custom gestures dataset...
Class mapping: {'Bye': 0, 'Hello': 1, 'No': 2, 'Perfect': 3, 'Thank You': 4, 'Yes': 5}
Processing Bye (400 images)


  0%|▏                                                                                 | 1/400 [00:00<00:40,  9.74it/s]

Failed to detect hand in: ../data/gestures_dataset\Bye\Image_1667239052.2083564.jpg
Failed to detect hand in: ../data/gestures_dataset\Bye\Image_1667239052.8112261.jpg


  1%|▌                                                                                 | 3/400 [00:00<00:38, 10.23it/s]

Failed to detect hand in: ../data/gestures_dataset\Bye\Image_1667239052.8485913.jpg
Failed to detect hand in: ../data/gestures_dataset\Bye\Image_1667239052.882727.jpg


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [00:55<00:00,  7.16it/s]


Success: 0, Failed: 400
Processing Hello (400 images)


  1%|▌                                                                                 | 3/400 [00:00<00:59,  6.67it/s]

Failed to detect hand in: ../data/gestures_dataset\Hello\Image_1667238913.6247861.jpg
Failed to detect hand in: ../data/gestures_dataset\Hello\Image_1667238913.6402302.jpg


  1%|█                                                                                 | 5/400 [00:00<00:56,  7.03it/s]

Failed to detect hand in: ../data/gestures_dataset\Hello\Image_1667238913.6956959.jpg
Failed to detect hand in: ../data/gestures_dataset\Hello\Image_1667238913.715463.jpg


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [00:53<00:00,  7.54it/s]


Success: 7, Failed: 393
Processing No (400 images)


  0%|▏                                                                                 | 1/400 [00:00<00:51,  7.74it/s]

Failed to detect hand in: ../data/gestures_dataset\No\Image_1667239349.9046743.jpg


  0%|▍                                                                                 | 2/400 [00:00<01:03,  6.30it/s]

Failed to detect hand in: ../data/gestures_dataset\No\Image_1667239350.5289142.jpg


  1%|▌                                                                                 | 3/400 [00:00<00:59,  6.71it/s]

Failed to detect hand in: ../data/gestures_dataset\No\Image_1667239350.576339.jpg


  1%|▊                                                                                 | 4/400 [00:00<00:55,  7.11it/s]

Failed to detect hand in: ../data/gestures_dataset\No\Image_1667239350.6674688.jpg


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [00:57<00:00,  6.99it/s]


Success: 0, Failed: 400
Processing Perfect (400 images)


  0%|▏                                                                                 | 1/400 [00:00<01:03,  6.29it/s]

Failed to detect hand in: ../data/gestures_dataset\Perfect\Image_1667239499.214726.jpg


  1%|▌                                                                                 | 3/400 [00:00<01:07,  5.92it/s]

Failed to detect hand in: ../data/gestures_dataset\Perfect\Image_1667239499.8150597.jpg
Failed to detect hand in: ../data/gestures_dataset\Perfect\Image_1667239499.8476486.jpg


  1%|█                                                                                 | 5/400 [00:00<00:57,  6.84it/s]

Failed to detect hand in: ../data/gestures_dataset\Perfect\Image_1667239499.8863125.jpg


 12%|█████████▋                                                                       | 48/400 [00:06<00:50,  6.91it/s]


KeyboardInterrupt: 

In [22]:
def reshape_landmarks_for_cnn(landmarks, grid_size=7):
    """
    Reshape the 1D landmark array (63 values) into a 2D grid format suitable for CNN processing.
    
    Args:
        landmarks: numpy array of shape (63,) containing x,y,z coordinates of 21 hand landmarks
        grid_size: size of the grid to reshape landmarks into (default: 7x7)
    
    Returns:
        3-channel image-like representation of landmarks with shape (3, grid_size, grid_size)
    """
    # Reshape the 63 values (21 landmarks x 3 coordinates) into 3 channels
    x_coords = landmarks[0::3]  # x coordinates
    y_coords = landmarks[1::3]  # y coordinates
    z_coords = landmarks[2::3]  # z coordinates
    
    # Create a spatial representation by placing landmarks in a grid
    x_channel = np.zeros((grid_size, grid_size))
    y_channel = np.zeros((grid_size, grid_size))
    z_channel = np.zeros((grid_size, grid_size))
    
    # Map the 21 landmarks to positions in the grid
    for i in range(21):
        # Scale coordinates to grid indices
        x_idx = min(int(x_coords[i] * (grid_size-1)), grid_size-1)
        y_idx = min(int(y_coords[i] * (grid_size-1)), grid_size-1)
        
        # Set the values in the grid
        x_channel[y_idx, x_idx] = x_coords[i]
        y_channel[y_idx, x_idx] = y_coords[i]
        z_channel[y_idx, x_idx] = z_coords[i]
    
    # Stack channels to create a 3-channel representation
    grid_representation = np.stack([x_channel, y_channel, z_channel], axis=0)
    return grid_representation



In [23]:
# Example usage:
# Reshape a single landmark sample
sample_landmark = np.load("../data/indian_sign_language/Indian/features.npy")[0]
reshaped_sample = reshape_landmarks_for_cnn(sample_landmark)
print("Original shape:", sample_landmark.shape)
print("Reshaped for CNN:", reshaped_sample.shape)


Original shape: (63,)
Reshaped for CNN: (3, 7, 7)


In [24]:
# Create a function to preprocess the entire dataset
def preprocess_dataset_for_cnn(features_path, labels_path, grid_size=7):
    """
    Preprocess the entire dataset for CNN training.
    
    Args:
        features_path: path to the features.npy file
        labels_path: path to the labels.npy file
        grid_size: size of the grid for reshaping
        
    Returns:
        X: reshaped features suitable for CNN
        y: labels
    """
    features = np.load(features_path)
    labels = np.load(labels_path)
    
    # Reshape all samples
    X = np.zeros((len(features), 3, grid_size, grid_size))
    for i in range(len(features)):
        X[i] = reshape_landmarks_for_cnn(features[i], grid_size)
    
    return X, labels


In [25]:
class GestureCNN(nn.Module):
    def __init__(self, num_classes, grid_size=7):
        super(GestureCNN, self).__init__()
        
        # Convolutional layers
        self.conv_layers = nn.Sequential(
            # First convolutional block
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            # Second convolutional block
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            # Third convolutional block
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        
        # Calculate the size after convolutions and pooling
        # For a 7x7 input with 3 max pooling layers (each dividing by 2), the output size is 7/(2^3) ≈ 1
        conv_output_size = max(1, grid_size // (2**3))
        
        # Fully connected layers
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * conv_output_size * conv_output_size, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x


In [26]:
# Create a CNN dataset class
class CNNLandmarkDataset(Dataset):
    """Dataset for CNN-ready hand landmark features and labels."""
    def __init__(self, features_path, labels_path, grid_size=7, transform=None):
        features = np.load(features_path)
        self.labels = np.load(labels_path)
        
        # Reshape features for CNN
        self.features = np.zeros((len(features), 3, grid_size, grid_size))
        for i in range(len(features)):
            self.features[i] = reshape_landmarks_for_cnn(features[i], grid_size)
            
        self.transform = transform
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        
        if self.transform:
            feature = self.transform(feature)
            
        return torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


In [27]:
# Example usage:
# Load the dataset
grid_size = 7  # Size of the grid for reshaping landmarks
isl_features_path = "../data/indian_sign_language/Indian/features.npy"
isl_labels_path = "../data/indian_sign_language/Indian/labels.npy"

In [28]:
# Create CNN dataset
isl_cnn_dataset = CNNLandmarkDataset(isl_features_path, isl_labels_path, grid_size)

# Check dataset
print(f"CNN Dataset size: {len(isl_cnn_dataset)}")
print(f"Feature shape: {isl_cnn_dataset[0][0].shape}")
print(f"Number of classes: {len(np.unique(np.load(isl_labels_path)))}")

# Initialize the CNN model
num_classes = len(np.unique(np.load(isl_labels_path)))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

cnn_model = GestureCNN(num_classes=num_classes, grid_size=grid_size)
cnn_model = cnn_model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)

print(cnn_model)

CNN Dataset size: 41684
Feature shape: torch.Size([3, 7, 7])
Number of classes: 35
Using device: cuda
GestureCNN(
  (conv_layers): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layers): Sequential(
    (0): Flatten(start_dim=1, e

In [30]:
# Create a directory for saving if it doesn't exist
os.makedirs("../checkpoints", exist_ok=True)

# Save the current state of your model and preprocessing
torch.save({
    'model_state_dict': cnn_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'grid_size': grid_size,
    'num_classes': num_classes
}, "../checkpoints/model_checkpoint.pth")

# You can also save any important variables
import pickle
with open("../checkpoints/variables.pkl", "wb") as f:
    pickle.dump({
        "class_to_idx": class_to_idx,
        "gestures_class_to_idx": gestures_class_to_idx
        # Add any other variables you want to save
    }, f)


In [31]:
# Create directory for checkpoints
os.makedirs("../checkpoints", exist_ok=True)

# Save important variables and mappings
checkpoint_data = {
    'isl_class_to_idx': class_to_idx,  # Rename for clarity
    'gestures_class_to_idx': gestures_class_to_idx,  # Add this line
    'dataset_processed': True,
    'last_completed_step': 'dataset_processing'
}


# Save the checkpoint
import pickle
with open("../checkpoints/progress_checkpoint.pkl", "wb") as f:
    pickle.dump(checkpoint_data, f)

print("Progress saved successfully! You can continue from this point tomorrow.")


Progress saved successfully! You can continue from this point tomorrow.


In [1]:
# Import necessary libraries
import os
import pickle
import numpy as np
import torch

# Check if checkpoint exists and load it
checkpoint_path = "../checkpoints/progress_checkpoint.pkl"
if os.path.exists(checkpoint_path):
    with open(checkpoint_path, "rb") as f:
        checkpoint_data = pickle.load(f)
    
    # Restore variables
    isl_class_to_idx = checkpoint_data.get('isl_class_to_idx')
    gestures_class_to_idx = checkpoint_data.get('gestures_class_to_idx')
    
    print("Previous session state restored successfully!")
    print(f"Indian Sign Language classes: {len(isl_class_to_idx) if isl_class_to_idx else 0}")
    print(f"Custom Gestures classes: {len(gestures_class_to_idx) if gestures_class_to_idx else 0}")
else:
    print("No checkpoint found. You'll need to rerun the preprocessing steps.")

# Verify that the processed data files exist
isl_features_path = "../data/indian_sign_language/Indian/features.npy"
isl_labels_path = "../data/indian_sign_language/Indian/labels.npy"
gestures_features_path = "../data/gestures_dataset/features.npy"
gestures_labels_path = "../data/gestures_dataset/labels.npy"

# Check if files exist
isl_data_exists = os.path.exists(isl_features_path) and os.path.exists(isl_labels_path)
gestures_data_exists = os.path.exists(gestures_features_path) and os.path.exists(gestures_labels_path)

print(f"Indian Sign Language processed data available: {isl_data_exists}")
print(f"Custom Gestures processed data available: {gestures_data_exists}")

# If data exists, load sample to verify
if isl_data_exists:
    # Load a small sample to verify data integrity
    features_sample = np.load(isl_features_path)[:5]
    labels_sample = np.load(isl_labels_path)[:5]
    print(f"ISL Features shape: {features_sample.shape}")
    print(f"ISL Labels sample: {labels_sample}")


Previous session state restored successfully!
Indian Sign Language classes: 35
Custom Gestures classes: 6
Indian Sign Language processed data available: True
Custom Gestures processed data available: True
ISL Features shape: (5, 63)
ISL Labels sample: [0 0 0 0 0]


In [2]:
# Load both preprocessed datasets and combine them
def combine_preprocessed_datasets():
    # Load Indian Sign Language dataset
    isl_features_path = "../data/indian_sign_language/Indian/features.npy"
    isl_labels_path = "../data/indian_sign_language/Indian/labels.npy"
    isl_features = np.load(isl_features_path)
    isl_labels = np.load(isl_labels_path)
    
    # Load custom gestures dataset
    gestures_features_path = "../data/gestures_dataset/features.npy"
    gestures_labels_path = "../data/gestures_dataset/labels.npy"
    gestures_features = np.load(gestures_features_path)
    gestures_labels = np.load(gestures_labels_path)
    
    # Load class mappings
    with open("../checkpoints/progress_checkpoint.pkl", "rb") as f:
        checkpoint_data = pickle.load(f)
    
    isl_class_to_idx = checkpoint_data['isl_class_to_idx']
    gestures_class_to_idx = checkpoint_data['gestures_class_to_idx']
    
    # Create a unified class mapping
    # First, include all ISL classes
    unified_class_to_idx = isl_class_to_idx.copy()
    
    # Then add gesture classes with offset
    offset = len(isl_class_to_idx)
    for gesture_class, idx in gestures_class_to_idx.items():
        unified_class_to_idx[f"gesture_{gesture_class}"] = idx + offset
    
    # Adjust gesture labels to avoid overlap with ISL labels
    adjusted_gestures_labels = gestures_labels + offset
    
    # Combine features and adjusted labels
    combined_features = np.vstack((isl_features, gestures_features))
    combined_labels = np.concatenate((isl_labels, adjusted_gestures_labels))
    
    print(f"ISL dataset: {len(isl_labels)} samples")
    print(f"Gestures dataset: {len(gestures_labels)} samples")
    print(f"Combined dataset: {len(combined_labels)} samples")
    print(f"Total number of classes: {len(unified_class_to_idx)}")
    
    # Save the combined dataset
    os.makedirs("../data/combined_dataset", exist_ok=True)
    np.save("../data/combined_dataset/features.npy", combined_features)
    np.save("../data/combined_dataset/labels.npy", combined_labels)
    
    # Save the unified class mapping
    with open("../data/combined_dataset/class_mapping.pkl", "wb") as f:
        pickle.dump(unified_class_to_idx, f)
    
    return combined_features, combined_labels, unified_class_to_idx

# Execute the function to combine datasets
combined_features, combined_labels, unified_class_to_idx = combine_preprocessed_datasets()


ISL dataset: 41684 samples
Gestures dataset: 7 samples
Combined dataset: 41691 samples
Total number of classes: 41
