In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import torch
from torch.utils.data import Dataset, DataLoader
import json
import numpy as np
from torchvision import transforms



In [None]:
# Display names for the gesture classes.
# GestureDataset assigns label indices from sorted(os.listdir(root_dir)), i.e. in
# alphabetical order of the class directory names. The list is sorted here so that
# gestures[i] is the name of label i when it is used for plot ticks and reports.
gestures = sorted([
    'pointing',
    'open_palm',
    'thumb_index_touch',
    'thumb_middle_touch',
    'fist'
])

In [2]:
class GestureCNNLSTM(nn.Module):
    """Classify a sequence of RGB frames with a per-frame CNN followed by a bidirectional LSTM.

    Args:
        num_classes (int): Number of output logits.
        hidden_size (int): LSTM hidden size per direction.
        num_layers (int): Number of stacked LSTM layers.
        input_hw (tuple): (height, width) of the frames the model will receive. It is
            only used to size the LSTM input. The default (224, 224) yields the
            original 256 * 14 * 14 features per frame.

    ``forward`` expects a 5-D tensor ``(batch, seq_len, 3, H, W)`` and returns
    ``(batch, num_classes)`` raw logits. It does NOT accept landmark windows of
    shape ``(batch, seq_len, features)``.
    """

    def __init__(self, num_classes, hidden_size=128, num_layers=2, input_hw=(224, 224)):
        super().__init__()

        height, width = input_hw
        if height < 16 or width < 16:
            # Four 2x2 max-pools divide each spatial dimension by 16.
            raise ValueError(
                f"input_hw must be at least (16, 16) for four 2x2 poolings, got {tuple(input_hw)}"
            )
        self.input_hw = (height, width)

        # CNN for spatial feature extraction: four conv -> batch-norm -> ReLU -> pool
        # stages. The layers are appended in the same order as before, so the
        # nn.Sequential indices (and therefore state_dict keys) are unchanged.
        layers = []
        in_channels = 3
        for out_channels in (32, 64, 128, 256):
            layers += [
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
            ]
            in_channels = out_channels
        self.cnn = nn.Sequential(*layers)

        # Flattened CNN output per frame (256 channels, each spatial dim halved 4 times).
        self.feature_size = 256 * (height // 16) * (width // 16)

        # LSTM for temporal feature extraction
        self.lstm = nn.LSTM(
            input_size=self.feature_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )

        # Fully connected head; the input is hidden_size * 2 because the LSTM is bidirectional.
        self.fc = nn.Sequential(
            nn.Linear(hidden_size * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch of frame sequences.

        Args:
            x (torch.Tensor): Tensor of shape (batch, seq_len, C, H, W).

        Returns:
            torch.Tensor: Logits of shape (batch, num_classes).

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                "GestureCNNLSTM expects a 5-D tensor (batch, seq_len, C, H, W); "
                f"got a tensor with shape {tuple(x.shape)}"
            )
        batch_size, seq_len, C, H, W = x.shape

        # Fold time into the batch dimension so every frame goes through the CNN.
        # reshape (not view) so that non-contiguous inputs, e.g. transposed tensors, work.
        x = x.reshape(batch_size * seq_len, C, H, W)
        x = self.cnn(x)  # (batch_size * seq_len, 256, H/16, W/16)
        x = x.reshape(batch_size, seq_len, -1)  # (batch_size, seq_len, feature_size)

        # Pass through LSTM
        lstm_out, _ = self.lstm(x)  # (batch_size, seq_len, hidden_size*2)

        # Take the output at the last time step.
        # NOTE(review): for the reverse direction this position has only processed the
        # final frame; kept as-is so existing checkpoints keep producing the same output.
        last_time_step = lstm_out[:, -1, :]  # (batch_size, hidden_size*2)

        # Classification
        return self.fc(last_time_step)  # (batch_size, num_classes)


In [3]:


class GestureDataset(Dataset):
    """Sliding-window dataset over per-frame hand-landmark JSON files.

    Each sub-directory of ``root_dir`` is one class. Every ``.json`` file in a class
    directory holds one frame: a list of dicts with ``'x'``, ``'y'`` and ``'z'`` keys.
    A sample is ``sequence_length`` consecutive frames of one class, and windows
    overlap with a stride of one frame.
    """

    # Added to the per-feature standard deviation so that a coordinate that is
    # constant across a window does not trigger a division by zero (NaN/inf).
    _EPS = 1e-6

    def __init__(self, root_dir, sequence_length=30, transform=None):
        """
        Args:
            root_dir (string): Directory with all the gesture subdirectories.
            sequence_length (int): Number of frames per sequence.
            transform (callable, optional): Optional transform to be applied on a sample.
                When omitted, the window is standardized per feature and returned
                as a float tensor.
        """
        self.root_dir = root_dir
        self.sequence_length = sequence_length
        self.transform = transform
        # Only real, non-hidden directories are classes. Stray files or folders such as
        # .DS_Store / .ipynb_checkpoints would otherwise receive a label index,
        # inflating the class count and shifting the indices of the real classes.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if not entry.startswith('.') and os.path.isdir(os.path.join(root_dir, entry))
        )
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}
        self.samples = self.make_dataset()

    @staticmethod
    def _frame_order_key(filename):
        """Sort key that orders names by text prefix, then by trailing number.

        Plain string sorting places ``x_10.json`` before ``x_2.json``, which would
        scramble the frame order inside a window.
        """
        stem = os.path.splitext(filename)[0]
        prefix = stem.rstrip('0123456789')
        digits = stem[len(prefix):]
        return (prefix, int(digits) if digits else -1, filename)

    def make_dataset(self):
        """Return a list of ``(frame_paths, class_index)`` tuples, one per window."""
        samples = []
        for cls in self.classes:
            cls_dir = os.path.join(self.root_dir, cls)
            json_files = sorted(
                (f for f in os.listdir(cls_dir) if f.endswith('.json')),
                key=self._frame_order_key,
            )
            # A negative range (fewer files than sequence_length) yields no windows.
            for start in range(len(json_files) - self.sequence_length + 1):
                window = json_files[start:start + self.sequence_length]
                seq_paths = [os.path.join(cls_dir, f) for f in window]
                samples.append((seq_paths, self.class_to_idx[cls]))
        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one window and return ``(sequence, label)``.

        ``sequence`` has one row per frame and three values (x, y, z) per landmark.
        """
        seq_paths, label = self.samples[idx]
        frames = []
        for frame_path in seq_paths:
            with open(frame_path, 'r') as f:
                landmarks = json.load(f)
            # Flatten landmarks: list of dicts -> [x0, y0, z0, x1, y1, z1, ...]
            flat_landmarks = [
                coord for lm in landmarks for coord in (lm['x'], lm['y'], lm['z'])
            ]
            frames.append(np.array(flat_landmarks, dtype=np.float32))

        sequence = np.array(frames)  # Shape: (sequence_length, 3 * landmarks per frame)
        if self.transform:
            sequence = self.transform(sequence)
        else:
            # Standardize every feature over the time axis of this window.
            mean = np.mean(sequence, axis=0)
            std = np.std(sequence, axis=0)
            sequence = (sequence - mean) / (std + self._EPS)
            sequence = torch.as_tensor(sequence, dtype=torch.float32)

        return sequence, label


In [4]:
class LandmarkTransform:
    """Augment a landmark window with Gaussian noise, then standardize it per feature.

    Args:
        noise_std (float): Standard deviation of the additive Gaussian noise.
            Use 0 to disable the augmentation (e.g. for evaluation data).
        eps (float): Added to the per-feature standard deviation so that a feature
            that is constant across the window does not cause a division by zero.
    """

    def __init__(self, noise_std=0.01, eps=1e-6):
        self.noise_std = noise_std
        self.eps = eps

    def __call__(self, sample):
        """Return the transformed window as a float32 tensor of the same shape.

        Args:
            sample: Array-like whose first axis is time (one row per frame).
        """
        sample = np.asarray(sample)
        if self.noise_std > 0:
            # Random jitter on every coordinate
            sample = sample + np.random.normal(0, self.noise_std, sample.shape)
        # Normalize each feature over the time axis
        mean = np.mean(sample, axis=0)
        std = np.std(sample, axis=0)
        sample = (sample - mean) / (std + self.eps)
        return torch.as_tensor(sample, dtype=torch.float32)

# Transform pipeline handed to the datasets: a Compose holding one LandmarkTransform step.
landmark_step = LandmarkTransform()
data_transforms = transforms.Compose([landmark_step])


In [None]:
# Paths to dataset directories
train_dir = 'gesture_dataset/train'
val_dir = 'gesture_dataset/val'
test_dir = 'gesture_dataset/test'

# Create datasets.
# Only the training split receives `data_transforms`, which adds random Gaussian
# noise. With transform=None, GestureDataset applies the same per-window
# normalization without noise, so validation/test metrics are not perturbed by
# augmentation and are repeatable from run to run.
train_dataset = GestureDataset(root_dir=train_dir, sequence_length=30, transform=data_transforms)
val_dataset = GestureDataset(root_dir=val_dir, sequence_length=30, transform=None)
test_dataset = GestureDataset(root_dir=test_dir, sequence_length=30, transform=None)

# Create data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output logit per class directory found in the training set.
num_classes = len(train_dataset.classes)

# NOTE(review): GestureCNNLSTM.forward unpacks a 5-D (batch, seq_len, C, H, W) tensor,
# while GestureDataset yields (seq_len, 63) landmark windows. Confirm the intended
# model/input pairing before training.
model = GestureCNNLSTM(num_classes=num_classes, hidden_size=128, num_layers=2)

# Move the model to the selected device (last expression, so the cell shows the model).
model.to(device)


In [None]:
import torch.optim as optim

# Adam over all model parameters with a fixed learning rate of 1e-4.
learning_rate = 1e-4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Cross-entropy loss on the model outputs against the integer class labels.
criterion = nn.CrossEntropyLoss()


In [None]:
import time
from tqdm import tqdm

num_epochs = 50
best_val_acc = 0.0

# Train for num_epochs; after every epoch evaluate on the validation split and keep
# the weights with the best validation accuracy in 'best_gesture_model.pth'.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    loop = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for sequences, labels in loop:
        sequences = sequences.to(device)
        labels = labels.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Statistics: the loss is a batch mean, so weight it by the batch size.
        batch_count = labels.size(0)
        running_loss += loss.item() * batch_count
        predicted = outputs.argmax(dim=1)  # replaces the legacy `.data` access
        total += batch_count
        correct += (predicted == labels).sum().item()

        loop.set_postfix(loss=loss.item(), accuracy=100 * correct / total)

    # Average over the samples actually seen; max(..., 1) avoids a ZeroDivisionError
    # when the split is empty.
    epoch_loss = running_loss / max(total, 1)
    epoch_acc = 100 * correct / max(total, 1)

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for sequences, labels in val_loader:
            sequences = sequences.to(device)
            labels = labels.to(device)
            outputs = model(sequences)
            loss = criterion(outputs, labels)

            batch_count = labels.size(0)
            val_loss += loss.item() * batch_count
            predicted = outputs.argmax(dim=1)
            val_total += batch_count
            val_correct += (predicted == labels).sum().item()

    val_epoch_loss = val_loss / max(val_total, 1)
    val_epoch_acc = 100 * val_correct / max(val_total, 1)

    print(f'Epoch [{epoch+1}/{num_epochs}] '
          f'Train Loss: {epoch_loss:.4f} Train Acc: {epoch_acc:.2f}% '
          f'Val Loss: {val_epoch_loss:.4f} Val Acc: {val_epoch_acc:.2f}%')

    # Save the best model (strict improvement only)
    if val_epoch_acc > best_val_acc:
        best_val_acc = val_epoch_acc
        torch.save(model.state_dict(), 'best_gesture_model.pth')
        print(f'Best model saved with Val Acc: {best_val_acc:.2f}%')


<h1><strong>TEST</strong></h1>

In [None]:
# Load the best model.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('best_gesture_model.pth', map_location=device))
model.to(device)
model.eval()

test_loss = 0.0
test_correct = 0
test_total = 0

# Accumulate the summed loss and the number of correct predictions over the test split.
with torch.no_grad():
    for sequences, labels in test_loader:
        sequences = sequences.to(device)
        labels = labels.to(device)
        outputs = model(sequences)
        loss = criterion(outputs, labels)

        # The loss is a batch mean, so weight it by the batch size.
        batch_count = labels.size(0)
        test_loss += loss.item() * batch_count
        predicted = outputs.argmax(dim=1)  # replaces the legacy `.data` access
        test_total += batch_count
        test_correct += (predicted == labels).sum().item()

# max(..., 1) avoids a ZeroDivisionError when the test split is empty.
test_epoch_loss = test_loss / max(test_total, 1)
test_epoch_acc = 100 * test_correct / max(test_total, 1)

print(f'Test Loss: {test_epoch_loss:.4f} Test Acc: {test_epoch_acc:.2f}%')


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Class names in label-index order. They are taken from the dataset rather than from
# a hand-written list, because the label indices are assigned by GestureDataset
# (alphabetical order of the class directories); a list in a different order or of a
# different length would attach the wrong names to the rows/columns.
# Assumes the train and test directories contain the same classes.
class_names = list(train_dataset.classes)
label_ids = list(range(len(class_names)))

# Collect predictions and ground-truth labels over the test set
all_preds = []
all_labels = []

model.eval()
with torch.no_grad():
    for sequences, labels in test_loader:
        sequences = sequences.to(device)
        predicted = model(sequences).argmax(dim=1)
        all_preds.extend(predicted.cpu().tolist())
        all_labels.extend(labels.tolist())

# Confusion Matrix.
# Passing labels= keeps one row/column per class even if a class never occurs in the
# test data, so the matrix always matches the tick labels.
cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
plt.figure(figsize=(10,8))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Classification Report
print(classification_report(all_labels, all_preds, labels=label_ids, target_names=class_names))


In [None]:
# import cv2
# import mediapipe as mp
# import os
# import time
# import json

# # Initialize MediaPipe Hands
# mp_hands = mp.solutions.hands
# hands = mp_hands.Hands(
#     static_image_mode=False,
#     max_num_hands=1,
#     min_detection_confidence=0.7,
#     min_tracking_confidence=0.7
# )
# mp_drawing = mp.solutions.drawing_utils

# # Define gestures
# gestures = [
#     'swipe_up',
#     'swipe_down',
#     'swipe_left',
#     'swipe_right',
#     'pointing',
#     'open_palm',
#     'thumb_index_touch',
#     'thumb_middle_touch',
#     'fist'
# ]

# # Create directories for each gesture
# data_dir = 'gesture_dataset'
# for gesture in gestures:
#     os.makedirs(os.path.join(data_dir, 'train', gesture), exist_ok=True)
#     os.makedirs(os.path.join(data_dir, 'val', gesture), exist_ok=True)

# # Function to save landmarks
# def save_landmarks(gesture, landmarks, seq_num, phase='train'):
#     gesture_dir = os.path.join(data_dir, phase, gesture)
#     filename = f'{gesture}_{seq_num}.json'
#     filepath = os.path.join(gesture_dir, filename)
#     with open(filepath, 'w') as f:
#         json.dump(landmarks, f)

# # Start video capture
# cap = cv2.VideoCapture(0)
# gesture = None
# seq_num = {g: 0 for g in gestures}

# print("Press the corresponding number key to start recording a gesture:")
# for idx, g in enumerate(gestures):
#     print(f"{idx}: {g}")
# print("Press 'q' to quit.")

# while cap.isOpened():
#     ret, frame = cap.read()
#     if not ret:
#         break

#     # Flip the frame for mirror effect
#     frame = cv2.flip(frame, 1)
#     image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#     results = hands.process(image)

#     # Draw hand landmarks
#     image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
#     if results.multi_hand_landmarks:
#         for hand_landmarks in results.multi_hand_landmarks:
#             mp_drawing.draw_landmarks(
#                 image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

#     cv2.imshow('Data Collection', image)

#     key = cv2.waitKey(1) & 0xFF

#     if key == ord('q'):
#         break
#     elif key in [ord(str(i)) for i in range(len(gestures))]:
#         gesture = gestures[key - ord('0')]
#         print(f"Recording gesture: {gesture}")
#         time.sleep(1)  # Brief pause before recording
#         # Record for 1 second (assuming ~30 FPS)
#         for _ in range(30):
#             ret, frame = cap.read()
#             if not ret:
#                 break
#             frame = cv2.flip(frame, 1)
#             image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#             results = hands.process(image)

#             if results.multi_hand_landmarks:
#                 for hand_landmarks in results.multi_hand_landmarks:
#                     landmarks = []
#                     for lm in hand_landmarks.landmark:
#                         landmarks.append({
#                             'x': lm.x,
#                             'y': lm.y,
#                             'z': lm.z
#                         })
#                     save_landmarks(gesture, landmarks, seq_num[gesture], phase='train')
#                     seq_num[gesture] += 1

#             # Optionally display progress
#             cv2.imshow('Data Collection', frame)
#             if cv2.waitKey(1) & 0xFF == ord('q'):
#                 break

#         print(f"Finished recording gesture: {gesture}")

# cap.release()
# cv2.destroyAllWindows()


In [None]:
import cv2
import torch
import numpy as np
from collections import deque
from torchvision import transforms

# Initialize MediaPipe Hands
import mediapipe as mp

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7
)
mp_drawing = mp.solutions.drawing_utils

# Load the trained model.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): GestureCNNLSTM.forward unpacks a 5-D (batch, seq_len, C, H, W) tensor,
# but this loop feeds (1, seq_len, 63) landmark windows. Confirm the intended model.
model = GestureCNNLSTM(num_classes=num_classes)
model.load_state_dict(torch.load('best_gesture_model.pth', map_location=device))
model.to(device)
model.eval()

# Parameters
sequence_length = 30
norm_eps = 1e-6  # guards against a zero standard deviation
buffer = deque(maxlen=sequence_length)  # most recent raw (un-normalized) landmark frames
# Kept for compatibility with later cells; it is not applied below because
# LandmarkTransform adds random noise, which is a training-time augmentation.
transform = transforms.Compose([
    LandmarkTransform()
])

# Start video capture
camera_index = 3
cap = cv2.VideoCapture(camera_index)
if not cap.isOpened():
    print(f'Could not open camera {camera_index}; no frames will be processed.')

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Flip the frame for a mirror-like effect
        frame = cv2.flip(frame, 1)
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(image)

        gesture_label = None

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw landmarks on the frame
                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Extract and flatten landmarks: [x0, y0, z0, x1, y1, z1, ...]
                landmarks = []
                for lm in hand_landmarks.landmark:
                    landmarks.extend([lm.x, lm.y, lm.z])
                buffer.append(np.array(landmarks, dtype=np.float32))

                if len(buffer) == sequence_length:
                    # Normalize exactly like the training data: each feature is
                    # standardized over the time axis of the window (not per frame
                    # across all coordinates).
                    window = np.stack(buffer)  # (seq_len, 63)
                    window = (window - window.mean(axis=0)) / (window.std(axis=0) + norm_eps)
                    sequence = torch.from_numpy(window).unsqueeze(0).to(device)  # (1, seq_len, 63)

                    # Predict gesture
                    with torch.no_grad():
                        output = model(sequence)
                        predicted = output.argmax(dim=1)
                        gesture_label = train_dataset.classes[predicted.item()]

        # Display gesture label
        if gesture_label:
            cv2.putText(frame, f'Gesture: {gesture_label}', (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        cv2.imshow('Gesture Recognition', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if a frame raised an error.
    cap.release()
    cv2.destroyAllWindows()
