Import libraries

In [62]:
import json
import os
import zipfile

import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
# Print the installed PyTorch version so it is recorded in the cell output.
print(torch.__version__)

1.13.1


Set up the environment

In [2]:
# Switch the working directory to the parent of the directory the kernel was
# started in, so the relative paths used further down resolve from there.
# NOTE: this cell is not idempotent - every re-run climbs one more level.
project_root = os.path.dirname(os.getcwd())
os.chdir(project_root)
current_dir = os.getcwd()
print('Current directory: ', current_dir)

Current directory:  /Users/oksanaerm/gesture_recognition
Data directory:  /Users/oksanaerm/gesture_recognition/../data/


In [4]:
# Dataset root and one sub-folder per split, all located under current_dir.
data_dir = os.path.join(current_dir, 'data/')
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split_name)
    for split_name in ('train/', 'val/', 'test/')
)
print('Data directory: ', data_dir)

Data directory:  /Users/oksanaerm/gesture_recognition/data/


Import data

In [None]:
! pip install kaggle

In [6]:
# Read the Kaggle credentials from kaggle.json (looked up relative to the
# current working directory) and expose them through the KAGGLE_USERNAME and
# KAGGLE_KEY environment variables. Only a confirmation is printed, never the
# token itself.
with open("kaggle.json", "r") as f:
    api_token = json.load(f)
os.environ['KAGGLE_USERNAME'] = api_token["username"]
os.environ['KAGGLE_KEY'] = api_token["key"]
print('Kaggle API token loaded.')

Kaggle API token loaded.


In [9]:
! kaggle datasets download -d imsparsh/gesture-recognition -p data

# Check for the presence of the dataset file
if any(file.endswith('.zip') for file in data_dir):
    print("Dataset downloaded successfully.")
else:
    print("Dataset not found.")

Downloading gesture-recognition.zip to data
100%|█████████████████████████████████████▉| 1.60G/1.60G [09:15<00:00, 5.82MB/s]
100%|██████████████████████████████████████| 1.60G/1.60G [09:15<00:00, 3.09MB/s]
Dataset not found.


In [12]:
# Unpack the downloaded archive into data_dir, then delete the archive file.
archive_path = os.path.join(data_dir, 'gesture-recognition.zip')
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(data_dir)
    print("Dataset extracted successfully.")
os.remove(archive_path)

Dataset extracted successfully.


In [16]:
def get_transform(image_size: tuple) -> transforms.Compose:
    """Build the preprocessing pipeline applied to a single frame.

    Args:
        image_size: Target size handed to ``transforms.Resize``.

    Returns:
        A ``transforms.Compose`` that runs ``ToPILImage``,
        ``Resize(image_size)`` and ``ToTensor`` in that order.
    """
    steps = [
        transforms.ToPILImage(),
        transforms.Resize(image_size),
        transforms.ToTensor(),
    ]
    return transforms.Compose(steps)

In [64]:
class GestureDataset(Dataset):
    """Dataset of gesture clips stored as one folder of frame images per clip.

    Every (non-hidden) sub-directory of ``root_dir`` is one sample. Its class
    is derived from the folder name, which must contain one of the keys of
    ``class_to_label`` (compared case-insensitively, underscores treated as
    spaces).

    Args:
        root_dir: Directory holding one sub-directory per clip.
        transform: Optional callable applied to every frame, which is passed
            in as an RGB ``numpy`` array. The frames are combined with
            ``torch.stack``, so the callable has to return tensors of
            identical shape.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Keep real clip directories only. Stray files or hidden entries next
        # to the clips (e.g. a macOS ``.DS_Store``) are not samples and would
        # otherwise make ``__getitem__`` fail.
        self.gesture_folders = sorted(
            entry for entry in os.listdir(self.root_dir)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(self.root_dir, entry)))
        self.class_to_label = {
            'Thumbs_Down': 0, 'Right_Swipe': 1, 'Thumbs_Up': 2, 'Left_Swipe': 3, 'Stop': 4}

    def __len__(self):
        """Return the number of clip folders."""
        return len(self.gesture_folders)

    def _label_for(self, folder_name):
        """Map a clip folder name to its integer label.

        Raises:
            ValueError: If no key of ``class_to_label`` occurs in the name.
        """
        # Normalize both sides the same way: underscores -> spaces, lower case.
        normalized_folder_name = folder_name.replace('_', ' ').lower()
        for gesture_type, label in self.class_to_label.items():
            if gesture_type.replace('_', ' ').lower() in normalized_folder_name:
                return label
        raise ValueError(
            f"Unknown gesture type in folder name: {folder_name}")

    def __getitem__(self, idx):
        """Load one clip.

        Returns:
            ``(frames_tensor, gesture_class)``: the transformed frames stacked
            along a new first dimension in sorted file-name order, and the
            integer label of the clip.

        Raises:
            ValueError: If the folder name matches no known gesture.
        """
        folder_name = self.gesture_folders[idx]
        gesture_path = os.path.join(self.root_dir, folder_name)
        gesture_class = self._label_for(folder_name)

        frames = []
        for img_name in sorted(os.listdir(gesture_path)):
            img_path = os.path.join(gesture_path, img_name)
            # Hidden files and nested directories are not frames.
            if img_name.startswith('.') or not os.path.isfile(img_path):
                continue
            # The context manager closes the file handle as soon as the pixel
            # data has been copied into the array.
            with Image.open(img_path) as img:
                image = np.array(img.convert('RGB'))
            if self.transform:
                image = self.transform(image)
            frames.append(image)

        frames_tensor = torch.stack(frames, dim=0)

        return frames_tensor, gesture_class

In [65]:
BATCH_SIZE = 4
IMAGE_SIZE = (120, 160)
SHUFFLE = True

# One preprocessing pipeline shared by both splits
transform = get_transform(IMAGE_SIZE)

# Both loaders are built with the same batching options
loader_options = {'batch_size': BATCH_SIZE, 'shuffle': SHUFFLE}

# Training split
train_dataset = GestureDataset(root_dir=train_dir, transform=transform)
train_loader = DataLoader(train_dataset, **loader_options)

# Validation split
val_dataset = GestureDataset(root_dir=val_dir, transform=transform)
val_loader = DataLoader(val_dataset, **loader_options)


Model architecture

In [92]:
class Deep3DCNN(nn.Module):
    """Three Conv3d stages followed by a three-layer classifier head.

    Inputs are 5-D tensors ``(batch, in_channels, depth, height, width)``.
    Every convolution and pooling kernel has extent 1 along ``depth``, so only
    ``height`` and ``width`` shrink: each of the three poolings halves them,
    rounding down.

    NOTE(review): the dataset in this notebook stacks the frames along the
    first axis, so with the defaults the 30 frames act as the channel axis and
    the RGB planes as ``depth`` - confirm that this layout is intended.

    Args:
        num_classes: Number of output logits.
        in_channels: Channel count of the input (default 30).
        input_size: ``(depth, height, width)`` of the input, used to size the
            first fully connected layer. The default ``(3, 120, 160)`` yields
            256 * 3 * 15 * 20 = 230400 features, the value that used to be
            hard-coded.

    Raises:
        ValueError: If ``input_size`` is too small to survive three poolings.
    """

    def __init__(self, num_classes=5, in_channels=30, input_size=(3, 120, 160)):
        super().__init__()

        depth, height, width = input_size
        # Three (1, 2, 2) max-poolings: depth untouched, H and W floor-halved
        # three times, which equals one floor division by 8.
        pooled_height, pooled_width = height // 8, width // 8
        if depth < 1 or pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                f"input_size {tuple(input_size)} is too small for three 2x2 poolings")

        # Layer order is kept exactly as before so existing checkpoints load.
        self.conv_layer = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=64,
                      kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            nn.BatchNorm3d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)),

            nn.Conv3d(in_channels=64, out_channels=128,
                      kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            nn.BatchNorm3d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool3d((1, 2, 2), stride=(1, 2, 2)),
            nn.Dropout3d(0.2),

            nn.Conv3d(in_channels=128, out_channels=256,
                      kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d((1, 2, 2), stride=(1, 2, 2)),
            nn.Dropout3d(0.2),
        )

        # 256 is the channel count produced by the last convolution.
        output_size_after_conv = 256 * depth * pooled_height * pooled_width

        self.fc_layer = nn.Sequential(
            nn.Linear(output_size_after_conv, 2048),
            nn.ReLU(inplace=True),
            nn.Dropout(0.4),
            nn.Linear(2048, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, num_classes))

    def forward(self, x):
        """Return the class logits, shape ``(batch, num_classes)``."""
        x = self.conv_layer(x)
        x = torch.flatten(x, 1)  # keep the batch axis, merge everything else
        x = self.fc_layer(x)
        return x

In [93]:
# Initialize the model, optimizer, and loss function
num_classes = 5  # Number of gestures to recognize
# Pass the variable itself so the class count is defined in exactly one place.
model = Deep3DCNN(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Names used by the training cell
train_dataloader = train_loader
val_dataloader = val_loader

Training pipeline

In [94]:
# Setting hyperparameters
num_epochs = 50
log_interval = 10  # Log training info every 10 batches
val_interval = 1  # Validate the model every 1 epoch

# For storing training history
train_losses = []  # mean training loss of every block of log_interval batches
val_losses = []    # mean validation loss of every validated epoch

# Pick the device once; the model and every batch are moved to it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Training Loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for i, (videos, labels) in enumerate(train_dataloader):
        # Tensors track gradients themselves; the legacy Variable wrapper is
        # not needed.
        videos = videos.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward
        outputs = model(videos)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Report the mean loss over the last log_interval batches
        if i % log_interval == (log_interval - 1):
            print(
                f"[Epoch {epoch + 1}, Iter {i + 1}] loss: {running_loss / log_interval:.3f}")
            train_losses.append(running_loss / log_interval)
            running_loss = 0.0

    # Validation loop
    if epoch % val_interval == 0:
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for videos, labels in val_dataloader:
                videos = videos.to(device)
                labels = labels.to(device)

                outputs = model(videos)
                loss = criterion(outputs, labels)

                val_loss += loss.item()

                # Index of the largest logit is the predicted class
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # An empty validation loader yields NaN instead of a ZeroDivisionError
        num_val_batches = len(val_dataloader)
        val_loss = val_loss / num_val_batches if num_val_batches else float('nan')
        val_losses.append(val_loss)
        accuracy = 100 * correct / total if total else float('nan')

        print(
            f"[Validation] Epoch {epoch + 1} Loss: {val_loss:.3f} | Accuracy: {accuracy}%")

        # Save the model checkpoint
        torch.save(model.state_dict(), f'gesture_recog_epoch_{epoch+1}.pth')

torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
[Epoch 1, Iter 10] loss: 117.602
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
[Epoch 1, Iter 20] loss: 52.722
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.Size([4, 256, 3, 15, 20])
torch.S

KeyboardInterrupt: 

Test pipeline

In [None]:
# Preprocessing for webcam frames, which arrive as PIL images:
# resize to 120x160, then convert to a tensor.
webcam_steps = [
    transforms.Resize((120, 160)),
    transforms.ToTensor(),
]
transform = transforms.Compose(webcam_steps)

In [None]:
# Initialize the camera
camera = cv2.VideoCapture(0)

# Initialize the model
model = Deep3DCNN(num_classes=5)

# Restore trained weights. A freshly constructed model only has random
# parameters, so the newest checkpoint written by the training cell
# (gesture_recog_epoch_<N>.pth in the working directory) is loaded if present.
checkpoint_prefix, checkpoint_suffix = 'gesture_recog_epoch_', '.pth'
checkpoints = []
for file_name in os.listdir('.'):
    if file_name.startswith(checkpoint_prefix) and file_name.endswith(checkpoint_suffix):
        epoch_text = file_name[len(checkpoint_prefix):-len(checkpoint_suffix)]
        if epoch_text.isdigit():
            checkpoints.append((int(epoch_text), file_name))

if checkpoints:
    _, checkpoint_path = max(checkpoints)
    # map_location keeps this working on machines without the GPU used for training
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    print(f"Loaded weights from {checkpoint_path}")
else:
    print("WARNING: no gesture_recog_epoch_*.pth checkpoint found; "
          "predictions will come from randomly initialised weights.")

# Evaluation mode: disables dropout and uses the stored batch-norm statistics
model.eval()

frame_count = 0
recent_frames = []

In [None]:
# Read webcam frames until 'q' is pressed or a frame cannot be grabbed. Every
# 30 collected frames are classified as one clip and the predicted class index
# is printed.
try:
    while True:
        # Capture frame-by-frame
        ret, frame = camera.read()

        if not ret:
            print("Failed to grab frame")
            break

        # Convert the captured frame to RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Convert the numpy array to PIL Image
        rgb_frame = Image.fromarray(rgb_frame)

        # Apply transformations
        transformed_frame = transform(rgb_frame)

        # Increment frame counter and append the frame to recent_frames
        frame_count += 1
        recent_frames.append(transformed_frame)

        # Once 30 frames are accumulated, make the prediction
        if frame_count == 30:
            # Convert the list of 30 frames to a tensor
            test_sample = torch.stack(recent_frames, dim=0).unsqueeze(
                0)  # Shape: [1, 30, 3, 120, 160]
            # Make a prediction
            with torch.no_grad():
                output = model(test_sample)
                _, prediction = torch.max(output, 1)

            print("Predicted Gesture:", prediction.item())

            # Start collecting the next clip from scratch
            frame_count = 0
            recent_frames = []

        # Display the frame
        cv2.imshow("Gesture Recognition", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Free the webcam and close the preview window even when the loop is left
    # through an exception or a kernel interrupt.
    camera.release()
    cv2.destroyAllWindows()

In [None]:
# Clean up: release the camera handle and close every OpenCV window
camera.release()
cv2.destroyAllWindows()