#### Import libraries

In [None]:
import os
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from torch.autograd import Variable
import torch.optim as optim
from PIL import Image
import pandas as pd
import cv2
import zipfile
print(torch.__version__)

#### Setup the environment

In [None]:
# Make the parent of the launch directory the working directory and remember
# it in `current_dir`.
# NOTE: every re-run of this cell moves one more level up the tree.
os.chdir(os.path.dirname(os.getcwd()))
current_dir = os.getcwd()
print('Current directory: ', current_dir)

In [None]:
# Dataset locations: <current_dir>/data/ and its train/, val/ and test/ splits.
data_dir = os.path.join(current_dir, 'data/')
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split) for split in ('train/', 'val/', 'test/'))
print('Data directory: ', data_dir)

#### Load the data

In [None]:
# Install the `kaggle` package with pip (IPython shell escape).
! pip install kaggle

In [None]:
import json

# Read the Kaggle credentials from kaggle.json (resolved against the current
# working directory) and publish them through the environment variables
# KAGGLE_USERNAME and KAGGLE_KEY.
with open("kaggle.json", "r") as token_file:
    api_token = json.load(token_file)
for env_name, token_field in (('KAGGLE_USERNAME', 'username'),
                              ('KAGGLE_KEY', 'key')):
    os.environ[env_name] = api_token[token_field]
print('Kaggle API token loaded.')

In [None]:
# Download the imsparsh/gesture-recognition dataset into ./data (relative to
# the current working directory) using the Kaggle CLI.
! kaggle datasets download -d imsparsh/gesture-recognition -p data

# Check for the presence of the dataset file.
# The directory *listing* has to be scanned: iterating over `data_dir` itself
# walks the characters of the path string, so no element could ever end with
# '.zip'. A missing directory is reported as "not found" instead of raising.
if os.path.isdir(data_dir) and any(
        entry.endswith('.zip') for entry in os.listdir(data_dir)):
    print("Dataset downloaded successfully.")
else:
    print("Dataset not found.")

In [None]:
# Unpack the downloaded archive into the data directory, then delete the
# archive itself.
archive_path = os.path.join(data_dir, 'gesture-recognition.zip')
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall(data_dir)
    print("Dataset extracted successfully.")
os.remove(archive_path)

### 1. Data preparation

In [38]:
def get_transform(image_size: tuple) -> transforms.Compose:
    """Build the preprocessing pipeline applied to each frame.

    Args:
        image_size: Target size handed to ``transforms.Resize``.

    Returns:
        A ``transforms.Compose`` that converts its input to a PIL image,
        resizes it to ``image_size`` and converts the result to a tensor.
    """
    steps = [
        transforms.ToPILImage(),
        transforms.Resize(image_size),
        transforms.ToTensor(),
    ]
    return transforms.Compose(steps)

In [39]:
class GestureDataset(Dataset):
    """Dataset of gesture clips stored as one folder of frame images per clip.

    Every sub-directory of ``root_dir`` is one sample. Its class is the first
    key of ``class_to_label`` whose normalised name (underscores replaced by
    spaces, lower-cased) occurs in the identically normalised folder name.

    Args:
        root_dir: Directory that contains one sub-directory per clip.
        transform: Optional callable applied to every frame (an RGB numpy
            array) before the frames are stacked. Without it, frames are
            converted to tensors unchanged.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Only directories can hold a clip. Stray files in root_dir (archives,
        # CSV indexes, OS metadata) would otherwise be counted as samples and
        # make __getitem__ fail when it tries to list them.
        self.gesture_folders = sorted(
            entry for entry in os.listdir(self.root_dir)
            if os.path.isdir(os.path.join(self.root_dir, entry)))
        self.class_to_label = {
            'Thumbs_Down': 0,
            'Right_Swipe': 1,
            'Thumbs_Up': 2,
            'Left_Swipe': 3,
            'Stop': 4}
        self.validate_folders_and_labels()

    @staticmethod
    def _normalize(name):
        """Return ``name`` with underscores as spaces, lower-cased."""
        return name.replace('_', ' ').lower()

    def _matching_gesture_types(self, folder):
        """Return every label key whose normalised name occurs in ``folder``."""
        normalized_folder_name = self._normalize(folder)
        return [gesture_type for gesture_type in self.class_to_label
                if self._normalize(gesture_type) in normalized_folder_name]

    def validate_folders_and_labels(self):
        """Print a warning for folders without a label and labels without a folder."""
        unmatched_folders = []
        unused_labels = set(self.class_to_label)
        for folder in self.gesture_folders:
            matches = self._matching_gesture_types(folder)
            if not matches:
                unmatched_folders.append(folder)
            unused_labels.difference_update(matches)

        # Warn only; an unmatched folder raises later, when it is indexed.
        if unmatched_folders:
            warning_msg = f"Warning: Unmatched folders found that don't correspond to any label: {unmatched_folders}"
            print(warning_msg)

        if unused_labels:
            # Sorted so the message is deterministic across runs.
            warning_msg = f"Warning: Unused labels found that don't correspond to any folder: {sorted(unused_labels)}"
            print(warning_msg)

    def __len__(self):
        """Return the number of clip folders found in ``root_dir``."""
        return len(self.gesture_folders)

    def __getitem__(self, idx):
        """Load clip ``idx``.

        Returns:
            ``(frames_tensor, gesture_class)``: the clip's frames, read in
            file-name order and stacked along a new first dimension, and the
            integer class label.

        Raises:
            ValueError: If the folder name matches no known gesture type or
                the folder contains no frames.
        """
        folder = self.gesture_folders[idx]
        matches = self._matching_gesture_types(folder)
        if not matches:
            raise ValueError(
                f"Unknown gesture type in folder name: {folder}. "
                f"Recognized types are: {', '.join(self.class_to_label)}")
        # First match in dictionary order wins.
        gesture_class = self.class_to_label[matches[0]]

        gesture_path = os.path.join(self.root_dir, folder)
        frames = []
        for img_name in sorted(os.listdir(gesture_path)):
            img_path = os.path.join(gesture_path, img_name)
            with Image.open(img_path) as img:
                image = np.array(img.convert('RGB'))
            if self.transform:
                image = self.transform(image)
            else:
                # torch.stack only accepts tensors, so the default
                # transform=None must still yield one.
                image = torch.from_numpy(image)
            frames.append(image)

        if not frames:
            raise ValueError(f"No frames found in folder: {gesture_path}")

        frames_tensor = torch.stack(frames, dim=0)
        return frames_tensor, gesture_class

In [40]:
# Loader configuration
BATCH_SIZE = 4
IMAGE_SIZE = (120, 160)
SHUFFLE = True

# Frame preprocessing shared by both splits
transform = get_transform(IMAGE_SIZE)

# Both splits are wrapped in a DataLoader with identical settings
loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=SHUFFLE)

# Training split
train_dataset = GestureDataset(root_dir=train_dir, transform=transform)
train_loader = DataLoader(train_dataset, **loader_kwargs)

# Validation split
val_dataset = GestureDataset(root_dir=val_dir, transform=transform)
val_loader = DataLoader(val_dataset, **loader_kwargs)


### 2. Model architecture

In [41]:
import torch.nn.init as init

class SEBlock(nn.Module):
    """Squeeze-and-Excitation block that recalibrates the channels of a 5-D feature map.

    Each channel is averaged over its three trailing dimensions, the
    per-channel means go through a two-layer bottleneck ending in a sigmoid,
    and the input is multiplied channel-wise by the resulting gates.

    This could be quite useful for recognizing gestures where some features
    (like the shape and movement of the hand) are more critical than others
    (like the background).

    Args:
        in_channels: Number of channels (dimension 1) of the input.
        reduction: Bottleneck ratio; the hidden layer has
            ``in_channels // reduction`` units, but never fewer than one.
    """

    def __init__(self, in_channels, reduction=16):
        super().__init__()
        # in_channels // reduction is 0 when reduction > in_channels; a
        # zero-width bottleneck would make every gate a constant 0.5.
        hidden_channels = max(1, in_channels // reduction)
        self.avg_pool = nn.AdaptiveAvgPool3d(1)
        self.fc = nn.Sequential(
            nn.Linear(in_channels, hidden_channels, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, in_channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Scale ``x`` (a 5-D tensor, channels in dimension 1) by its channel gates."""
        b, c, _, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1, 1)
        return x * y.expand_as(x)

# Define the model
class Deep3DCNN(nn.Module):
    """3-D convolutional classifier built from three Conv3d/SE stages.

    Each stage is Conv3d (kernel ``(1, 3, 3)``) -> BatchNorm3d -> ReLU ->
    SEBlock -> MaxPool3d (``(1, 2, 2)``); the second and third stages end with
    Dropout3d. The features are then averaged down to one value per channel
    and classified by a three-layer fully connected head.

    The input is a 5-D tensor whose dimension 1 has ``in_channels`` entries.
    With the dataset in this notebook that dimension appears to hold the
    frames of a clip (30 per clip) -- NOTE(review): confirm against the data.

    Args:
        num_classes: Number of output logits.
        reduction: Bottleneck ratio passed to every ``SEBlock``.
        dropout_rate: Probability used by the ``Dropout3d`` layers.
        in_channels: Size of dimension 1 of the input. Defaults to 30, the
            value that was previously hard-coded.
    """

    def __init__(self, num_classes, reduction=16, dropout_rate=0.2, in_channels=30):
        super().__init__()

        def conv_stage(c_in, c_out, with_dropout):
            """Return the layers of one convolutional stage, in order."""
            layers = [
                nn.Conv3d(in_channels=c_in, out_channels=c_out,
                          kernel_size=(1, 3, 3), padding=(0, 1, 1), groups=1),
                nn.BatchNorm3d(c_out),
                nn.ReLU(inplace=True),
                SEBlock(c_out, reduction=reduction),
                nn.MaxPool3d((1, 2, 2), stride=(1, 2, 2)),
            ]
            if with_dropout:
                layers.append(nn.Dropout3d(dropout_rate))
            return layers

        # Convolutional layers. The stages are unpacked into one flat
        # Sequential so the layer indices (and therefore the state_dict keys
        # of existing checkpoints) stay the same.
        self.conv_layer = nn.Sequential(
            *conv_stage(in_channels, 64, with_dropout=False),
            *conv_stage(64, 128, with_dropout=True),
            *conv_stage(128, 256, with_dropout=True),
        )
        # Adaptive pool to make output size (batch_size, channels, 1, 1, 1)
        self.adaptive_pool = nn.AdaptiveAvgPool3d((1, 1, 1))

        # Fully connected layer
        self.fc_layer = nn.Sequential(
            nn.Linear(256, 2048),
            nn.ReLU(inplace=True),
            nn.Dropout(0.4),
            nn.Linear(2048, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, num_classes)
        )

        self._initialize_weights()

    # Initialize weights using Xavier initialization to improve convergence
    def _initialize_weights(self):
        """Apply Xavier-normal initialisation to every Conv3d and Linear weight."""
        for m in self.modules():
            if isinstance(m, (nn.Conv3d, nn.Linear)):
                init.xavier_normal_(m.weight)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        x = self.conv_layer(x)
        x = self.adaptive_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc_layer(x)
        return x

### 3. Model training

In [50]:
import optuna
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import f1_score, precision_score, recall_score
from torch.optim import lr_scheduler

# Function to save the checkpoint of the model
def save_checkpoint(model, optimizer, epoch, filepath='best_gesture_recog.pth'):
    """Write the epoch number and the model/optimizer state dicts to ``filepath``.

    Args:
        model: Module whose ``state_dict()`` is stored under ``'model_state_dict'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under
            ``'optimizer_state_dict'``.
        epoch: Value stored under ``'epoch'``.
        filepath: Destination passed to ``torch.save``.
    """
    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(state, filepath)

# Function to load a checkpoint and restore model and optimizer states
def load_checkpoint(model, optimizer, filepath='best_gesture_recog.pth'):
    """Restore ``model`` and ``optimizer`` from a checkpoint file.

    The file must hold a dict with the keys ``'epoch'``,
    ``'model_state_dict'`` and ``'optimizer_state_dict'``.

    Args:
        model: Module that receives the stored model state.
        optimizer: Optimizer that receives the stored optimizer state.
        filepath: Checkpoint file to read.

    Returns:
        The epoch number stored in the checkpoint.
    """
    # Map the stored tensors onto the device the model currently lives on, so
    # a checkpoint written on a GPU can also be restored on a CPU-only machine.
    first_param = next(model.parameters(), None)
    target_device = (first_param.device if first_param is not None
                     else torch.device('cpu'))
    checkpoint = torch.load(filepath, map_location=target_device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['epoch']


def main_training_loop(epoch, model, train_dataloader, criterion, optimizer, writer, use_cuda, scaler):
    """Train ``model`` for one epoch and log the epoch's metrics.

    Args:
        epoch: Zero-based epoch index, used for the progress bar and as the
            step of the logged scalars.
        model: Module to train; switched to training mode here.
        train_dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        use_cuda: If true, every batch is moved to the GPU with ``.cuda()``.
        scaler: Gradient scaler for mixed-precision training, or ``None`` to
            train in full precision.

    Returns:
        ``(epoch_loss, train_accuracy)``: mean per-sample loss and accuracy in
        percent over the samples seen in this epoch.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.train()
    # Mixed precision is only meaningful together with a gradient scaler;
    # without one, autocast stays disabled instead of warning on CPU runs.
    use_amp = scaler is not None

    # Initialize metrics
    epoch_loss = 0.0
    correct_train = 0
    total_train = 0
    y_true_train = []
    y_pred_train = []
    # Iterate through each batch from the training data
    for inputs, labels in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        if use_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass (under Automatic Mixed Precision when enabled)
        with autocast(enabled=use_amp):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # Backward pass and optimization
        if use_amp:
            scaler.scale(loss).backward()
            # Unscale first so the clipping threshold applies to true gradients
            scaler.unscale_(optimizer)
            # Clip gradients to avoid "exploding gradient" problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # Accumulate metrics for this epoch
        predicted = outputs.detach().argmax(dim=1)
        batch_count = labels.size(0)
        total_train += batch_count
        correct_train += (predicted == labels).sum().item()
        epoch_loss += loss.item() * batch_count

        # Save the true and predicted labels for further metrics calculation
        y_true_train.extend(labels.tolist())
        y_pred_train.extend(predicted.tolist())

    if total_train == 0:
        raise ValueError("train_dataloader yielded no samples")

    # Normalise by the samples actually seen; len(dataset) would be wrong for
    # loaders that drop the last batch or sample only a subset.
    epoch_loss /= total_train
    train_accuracy = 100 * correct_train / total_train
    # zero_division=0 scores classes without predictions/samples as 0 (the
    # value sklearn falls back to anyway) without emitting a warning.
    f1 = f1_score(y_true_train, y_pred_train, average='weighted', zero_division=0)
    precision = precision_score(y_true_train, y_pred_train, average='weighted', zero_division=0)
    recall = recall_score(y_true_train, y_pred_train, average='weighted', zero_division=0)

    # Log metrics to TensorBoard
    writer.add_scalar('Loss/train', epoch_loss, epoch)
    writer.add_scalar('Accuracy/train', train_accuracy, epoch)
    writer.add_scalar('F1/train', f1, epoch)
    writer.add_scalar('Precision/train', precision, epoch)
    writer.add_scalar('Recall/train', recall, epoch)

    print('Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, train_accuracy))
    print(
        f'Train F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')

    return epoch_loss, train_accuracy


def validation_loop(epoch, model, val_dataloader, criterion, writer, use_cuda):
    """Evaluate ``model`` on the validation data and log the metrics.

    Args:
        epoch: Zero-based epoch index, used for the progress bar and as the
            step of the logged scalars.
        model: Module to evaluate; switched to evaluation mode here.
        val_dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        use_cuda: If true, every batch is moved to the GPU with ``.cuda()``.

    Returns:
        ``(val_loss, val_accuracy)``: mean per-sample loss and accuracy in
        percent over the samples seen.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    # Initialize metrics
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    y_true_val = []
    y_pred_val = []

    # Gradients are not needed for evaluation, so torch.no_grad() is used to
    # save memory.
    with torch.no_grad():
        for inputs, labels in tqdm(val_dataloader, desc=f"Validation Epoch {epoch + 1}"):
            if use_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Accumulate metrics for this epoch
            predicted = outputs.argmax(dim=1)
            batch_count = labels.size(0)
            val_loss += loss.item() * batch_count
            total_val += batch_count
            correct_val += (predicted == labels).sum().item()

            # Save the true and predicted labels for further metrics calculation
            y_true_val.extend(labels.tolist())
            y_pred_val.extend(predicted.tolist())

    if total_val == 0:
        raise ValueError("val_dataloader yielded no samples")

    # Normalise by the samples actually seen; len(dataset) would be wrong for
    # loaders that drop the last batch or sample only a subset.
    val_loss /= total_val
    val_accuracy = 100 * correct_val / total_val
    # One zero-division convention for all three metrics: a class that is
    # never predicted counts as 0. Precision previously used 1 here, which
    # overstated it and made it inconsistent with the F1 score logged beside it.
    f1 = f1_score(y_true_val, y_pred_val, average='weighted', zero_division=0)
    precision = precision_score(y_true_val, y_pred_val, average='weighted', zero_division=0)
    recall = recall_score(y_true_val, y_pred_val, average='weighted', zero_division=0)

    # Log metrics to TensorBoard
    writer.add_scalar('Loss/val', val_loss, epoch)
    writer.add_scalar('Accuracy/val', val_accuracy, epoch)
    writer.add_scalar('F1/val', f1, epoch)
    writer.add_scalar('Precision/val', precision, epoch)
    writer.add_scalar('Recall/val', recall, epoch)
    print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_accuracy))
    print(
        f'Val F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')

    return val_loss, val_accuracy


def objective(trial, train_dataloader, val_dataloader):
    """Optuna objective: train one sampled configuration and score it.

    Samples the learning rate, weight decay, dropout rate and SE reduction
    from ``trial``, trains a ``Deep3DCNN`` with early stopping and returns the
    lowest validation loss reached.

    The default checkpoint file is only overwritten when this trial beats the
    best value of the trials already completed in the study, so after the
    study the file belongs to the best trial rather than to the last one.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        train_dataloader: Loader handed to ``main_training_loop``.
        val_dataloader: Loader handed to ``validation_loop``.

    Returns:
        The best (lowest) validation loss observed during this trial.
    """
    config = {
        'num_classes': 5,
        'num_epochs': 50,
        'early_stopping_patience': 5,
        # Must be smaller than early_stopping_patience, otherwise training
        # stops before the scheduler ever lowers the learning rate.
        'lr_patience': 2,
        'lr': trial.suggest_float('lr', 1e-5, 1e-3, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 1e-10, 1e-3, log=True),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5),
        'reduction': trial.suggest_int('reduction', 8, 32, step=4)
    }

    # Check CUDA availability and initialize GradScaler
    use_cuda = torch.cuda.is_available()
    scaler = GradScaler() if use_cuda else None

    model = Deep3DCNN(num_classes=config['num_classes'],
                      reduction=config['reduction'], dropout_rate=config['dropout_rate'])
    if use_cuda:
        model = model.cuda()

    optimizer = torch.optim.Adam(
        model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    criterion = nn.CrossEntropyLoss()
    # `verbose` is deprecated in recent PyTorch releases and therefore omitted.
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=config['lr_patience'])

    # Best value among the trials that already finished; Optuna raises
    # ValueError while no trial has completed yet.
    study = getattr(trial, 'study', None)
    try:
        study_best_loss = study.best_value if study is not None else float('inf')
    except ValueError:
        study_best_loss = float('inf')

    # Early Stopping
    best_val_loss = float('inf')
    patience_counter = 0

    # TensorBoard Writer (closed in `finally` so every trial releases its
    # event file, also when training raises)
    writer = SummaryWriter()
    try:
        # Training and Validation Loop
        for epoch in range(config['num_epochs']):
            main_training_loop(
                epoch, model, train_dataloader, criterion, optimizer, writer, use_cuda, scaler)
            val_loss, _ = validation_loop(
                epoch, model, val_dataloader, criterion, writer, use_cuda)

            scheduler.step(val_loss)

            # Early stopping and model saving logic
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                writer.add_scalar('Best/val_loss', best_val_loss, best_epoch)
                print(
                    f"New best validation loss: {best_val_loss} at epoch {best_epoch}")
                if best_val_loss < study_best_loss:
                    save_checkpoint(model, optimizer, epoch)
                patience_counter = 0
            else:
                patience_counter += 1
            if patience_counter >= config['early_stopping_patience']:
                print(
                    f"Early stopping after {config['early_stopping_patience']} epochs with no improvement.")
                # The model is local to this function and discarded on return,
                # so the best weights are not reloaded here. The shared
                # checkpoint file may also belong to a trial with a different
                # architecture.
                break
    finally:
        writer.close()

    # Report the best epoch rather than the last one, so the study value
    # matches the weights that were checkpointed.
    return best_val_loss


if __name__ == '__main__':
    # Search the hyperparameter space with Optuna; a lower objective value
    # is better.
    study = optuna.create_study(direction='minimize')
    study.optimize(
        lambda trial: objective(trial, train_loader, val_loader),
        n_trials=50,  # Number of trials
    )

    # Summarise the study
    print('Number of finished trials: ', len(study.trials))
    print('Best trial:')
    trial = study.best_trial

    print('Value: ', trial.value)
    print('Params: ')
    best_params = trial.params
    for name in best_params:
        print(f'    {name}: {best_params[name]}')


[I 2023-09-20 17:52:10,883] A new study created in memory with name: no-name-548b9b6a-0cb3-424e-8f1e-70b79f785b7c
Training Epoch 1:  31%|███       | 51/166 [02:13<05:04,  2.65s/it]

### 4. Model evaluation

In [4]:
# Evaluation-time preprocessing: resize to 120x160, then convert to a tensor.
eval_steps = [
    transforms.Resize((120, 160)),
    transforms.ToTensor(),
]
transform = transforms.Compose(eval_steps)

In [None]:
from collections import deque

# Initialize the camera
camera = cv2.VideoCapture(0)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model and set it to evaluation mode.
# NOTE(review): the checkpoint does not record the `reduction` the model was
# trained with; if it differs from the default used here, load_state_dict
# fails with a size mismatch and the value must be passed explicitly.
model = Deep3DCNN(num_classes=5)
checkpoint = torch.load('best_gesture_recog.pth', map_location=device)
# save_checkpoint() stores the weights under 'model_state_dict' next to the
# epoch and optimizer state; a file holding a bare state_dict is accepted too.
state_dict = checkpoint['model_state_dict'] if 'model_state_dict' in checkpoint else checkpoint
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# Initialize deque for storing recent frames
recent_frames = deque(maxlen=30)

In [None]:
# Inputs must live on the same device as the model's weights; follow the
# model instead of assuming that CUDA availability implies a CUDA model.
model_device = next(model.parameters()).device

while True:
    ret, frame = camera.read()
    if not ret:
        print("Failed to grab frame")
        break

    # OpenCV delivers BGR frames; convert to an RGB PIL image for `transform`
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    rgb_frame = Image.fromarray(rgb_frame)
    transformed_frame = transform(rgb_frame)

    # Add the frame to deque (the oldest frame drops out once it is full)
    recent_frames.append(transformed_frame)

    if len(recent_frames) == recent_frames.maxlen:
        # Make a prediction on the full window, with a leading batch dimension
        test_sample = torch.stack(list(recent_frames), dim=0).unsqueeze(0)
        test_sample = test_sample.to(model_device)

        with torch.no_grad():
            output = model(test_sample)
        prediction = output.argmax(dim=1)

        print(f"Predicted Gesture: {prediction.item()}")

        # Display prediction on the frame
        cv2.putText(frame, f"Gesture: {prediction.item()}", (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

    # Display the resulting frame
    cv2.imshow("Gesture Recognition", frame)

    # Stop when 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

#### Clean up

In [None]:
# Clean up: release the capture device and close all OpenCV windows.
camera.release()
cv2.destroyAllWindows()