#### Import libraries

In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from torch.autograd import Variable
import torch.optim as optim
from PIL import Image
import pandas as pd
import cv2
import zipfile
print(torch.__version__)

  from .autonotebook import tqdm as notebook_tqdm


1.13.1


#### Setup the environment

In [2]:
current_dir = os.getcwd()
os.chdir(os.path.dirname(current_dir))
current_dir = os.getcwd()
print('Current directory: ', current_dir)

Current directory:  /Users/oksanaerm/gesture_recognition


In [3]:
data_dir = os.path.join(current_dir, 'data/')
train_dir = os.path.join(data_dir, 'train/')
val_dir = os.path.join(data_dir, 'val/')
test_dir = os.path.join(data_dir, 'test/')
print('Data directory: ', data_dir)

Data directory:  /Users/oksanaerm/gesture_recognition/data/


#### Load the data

In [None]:
! pip install kaggle

In [None]:
with open("kaggle.json", "r") as f:
    import json
    api_token = json.load(f)
os.environ['KAGGLE_USERNAME'] = api_token["username"]
os.environ['KAGGLE_KEY'] = api_token["key"]
print('Kaggle API token loaded.')

In [None]:
! kaggle datasets download -d imsparsh/gesture-recognition -p data

# Check for the presence of the dataset file
if any(file.endswith('.zip') for file in data_dir):
    print("Dataset downloaded successfully.")
else:
    print("Dataset not found.")

In [None]:
with zipfile.ZipFile(os.path.join(data_dir, 'gesture-recognition.zip'), 'r') as zip_ref:
    zip_ref.extractall(data_dir)
    print("Dataset extracted successfully.")
os.remove(os.path.join(data_dir, 'gesture-recognition.zip'))

### 1. Data preparation

In [4]:
def get_transform(image_size: tuple, to_pil: bool = True) -> transforms.Compose:
    transform_list = []
    if to_pil:
        transform_list.append(transforms.ToPILImage())
    transform_list.extend([
        transforms.Resize(image_size),
        transforms.ToTensor()
    ])
    return transforms.Compose(transform_list)

In [5]:
class GestureDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.gesture_folders = sorted(
            [folder for folder in os.listdir(self.root_dir)])
        self.class_to_label = {
            'Thumbs_Down': 0, 
            'Right_Swipe': 1, 
            'Thumbs_Up': 2, 
            'Left_Swipe': 3, 
            'Stop': 4}
        self.validate_folders_and_labels()
    
    def validate_folders_and_labels(self):
        # Loop through each folder in the root directory and check if it matches any label
        unmatched_folders = []
        for folder in self.gesture_folders:
            normalized_folder_name = folder.replace('_', ' ').lower()
            match_found = False
            for gesture_type in self.class_to_label.keys():
                normalized_gesture_type = gesture_type.replace('_', ' ').lower()
                if normalized_gesture_type in normalized_folder_name:
                    match_found = True
                    break
            if not match_found:
                unmatched_folders.append(folder)

        # If there are unmatched folders, raise a warning or error
        if unmatched_folders:
            warning_msg = f"Warning: Unmatched folders found that don't correspond to any label: {unmatched_folders}"
            print(warning_msg)

        # Check if there are unused labels in the dictionary
        unused_labels = set(self.class_to_label.keys())
        for folder in self.gesture_folders:
            normalized_folder_name = folder.replace('_', ' ').lower()
            for gesture_type in self.class_to_label.keys():
                normalized_gesture_type = gesture_type.replace('_', ' ').lower()
                if normalized_gesture_type in normalized_folder_name:
                    unused_labels.discard(gesture_type)

        if unused_labels:
            warning_msg = f"Warning: Unused labels found that don't correspond to any folder: {list(unused_labels)}"
            print(warning_msg)

    def __len__(self):
        return len(self.gesture_folders)

    def __getitem__(self, idx):
        gesture_path = os.path.join(self.root_dir, self.gesture_folders[idx])
        # Normalize folder name by replacing underscores and converting to lower case
        normalized_folder_name = self.gesture_folders[idx].replace(
            '_', ' ').lower()
        # Try to match normalized folder name with normalized keys from the dictionary
        for gesture_type in self.class_to_label.keys():
            normalized_gesture_type = gesture_type.replace('_', ' ').lower()
            if normalized_gesture_type in normalized_folder_name:
                gesture_class_str = gesture_type
                break
        else:
            raise ValueError(
                f"Unknown gesture type in folder name: {self.gesture_folders[idx]}"
                f"Recognized types are: {', '.join(self.class_to_label.keys())}")

        # Use the dictionary to get the integer label
        gesture_class = self.class_to_label.get(gesture_class_str, -1)

        frames = []
        for img_name in sorted(os.listdir(gesture_path)):
            img_path = os.path.join(gesture_path, img_name)
            image = Image.open(img_path).convert('RGB') 
            image = np.array(image) 
            if self.transform:
                image = self.transform(image)
            frames.append(image)

        frames_tensor = torch.stack(frames, dim=0)
        return frames_tensor, gesture_class

In [6]:
BATCH_SIZE = 4
IMAGE_SIZE = (120, 160)
SHUFFLE = True

# Initialize transforms
transform_data_prep = get_transform((120, 160), to_pil=True)

# Initialize train dataset and DataLoader
train_dataset = GestureDataset(
    root_dir=train_dir, transform=transform_data_prep)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE)

# Initialize validation dataset and DataLoader
val_dataset = GestureDataset(root_dir=val_dir, transform=transform_data_prep)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE)


### 2. Model architecture

In [7]:
import torch.nn.init as init

# Initialize weights
class SEBlock(nn.Module):  # to recalibrate the feature maps
    def __init__(self, in_channels, reduction=16):
        super(SEBlock, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool3d(1)
        self.fc = nn.Sequential(
            nn.Linear(in_channels, in_channels // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(in_channels // reduction, in_channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1, 1)
        return x * y.expand_as(x)
# This could be quite useful for recognizing gestures where some features 
# (like the shape and movement of the hand) are more critical than others 
# (like the background).

# Define the model
class Deep3DCNN(nn.Module):
    def __init__(self, num_classes, reduction=16, dropout_rate=0.2):
        super(Deep3DCNN, self).__init__()
        
        # Convolutional layers
        self.conv_layer = nn.Sequential(
            nn.Conv3d(in_channels=30, out_channels=64, kernel_size=(1, 3, 3), padding=(0, 1, 1), groups=2),
            nn.BatchNorm3d(64),
            nn.ReLU(inplace=True),
            SEBlock(64, reduction=reduction),
            nn.MaxPool3d((1, 2, 2), stride=(1, 2, 2)),
            
            nn.Conv3d(in_channels=64, out_channels=128, kernel_size=(1, 3, 3), padding=(0, 1, 1), groups=4),
            nn.BatchNorm3d(128),
            nn.ReLU(inplace=True),
            SEBlock(128, reduction=reduction),
            nn.MaxPool3d((1, 2, 2), stride=(1, 2, 2)),
            nn.Dropout3d(dropout_rate),

            nn.Conv3d(in_channels=128, out_channels=256, kernel_size=(1, 3, 3), padding=(0, 1, 1), groups=4),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            SEBlock(256, reduction=reduction),
            nn.MaxPool3d((1, 2, 2), stride=(1, 2, 2)),
            nn.Dropout3d(dropout_rate),
        )
        # Adaptive pool to make output size (batch_size, channels, 1, 1, 1)
        self.adaptive_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
        
        # Fully connected layer
        self.fc_layer = nn.Sequential(
            nn.Linear(256, 2048),
            nn.ReLU(inplace=True),
            nn.Dropout(0.4),
            nn.Linear(2048, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, num_classes)
        )

        self._initialize_weights()

    # Initialize weights using Xavier initialization to improve convergence
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                init.xavier_normal_(m.weight)
            elif isinstance(m, nn.Linear):
                init.xavier_normal_(m.weight)

    def forward(self, x):
        x = self.conv_layer(x)
        x = self.adaptive_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc_layer(x)
        return x

### 3. Model training

In [61]:
import optuna
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import f1_score, precision_score, recall_score
from torch.optim import lr_scheduler

# Function to save the checkpoint of the model
def save_checkpoint(model, optimizer, epoch, filepath='best_gesture_recog.pth'):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filepath)

# Function to load a checkpoint and restore model and optimizer states
def load_checkpoint(model, optimizer, filepath='best_gesture_recog.pth'):
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['epoch']


def main_training_loop(epoch, model, train_dataloader, criterion, optimizer, writer, use_cuda, scaler):
    model.train()
    # Initialize metrics
    epoch_loss = 0.0
    correct_train = 0
    total_train = 0
    y_true_train = []
    y_pred_train = []
    # Iterate through each batch from the training data
    for i, (inputs, labels) in enumerate(tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")):
        if use_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass with Automatic Mixed Precision training
        with autocast(enabled=use_cuda):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # Backward pass and optimization
        if scaler is not None:
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            # Clip gradients to avoid "exploding gradient" problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # Accumulate metrics for this epoch
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        epoch_loss += loss.item() * inputs.size(0)

        # Save the true and predicted labels for further metrics calculation
        y_true_train.extend(labels.cpu().numpy())
        y_pred_train.extend(torch.argmax(outputs, dim=1).cpu().numpy())

    # Calculate and log metrics for this epoch
    epoch_loss /= len(train_dataloader.dataset)
    train_accuracy = 100 * correct_train / total_train 
    f1 = f1_score(y_true_train, y_pred_train, average='weighted')
    precision = precision_score(y_true_train, y_pred_train, average='weighted')
    recall = recall_score(y_true_train, y_pred_train, average='weighted')
    
    # Log metrics to TensorBoard
    writer.add_scalar('Loss/train', epoch_loss, epoch)
    writer.add_scalar('Accuracy/train', train_accuracy, epoch)
    writer.add_scalar('F1/train', f1, epoch)
    writer.add_scalar('Precision/train', precision, epoch)
    writer.add_scalar('Recall/train', recall, epoch)

    print('Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, train_accuracy))
    print(
        f'Train F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')

    return epoch_loss, train_accuracy


def validation_loop(epoch, model, val_dataloader, criterion, writer, use_cuda):
    model.eval()
    # Initialize metrics
    val_loss = 0.0
    correct_val = 0 
    total_val = 0
    y_true_val = []
    y_pred_val = []

    # Iterate through each batch from the validation data
    # We don't need to calculate gradients here, so we use torch.no_grad() 
    # context manager to save memory
    with torch.no_grad():
        for i, (inputs, labels, ) in enumerate(tqdm(val_dataloader, desc=f"Validation Epoch {epoch + 1}")):
            if use_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Accumulate metrics for this epoch
            val_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

            # Save the true and predicted labels for further metrics calculation
            y_true_val.extend(labels.cpu().numpy())
            y_pred_val.extend(predicted.cpu().numpy())

    # Calculate metrics for this epoch
    val_loss /= len(val_dataloader.dataset)
    val_accuracy = 100 * correct_val / total_val
    f1 = f1_score(y_true_val, y_pred_val, average='weighted')
    precision = precision_score(y_true_val, y_pred_val, average='weighted', zero_division=1)
    recall = recall_score(y_true_val, y_pred_val, average='weighted')

    # Log metrics to TensorBoard
    writer.add_scalar('Loss/val', val_loss, epoch)
    writer.add_scalar('Accuracy/val', val_accuracy, epoch)
    writer.add_scalar('F1/val', f1, epoch)
    writer.add_scalar('Precision/val', precision, epoch)
    writer.add_scalar('Recall/val', recall, epoch)
    print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_accuracy))
    print(
        f'Val F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')


    return val_loss, val_accuracy


def objective(trial, train_dataloader, val_dataloader):
    config = {
        'num_classes': 5,
        'num_epochs': 50,
        'log_interval': 10,
        'batch_size': 32,
        'early_stopping_patience': 5,
        'lr': trial.suggest_float('lr', 1e-5, 1e-3, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 1e-10, 1e-3, log=True),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5),
        'reduction': trial.suggest_int('reduction', 8, 32, step=4)
    }

    model = Deep3DCNN(num_classes=config['num_classes'],
                      reduction=config['reduction'], dropout_rate=config['dropout_rate'])
    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = torch.optim.Adam(
        model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    criterion = nn.CrossEntropyLoss()
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=config['early_stopping_patience'], verbose=True)

    # Check CUDA availability and initialize GradScaler
    use_cuda = torch.cuda.is_available()
    scaler = GradScaler() if use_cuda else None

    # Inform the user only once about CUDA status
    if use_cuda:
        print("CUDA is available. Running on GPU.")
    else:
        print("Warning: CUDA not available. Running on CPU.")

    # Early Stopping
    best_val_loss = float('inf')
    patience_counter = 0

    # TensorBoard Writer
    writer = SummaryWriter()

    # Training and Validation Loop
    for epoch in range(config['num_epochs']):
        epoch_loss, train_accuracy = main_training_loop(
            epoch, model, train_dataloader, criterion, optimizer, writer, use_cuda, scaler)
        val_loss, val_accuracy = validation_loop(
            epoch, model, val_dataloader, criterion, writer, use_cuda)

        scheduler.step(val_loss)

        # Early stopping and model saving logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            writer.add_scalar('Best/val_loss', best_val_loss, best_epoch)
            print(
                f"New best validation loss: {best_val_loss} at epoch {best_epoch+1}")
            save_checkpoint(model, optimizer, epoch)
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= config['early_stopping_patience']:
            print(
                f"Early stopping after {config['early_stopping_patience']} epochs with no improvement.")
            # Load the best checkpoint
            load_checkpoint(model, optimizer)
            break
        
    torch.cuda.empty_cache()
    return val_loss


if __name__ == '__main__':
    # Optuna study to find the best hyperparameters
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, train_loader,
                   val_loader), n_trials=5)  # Number of trials

    # Results
    print('Number of finished trials: ', len(study.trials))
    print('Best trial:')
    trial = study.best_trial

    print('Value: ', trial.value)
    print('Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')


Training Epoch 14: 100%|██████████| 166/166 [03:39<00:00,  1.32s/it]


Train Loss: 1.2697 Acc: 42.8356
Train F1: 0.4218, Precision: 0.4348, Recall: 0.4284


Validation Epoch 14: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 1.0453 Acc: 60.0000
Val F1: 0.5679, Precision: 0.5993, Recall: 0.6000
New best validation loss: 1.045254635810852 at epoch 14


Training Epoch 15: 100%|██████████| 166/166 [03:31<00:00,  1.28s/it]


Train Loss: 1.3032 Acc: 41.6290
Train F1: 0.4176, Precision: 0.4202, Recall: 0.4163


Validation Epoch 15: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 1.1345 Acc: 48.0000
Val F1: 0.4376, Precision: 0.5076, Recall: 0.4800


Training Epoch 16: 100%|██████████| 166/166 [03:31<00:00,  1.27s/it]


Train Loss: 1.2583 Acc: 43.7406
Train F1: 0.4310, Precision: 0.4278, Recall: 0.4374


Validation Epoch 16: 100%|██████████| 25/25 [00:19<00:00,  1.28it/s]


Val Loss: 1.0623 Acc: 50.0000
Val F1: 0.4449, Precision: 0.5204, Recall: 0.5000


Training Epoch 17: 100%|██████████| 166/166 [03:31<00:00,  1.27s/it]


Train Loss: 1.2602 Acc: 42.6848
Train F1: 0.4220, Precision: 0.4231, Recall: 0.4268


Validation Epoch 17: 100%|██████████| 25/25 [00:18<00:00,  1.33it/s]


Val Loss: 1.0534 Acc: 62.0000
Val F1: 0.5988, Precision: 0.6268, Recall: 0.6200


Training Epoch 18: 100%|██████████| 166/166 [03:31<00:00,  1.28s/it]


Train Loss: 1.2460 Acc: 45.7014
Train F1: 0.4578, Precision: 0.4612, Recall: 0.4570


Validation Epoch 18: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 1.0502 Acc: 55.0000
Val F1: 0.5151, Precision: 0.6149, Recall: 0.5500


Training Epoch 19: 100%|██████████| 166/166 [03:34<00:00,  1.29s/it]


Train Loss: 1.2756 Acc: 42.6848
Train F1: 0.4222, Precision: 0.4206, Recall: 0.4268


Validation Epoch 19: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 1.0418 Acc: 54.0000
Val F1: 0.4994, Precision: 0.6237, Recall: 0.5400
New best validation loss: 1.0418296957015991 at epoch 19


Training Epoch 20: 100%|██████████| 166/166 [03:33<00:00,  1.28s/it]


Train Loss: 1.2635 Acc: 45.3997
Train F1: 0.4500, Precision: 0.4477, Recall: 0.4540


Validation Epoch 20: 100%|██████████| 25/25 [00:18<00:00,  1.32it/s]


Val Loss: 1.0141 Acc: 64.0000
Val F1: 0.6182, Precision: 0.6564, Recall: 0.6400
New best validation loss: 1.0140574407577514 at epoch 20


Training Epoch 21: 100%|██████████| 166/166 [03:31<00:00,  1.28s/it]


Train Loss: 1.2366 Acc: 43.8914
Train F1: 0.4361, Precision: 0.4365, Recall: 0.4389


Validation Epoch 21: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 1.0370 Acc: 63.0000
Val F1: 0.6059, Precision: 0.6577, Recall: 0.6300


Training Epoch 22: 100%|██████████| 166/166 [03:30<00:00,  1.27s/it]


Train Loss: 1.2688 Acc: 42.2323
Train F1: 0.4151, Precision: 0.4224, Recall: 0.4223


Validation Epoch 22: 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]


Val Loss: 0.9972 Acc: 66.0000
Val F1: 0.6382, Precision: 0.6791, Recall: 0.6600
New best validation loss: 0.9972357785701752 at epoch 22


Training Epoch 23: 100%|██████████| 166/166 [03:31<00:00,  1.27s/it]


Train Loss: 1.2277 Acc: 45.8522
Train F1: 0.4532, Precision: 0.4515, Recall: 0.4585


Validation Epoch 23: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 1.0033 Acc: 66.0000
Val F1: 0.6236, Precision: 0.6631, Recall: 0.6600


Training Epoch 24: 100%|██████████| 166/166 [03:30<00:00,  1.27s/it]


Train Loss: 1.2393 Acc: 46.7572
Train F1: 0.4650, Precision: 0.4639, Recall: 0.4676


Validation Epoch 24: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 0.9905 Acc: 64.0000
Val F1: 0.6150, Precision: 0.6837, Recall: 0.6400
New best validation loss: 0.9904718971252442 at epoch 24


Training Epoch 25: 100%|██████████| 166/166 [03:31<00:00,  1.27s/it]


Train Loss: 1.1923 Acc: 50.8296
Train F1: 0.5062, Precision: 0.5061, Recall: 0.5083


Validation Epoch 25: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.9867 Acc: 66.0000
Val F1: 0.6316, Precision: 0.6687, Recall: 0.6600
New best validation loss: 0.9866794180870057 at epoch 25


Training Epoch 26: 100%|██████████| 166/166 [03:34<00:00,  1.29s/it]


Train Loss: 1.3098 Acc: 45.8522
Train F1: 0.4554, Precision: 0.4568, Recall: 0.4585


Validation Epoch 26: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.9683 Acc: 64.0000
Val F1: 0.6304, Precision: 0.6680, Recall: 0.6400
New best validation loss: 0.968312736749649 at epoch 26


Training Epoch 27: 100%|██████████| 166/166 [03:32<00:00,  1.28s/it]


Train Loss: 1.2048 Acc: 48.4163
Train F1: 0.4816, Precision: 0.4832, Recall: 0.4842


Validation Epoch 27: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.9791 Acc: 66.0000
Val F1: 0.6241, Precision: 0.6653, Recall: 0.6600


Training Epoch 28: 100%|██████████| 166/166 [03:37<00:00,  1.31s/it]


Train Loss: 1.1941 Acc: 48.8688
Train F1: 0.4887, Precision: 0.4897, Recall: 0.4887


Validation Epoch 28: 100%|██████████| 25/25 [00:19<00:00,  1.28it/s]


Val Loss: 0.9915 Acc: 62.0000
Val F1: 0.6095, Precision: 0.6525, Recall: 0.6200


Training Epoch 29: 100%|██████████| 166/166 [03:31<00:00,  1.27s/it]


Train Loss: 1.2370 Acc: 47.0588
Train F1: 0.4698, Precision: 0.4697, Recall: 0.4706


Validation Epoch 29: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 0.9991 Acc: 59.0000
Val F1: 0.5724, Precision: 0.6351, Recall: 0.5900


Training Epoch 30: 100%|██████████| 166/166 [03:30<00:00,  1.27s/it]


Train Loss: 1.2456 Acc: 43.7406
Train F1: 0.4373, Precision: 0.4389, Recall: 0.4374


Validation Epoch 30: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.9816 Acc: 64.0000
Val F1: 0.6164, Precision: 0.6462, Recall: 0.6400


Training Epoch 31: 100%|██████████| 166/166 [03:31<00:00,  1.28s/it]


Train Loss: 1.2185 Acc: 47.0588
Train F1: 0.4695, Precision: 0.4695, Recall: 0.4706


Validation Epoch 31: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 0.9392 Acc: 66.0000
Val F1: 0.6418, Precision: 0.6698, Recall: 0.6600
New best validation loss: 0.9391726022958755 at epoch 31


Training Epoch 32: 100%|██████████| 166/166 [03:31<00:00,  1.28s/it]


Train Loss: 1.2232 Acc: 46.7572
Train F1: 0.4617, Precision: 0.4647, Recall: 0.4676


Validation Epoch 32: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.9189 Acc: 67.0000
Val F1: 0.6581, Precision: 0.6902, Recall: 0.6700
New best validation loss: 0.9188961338996887 at epoch 32


Training Epoch 33: 100%|██████████| 166/166 [03:32<00:00,  1.28s/it]


Train Loss: 1.2020 Acc: 50.2262
Train F1: 0.5006, Precision: 0.4995, Recall: 0.5023


Validation Epoch 33: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.9326 Acc: 64.0000
Val F1: 0.6059, Precision: 0.6753, Recall: 0.6400


Training Epoch 34: 100%|██████████| 166/166 [03:32<00:00,  1.28s/it]


Train Loss: 1.1977 Acc: 47.2097
Train F1: 0.4687, Precision: 0.4673, Recall: 0.4721


Validation Epoch 34: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.9505 Acc: 65.0000
Val F1: 0.6293, Precision: 0.6888, Recall: 0.6500


Training Epoch 35: 100%|██████████| 166/166 [03:34<00:00,  1.29s/it]


Train Loss: 1.2197 Acc: 47.5113
Train F1: 0.4715, Precision: 0.4703, Recall: 0.4751


Validation Epoch 35: 100%|██████████| 25/25 [00:18<00:00,  1.32it/s]


Val Loss: 0.9575 Acc: 57.0000
Val F1: 0.5585, Precision: 0.6052, Recall: 0.5700


Training Epoch 36: 100%|██████████| 166/166 [03:30<00:00,  1.27s/it]


Train Loss: 1.1891 Acc: 51.1312
Train F1: 0.5087, Precision: 0.5091, Recall: 0.5113


Validation Epoch 36: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 0.9970 Acc: 58.0000
Val F1: 0.5624, Precision: 0.5958, Recall: 0.5800


Training Epoch 37: 100%|██████████| 166/166 [03:32<00:00,  1.28s/it]


Train Loss: 1.1728 Acc: 48.5671
Train F1: 0.4825, Precision: 0.4835, Recall: 0.4857


Validation Epoch 37: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.8738 Acc: 68.0000
Val F1: 0.6635, Precision: 0.6815, Recall: 0.6800
New best validation loss: 0.8737629687786103 at epoch 37


Training Epoch 38: 100%|██████████| 166/166 [03:30<00:00,  1.27s/it]


Train Loss: 1.1974 Acc: 49.7738
Train F1: 0.4945, Precision: 0.4934, Recall: 0.4977


Validation Epoch 38: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.8892 Acc: 68.0000
Val F1: 0.6555, Precision: 0.7173, Recall: 0.6800


Training Epoch 39: 100%|██████████| 166/166 [03:31<00:00,  1.27s/it]


Train Loss: 1.1164 Acc: 55.6561
Train F1: 0.5553, Precision: 0.5559, Recall: 0.5566


Validation Epoch 39: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Val Loss: 0.9319 Acc: 61.0000
Val F1: 0.5964, Precision: 0.6569, Recall: 0.6100


Training Epoch 40: 100%|██████████| 166/166 [03:31<00:00,  1.28s/it]


Train Loss: 1.1674 Acc: 50.5279
Train F1: 0.5061, Precision: 0.5075, Recall: 0.5053


Validation Epoch 40: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Val Loss: 0.8832 Acc: 65.0000
Val F1: 0.6295, Precision: 0.6938, Recall: 0.6500


Training Epoch 41: 100%|██████████| 166/166 [03:31<00:00,  1.27s/it]


Train Loss: 1.1722 Acc: 48.5671
Train F1: 0.4854, Precision: 0.4863, Recall: 0.4857


Validation Epoch 41: 100%|██████████| 25/25 [00:18<00:00,  1.32it/s]


Val Loss: 0.9429 Acc: 62.0000
Val F1: 0.5961, Precision: 0.6605, Recall: 0.6200


Training Epoch 42: 100%|██████████| 166/166 [03:35<00:00,  1.30s/it]


Train Loss: 1.1856 Acc: 48.5671
Train F1: 0.4835, Precision: 0.4852, Recall: 0.4857


Validation Epoch 42: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]
[I 2023-09-28 03:21:02,120] Trial 4 finished with value: 0.9140139985084533 and parameters: {'lr': 1.878887835176021e-05, 'weight_decay': 1.0377220058734007e-10, 'dropout_rate': 0.29168643854284715, 'reduction': 8}. Best is trial 1 with value: 0.606099152341485.


Val Loss: 0.9140 Acc: 60.0000
Val F1: 0.5848, Precision: 0.6076, Recall: 0.6000
Early stopping after 5 epochs with no improvement.
Number of finished trials:  5
Best trial:
Value:  0.606099152341485
Params: 
    lr: 3.150586493930411e-05
    weight_decay: 4.8570077543617684e-08
    dropout_rate: 0.10545447017727612
    reduction: 24


### 4. Model evaluation

In [8]:
# Define the transformation for real-time prediction
transform_real_time = get_transform((120, 160), to_pil=False)

In [16]:
# Initialize the camera
camera = cv2.VideoCapture(1)
if not camera.isOpened():
    print("Error: Camera could not be opened.")
    exit(1)

In [17]:
from collections import deque

# Load the entire checkpoint dictionary
checkpoint = torch.load('best_gesture_recog.pth')

model = Deep3DCNN(num_classes=5, reduction=8)
model.load_state_dict(checkpoint['model_state_dict'], strict=False)
if torch.cuda.is_available():
    model = model.cuda()
model.eval()

# Initialize deque for storing recent frames
recent_frames = deque(maxlen=30)

In [18]:
frame_count = 0

while True:
    ret, frame = camera.read()
    if not ret:
            print("Failed to grab frame")
            break

    # Convert BGR to RGB and transform
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    rgb_frame = Image.fromarray(rgb_frame)
    transformed_frame = transform_real_time(rgb_frame)

    # Add the transformed frame to deque and continue with prediction-related operations
    recent_frames.append(transformed_frame)

    if len(recent_frames) == 30:
        frame_count += 1
        if frame_count % 10 == 0: 
            # Make a prediction every 10 frames
            test_sample = torch.stack(list(recent_frames), dim=0).unsqueeze(0)
            if torch.cuda.is_available():
                test_sample = test_sample.cuda()

            with torch.no_grad():
                output = model(test_sample)
                _, prediction = torch.max(output.data, 1)

            print(f"Predicted Gesture: {prediction.item()}")

            # Display prediction on the frame
            cv2.putText(frame, f"Gesture: {prediction.item()}", (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

    # Display the resulting frame
    cv2.imshow("Gesture Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

Predicted Gesture: 0
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 2
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 0
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Gesture: 3
Predicted Ges

KeyboardInterrupt: 

#### Clean up

In [19]:
#Clean up
camera.release()
cv2.destroyAllWindows()