In [1]:
import os
import torch
import torch.nn as nn
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
from collections import defaultdict
import numpy as np
from PIL import Image
from tqdm import tqdm

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cuda


In [None]:
# --- Experiment configuration ---------------------------------------------
DATA_DIR = 'training'   # root folder holding one sub-directory per class
IMAGE_SIZE = 224        # images are resized to IMAGE_SIZE x IMAGE_SIZE
NUM_CLASSES = 32        # number of outputs of the classifier
BATCH_SIZE = 32
EPOCHS = 10             # upper bound on the number of training epochs
PATIENCE = 3            # non-improving epochs tolerated before early stopping
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Preprocessing --------------------------------------------------------
# Resize to a fixed square, convert to a tensor, then normalise each channel
# with the ImageNet means / standard deviations.
transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])


class GestureDataset(datasets.ImageFolder):
    """ImageFolder variant whose items also carry an id taken from the file name.

    Each item is ``(image, label, video_id)``. ``video_id`` is the part of the
    file's base name that precedes the first ``.`` (``4215.jpg`` -> ``"4215"``).
    """

    def __getitem__(self, index):
        """Load sample ``index`` and return ``(image, label, video_id)``.

        ``transform`` and ``target_transform`` are applied only when they were
        supplied, so the dataset also works when constructed without them.
        """
        path, label = self.samples[index]
        video_id = os.path.basename(path).split('.')[0]  # e.g., 4215
        image = self.loader(path)
        # Both transforms default to None in ImageFolder; calling None would raise.
        if self.transform is not None:
            image = self.transform(image)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return image, label, video_id


# Seed for the train/validation partition. With a fixed seed the same samples
# end up in `val_dataset` on every run, so a checkpoint trained in an earlier
# session is evaluated on data it did not train on (provided it was trained
# with this same seed).
SPLIT_SEED = 42

full_dataset = GestureDataset(DATA_DIR, transform=transform)

# 80% of the samples for training, the remainder for validation.
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# NOTE(review): the split is per file. If several files belong to the same
# video, frames of one video can land on both sides - confirm whether a
# per-video split is needed.

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class EnhancedResNet(nn.Module):
    """Pre-trained ResNet-50 trunk followed by an extra conv block and a linear head.

    The trunk is run up to and including ``layer4``; the backbone's own pooling
    and classification layers are not called by ``forward``. The trunk output
    goes through a 3x3 convolution (2048 -> 1024 channels) with batch norm and
    ReLU, is globally average-pooled, and is classified by a linear layer.

    Args:
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, num_classes=32):
        super().__init__()

        # Backbone with the default pre-trained weights; its final fully
        # connected layer is swapped for a no-op.
        backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        backbone.fc = nn.Identity()
        self.resnet = backbone

        # Extra convolutional stage, ending in global average pooling.
        self.extra_conv = nn.Sequential(
            nn.Conv2d(2048, 1024, kernel_size=3, padding=1),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((1, 1)),
        )

        # Classification head.
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        backbone = self.resnet
        trunk = (
            backbone.conv1,
            backbone.bn1,
            backbone.relu,
            backbone.maxpool,
            backbone.layer1,
            backbone.layer2,
            backbone.layer3,
            backbone.layer4,
        )

        features = x
        for stage in trunk:
            features = stage(features)
        # features: [batch, 2048, H, W] (7x7 spatially for 224x224 inputs)

        pooled = self.extra_conv(features)  # [batch, 1024, 1, 1]
        return self.fc(pooled.flatten(1))

# Build the network and move it to the selected device.
model = EnhancedResNet(num_classes=NUM_CLASSES)
model = model.to(DEVICE)

# Adam over every parameter of the model (backbone included), with
# cross-entropy as the training objective.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()


# Early-stopping state; train_model() updates both through `global`.
patience_counter = 0
best_val_loss = float('inf')


def train_model(checkpoint_path=None):
    """Train the global ``model`` for up to ``EPOCHS`` epochs with early stopping.

    After every epoch the model is validated with ``validate_model()``. Training
    stops once the validation loss has failed to improve on ``best_val_loss``
    for ``PATIENCE`` consecutive epochs.

    Args:
        checkpoint_path: optional file path. When given, the model's
            ``state_dict`` is saved there each time the validation loss
            improves, so the best weights survive later, worse epochs.
            ``None`` (the default) saves nothing.

    Side effects:
        Updates the module-level ``best_val_loss`` and ``patience_counter``,
        steps ``optimizer``, and prints per-epoch metrics.
    """
    global best_val_loss, patience_counter

    # A previous run that ended by early stopping leaves the counter at
    # PATIENCE; without this reset the next call would stop after a single
    # non-improving epoch. best_val_loss is kept on purpose so that continued
    # training is still compared against the best result seen so far.
    patience_counter = 0

    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0
        for images, labels, _ in train_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # `loss` is assumed to be the batch mean (the default reduction of
            # nn.CrossEntropyLoss). Weighting by batch size gives a true
            # per-sample average even when the last batch is smaller.
            batch_count = labels.size(0)
            running_loss += loss.item() * batch_count

            _, predicted = torch.max(outputs, 1)
            correct_train += (predicted == labels).sum().item()
            total_train += batch_count

        train_accuracy = correct_train / total_train
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {running_loss/total_train:.4f}, Train Accuracy: {train_accuracy:.4f}")

        # Validate after each epoch
        val_loss = validate_model()

        # Check for early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0  # Reset patience counter
            if checkpoint_path is not None:
                # Persist the best weights seen so far.
                torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1

        if patience_counter >= PATIENCE:
            print(f"Early stopping triggered after {epoch+1} epochs.")
            break


def validate_model():
    """Evaluate the global ``model`` on ``val_loader`` and return the mean loss.

    The model is switched to eval mode and gradients are disabled. Accuracy and
    loss are both averaged per sample, so a smaller final batch does not carry
    more weight than the others. The metrics are printed.

    Returns:
        The per-sample average validation loss as a float.
    """
    model.eval()
    correct_val = 0
    total_val = 0
    val_loss = 0.0
    with torch.no_grad():
        for images, labels, _ in val_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            loss = criterion(outputs, labels)

            # `loss` is assumed to be the batch mean (the default reduction of
            # nn.CrossEntropyLoss); scale it back to a sum over the batch.
            batch_count = labels.size(0)
            val_loss += loss.item() * batch_count

            _, predicted = torch.max(outputs, 1)
            correct_val += (predicted == labels).sum().item()
            total_val += batch_count

    val_accuracy = correct_val / total_val
    val_loss /= total_val
    print(f"Validation Accuracy: {val_accuracy:.4f}, Validation Loss: {val_loss:.4f}")

    return val_loss


# Run training; train_model() prints the metrics of every epoch as it goes.
train_model()

Epoch 1/10, Loss: 1.6159, Train Accuracy: 0.4970
Validation Accuracy: 0.5756, Validation Loss: 1.3342
Epoch 2/10, Loss: 1.1916, Train Accuracy: 0.6143
Validation Accuracy: 0.6188, Validation Loss: 1.1499
Epoch 3/10, Loss: 0.9810, Train Accuracy: 0.6707
Validation Accuracy: 0.6607, Validation Loss: 1.0399
Epoch 4/10, Loss: 0.8211, Train Accuracy: 0.7197
Validation Accuracy: 0.6876, Validation Loss: 0.9699
Epoch 5/10, Loss: 0.6909, Train Accuracy: 0.7595
Validation Accuracy: 0.6916, Validation Loss: 0.9176
Epoch 6/10, Loss: 0.5886, Train Accuracy: 0.7882
Validation Accuracy: 0.7001, Validation Loss: 0.9569
Epoch 7/10, Loss: 0.5094, Train Accuracy: 0.8148
Validation Accuracy: 0.7152, Validation Loss: 0.9052
Epoch 8/10, Loss: 0.4410, Train Accuracy: 0.8366
Validation Accuracy: 0.7077, Validation Loss: 0.9326
Epoch 9/10, Loss: 0.3874, Train Accuracy: 0.8524
Validation Accuracy: 0.7259, Validation Loss: 0.9203
Epoch 10/10, Loss: 0.3466, Train Accuracy: 0.8699
Validation Accuracy: 0.7246, Val

In [None]:
# Resolve the device first so that checkpoint loading and the model agree on it.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = EnhancedResNet(num_classes=NUM_CLASSES)

# Load the saved state_dict.
# map_location remaps tensors that were saved from another device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(r'ours_state_dict.pt', map_location=DEVICE))

model.to(DEVICE)
model.eval()

def topk_accuracy(output, target, k=5):
    """Fraction of samples whose target is among the ``k`` highest-scoring classes.

    Args:
        output: 2-D tensor of scores, one row per sample and one column per class.
        target: 1-D tensor holding the true class index of every sample.
        k: how many top-scoring classes count as a hit. Values larger than the
            number of classes are clamped to it.

    Returns:
        The accuracy as a Python float in ``[0, 1]``; ``0.0`` for an empty batch.
    """
    with torch.no_grad():
        # The mean over zero samples would be NaN and poison running totals.
        if target.numel() == 0:
            return 0.0

        # topk raises when asked for more entries than there are classes.
        k = min(k, output.size(1))

        # Indices of the k largest scores in every row
        _, pred = output.topk(k, dim=1, largest=True, sorted=True)

        # Compare every top-k index with the row's target label
        correct = pred.eq(target.view(-1, 1).expand_as(pred))

        # A row has at most one match, so the row sum is a 0/1 hit indicator
        topk_acc = correct.float().sum(1).mean().item()
        return topk_acc
# Score the loaded model on `val_dataset` (the validation part of the random
# split made earlier in the notebook), in a fixed order.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Running totals: hits are accumulated as (batch accuracy x batch size).
total = 0
top1_correct = 0
top5_correct = 0

with torch.no_grad():
    for images, labels, _ in val_loader:
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)
        outputs = model(images)

        # Top-1 hits in this batch
        top1_correct = top1_correct + topk_accuracy(outputs, labels, k=1) * images.size(0)

        # Top-5 hits in this batch
        top5_correct = top5_correct + topk_accuracy(outputs, labels, k=5) * images.size(0)

        total = total + images.size(0)

# Convert the accumulated hits into overall accuracies.
top1_acc = top1_correct / total
top5_acc = top5_correct / total

print(f"Top-1 Accuracy: {top1_acc * 100:.2f}%")
print(f"Top-5 Accuracy: {top5_acc * 100:.2f}%")


Top-1 Accuracy: 85.71%
Top-5 Accuracy: 99.52%
