Import packages

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
from torchvision.models.video import mc3_18, MC3_18_Weights
from torchvision.transforms import Resize, Normalize, Compose, RandomHorizontalFlip, ColorJitter, RandomRotation
import os
import pandas as pd
from tqdm import tqdm
import matplotlib as plt

Install data (only do this if you do not have the dataset installed already)

In [1]:
!wget -O 20bnjester-v1-00.zip https://apigwx-aws.qualcomm.com/qsc/public/v1/api/download/software/dataset/AIDataset/Jester/20bnjester-v1-00
!wget -O 20bnjester-v1-01.zip https://apigwx-aws.qualcomm.com/qsc/public/v1/api/download/software/dataset/AIDataset/Jester/20bnjester-v1-01
!wget -O 20bnjester-v1-02.zip https://apigwx-aws.qualcomm.com/qsc/public/v1/api/download/software/dataset/AIDataset/Jester/20bnjester-v1-02

--2024-12-02 16:42:58--  https://apigwx-aws.qualcomm.com/qsc/public/v1/api/download/software/dataset/AIDataset/Jester/20bnjester-v1-00
Resolving apigwx-aws.qualcomm.com (apigwx-aws.qualcomm.com)... 18.66.147.21, 18.66.147.86, 18.66.147.120, ...
Connecting to apigwx-aws.qualcomm.com (apigwx-aws.qualcomm.com)|18.66.147.21|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10000000000 (9.3G) [binary/octet-stream]
Saving to: ‘20bnjester-v1-00.zip’


2024-12-02 16:47:59 (22.6 MB/s) - Connection closed at byte 7106814506. Retrying.

--2024-12-02 16:48:00--  (try: 2)  https://apigwx-aws.qualcomm.com/qsc/public/v1/api/download/software/dataset/AIDataset/Jester/20bnjester-v1-00
Connecting to apigwx-aws.qualcomm.com (apigwx-aws.qualcomm.com)|18.66.147.21|:443... connected.
HTTP request sent, awaiting response... 206 Partial Content
Length: 10000000000 (9.3G), 2893185494 (2.7G) remaining [binary/octet-stream]
Saving to: ‘20bnjester-v1-00.zip’

20bnjester-v1-00.zi 100%[+++++

In [7]:
!cat 20bnjester-v1-00.zip 20bnjester-v1-01.zip 20bnjester-v1-02.zip | tar zx

Initialize all the functions

In [19]:
class JesterDataset(Dataset):
    def __init__(self, data, data_path, num_frames=37, frame_size=(100, 176), mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
        self.data = data
        self.data_path = data_path
        self.num_frames = num_frames
        self.resize = Resize(frame_size, antialias=True)
        self.normalize = Normalize(mean, std)
        self.augment = Compose([
            RandomHorizontalFlip(),
            ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
            RandomRotation(degrees=15)
        ])

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        video_id, label = self.data.iloc[i]
        video_folder = os.path.join(self.data_path, str(video_id))
        video_tensor = self.load_videos(video_folder)
        return video_tensor, label

    def load_videos(self, video_folder):
        frames = []
        for frame in sorted(os.listdir(video_folder)):
            original_frame = read_image(os.path.join(video_folder, frame))
            frame = self.resize(original_frame)
            frame = self.augment(frame)
            frame = frame.float() / 255.0
            frame = self.normalize(frame)
            frames.append(frame)
        if len(frames) < self.num_frames:
            num_missing_frames = self.num_frames - len(frames)
            # take last frame and duplicate it to fill in missing frames
            frames.extend([frames[-1]] * num_missing_frames)
        elif len(frames) > self.num_frames:
            # uniformly choose frames to keep the features
            indices = torch.linspace(0, len(frames) - 1, self.num_frames, dtype=int)
            frames = [frames[i] for i in indices]
        video_tensor = torch.stack(frames).float() # convert to float because of mismatch with pretrained model weights
        video_tensor = video_tensor.permute(1, 0, 2, 3)
        return video_tensor

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def create_label_encoding(labels_path):
    labels_df = pd.read_csv(labels_path, header=None)
    label_encoding = {label: idx for idx, label in enumerate(labels_df[0])}
    return label_encoding

def split_data(labels_path):
    label_encoding = create_label_encoding(labels_path)
    
    train_data = pd.read_csv('jester-v1-train.csv', sep=';', header=None)
    test_data = pd.read_csv('jester-v1-validation.csv', sep=';', header=None)

    train_data = train_data.sample(frac=0.01, random_state=42)
    val_data = train_data.sample(frac=0.1, random_state=42)
    test_data = test_data.sample(frac=0.01, random_state=42)
    train_data = train_data.drop(val_data.index) # remove validation data from training data
    train_data[1] = train_data[1].map(label_encoding)
    val_data[1] = val_data[1].map(label_encoding)
    test_data[1] = test_data[1].map(label_encoding)
    
    return train_data, val_data, test_data

def train(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    val_losses = []
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)

        for video, label in train_loader_tqdm:
            video = video.to(device)
            label = label.to(device)
            label_pred = model(video)
            loss = criterion(label_pred, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loader_tqdm.set_postfix(loss=loss.item())
            epoch_loss += loss.item()

        val_loss, accuracy = validate(model, val_loader, criterion, device)
        val_losses.append((val_loss, accuracy))
        print(f"Epoch: {epoch + 1}, Validation Loss: {val_loss} with accuracy: {accuracy}")
        model.train()

        if (epoch + 1) % 2 == 0:
            torch.save(model.state_dict(), f'model_{epoch + 1}.pth')

    plt.figure(figsize=(10, 6))
    plt.plot(range(1, num_epochs + 1), val_losses, label="Validation Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Validation loss over epochs")
    plt.legend()
    plt.grid(True)
    plt.show()
    plt.savefig("model.png")

def validate(model, val_loader, criterion, device):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        correct_predictions = 0
        total = 0
        for video, label in val_loader:
            video = video.to(device)
            label = label.to(device)
            label_pred = model(video)
            loss = criterion(label_pred, label)
            total_loss += loss.item()

            _, predicted_labels = torch.max(label_pred, 1)
            correct_predictions += (predicted_labels == label).sum().item()
            total += label.size(0)

        avg_loss = total_loss / len(val_loader)
        accuracy = correct_predictions / total * 100
    return avg_loss, accuracy

def test(model, test_loader, criterion, device):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        correct_predictions = 0
        total = 0
        for video, label in test_loader:
            video = video.to(device)
            label = label.to(device)
            label_pred = model(video)
            loss = criterion(label_pred, label)
            total_loss += loss.item()

            _, predicted_labels = torch.max(label_pred, 1)
            correct_predictions += (predicted_labels == label).sum().item()
            total += label.size(0)
        avg_loss = total_loss / len(test_loader)
        accuracy = correct_predictions / total * 100
    return avg_loss, accuracy

Run the training/validation

In [None]:
def main():
    seed = 42
    set_seed(seed)

    data_path = r'20bn-jester-v1'
    labels_path = 'jester-v1-labels.csv'

    train_data, val_data, test_data = split_data(labels_path)
    print(f"Train length: {len(train_data)}")
    print(f"Validation length: {len(val_data)}")
    print(f"Test length: {len(test_data)}")

    train_dataset = JesterDataset(train_data, data_path)
    val_dataset = JesterDataset(val_data, data_path)
    test_dataset = JesterDataset(test_data, data_path)

    batch_size = 4
    num_workers = 16
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    print("Made the Dataloaders")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = mc3_18(weights=MC3_18_Weights.DEFAULT)

    
    output_size = len(set(train_data[1]))
    # change the output size of the pretrained model to match the number of classes in my dataset
    model.fc = torch.nn.Linear(model.fc.in_features, output_size)

    # freeze all layers
    for param in model.parameters():
        param.requires_grad = False

    # unfreeze the final fc layer
    for param in model.fc.parameters():
        param.requires_grad = True

    model = model.to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-3)

    num_epochs = 20

    train(model, train_loader, val_loader, criterion, optimizer, num_epochs, device)
    test(model, test_loader, criterion, device)

if __name__ == '__main__':
    main()

Train length: 1067
Validation length: 119
Test length: 148
Made the Dataloaders


                                                                        

Epoch: 1, Validation Loss: 3.2721017440160116 with accuracy: 9.243697478991598


                                                                        

Epoch: 2, Validation Loss: 3.2978420575459797 with accuracy: 10.92436974789916


                                                                        

In [18]:
seed = 42
set_seed(seed)

data_path = r'20bn-jester-v1'
labels_path = 'jester-v1-labels.csv'

label_encoding = create_label_encoding(labels_path)
output_size = len(label_encoding)

test_data = pd.read_csv('jester-v1-validation.csv', sep=';', header=None)
test_data = test_data.sample(frac=0.01, random_state=42)
test_data[1] = test_data[1].map(label_encoding)

test_dataset = JesterDataset(test_data, data_path)
batch_size = 4
num_workers = 10
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = mc3_18(weights=None)
model.fc = torch.nn.Linear(model.fc.in_features, output_size)
model = model.to(device)

model.load_state_dict(torch.load('model_10.pth', map_location=device))

criterion = torch.nn.CrossEntropyLoss()

test_loss, accuracy = test(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.2f}%')



Test Loss: 1.8303, Test Accuracy: 52.03%
