In [1]:
import os
import json
import random

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_video
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from bayesian_torch.models.dnn_to_bnn import get_kl_loss

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    UniformTemporalSubsample,
    Div255,
    Normalize,
    # ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'
    # If you see this error, you can install the latest version of torchvision from source.
    # pip install torchvision
    # pip install "git+https://github.com/facebookresearch/pytorchvideo.git"
)

from torchvision.transforms import Compose, Lambda, CenterCrop, Resize, RandomAffine
from tqdm import tqdm

from utils.dataset import load_msasl, load_asl_citizen
from utils.optical_flow import OpticalFlowTransform

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


### Experiments Variables

In [3]:

batch_size = 1
num_workers = 0
num_frames = 20
num_epochs = 10
learning_rate = 0.01
num_classes = 100
input_type = "rgb"
frozen_layers = 5

bayesian_layers = 3
num_monte_carlo = 5 if bayesian_layers is not None else 1

### Dataset Related Functions

In [4]:
class RandomHorizontalFlip(nn.Module):
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        if random.random() < self.p:
            return x.flip(3)
        return x

# A helper dataset wrapper that applies a transform to each sample.
class TransformDataset(Dataset):
    def __init__(self, dataset, transform):
        """
        Args:
            dataset (Dataset): Original dataset returning (video, label, metadata)
            transform (callable): Transformation to apply on a sample dict.
        """
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        video, label, metadata = self.dataset[idx]
        sample = {"video": video, "label": label, "metadata": metadata}
        if self.transform is not None:
            sample = self.transform(sample)
        # Return transformed video, label, and optionally metadata
        return sample["video"], sample["label"], sample.get("metadata", {})

### Load and Transform Dataset

In [5]:
dataset_name = "msasl"

test_dataset, train_dataset, validation_dataset = load_msasl("bin", top_k_labels=num_classes)
# ASL Dataset: load_asl_citizen("ASL_Citizen", top_k_labels=num_classes)

train_transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        OpticalFlowTransform() if input_type  == "optical_flow" else Lambda(lambda x: x),
        Lambda(lambda x: x.permute(1, 0, 2, 3)),  # Convert (T,H,W,C) -> (C,T,H,W)
        UniformTemporalSubsample(num_frames),
        Resize(224),
        RandomHorizontalFlip(),
        Div255(),
        Lambda(lambda x: (x-0.5)*2.0),
    ])
)
test_transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        OpticalFlowTransform() if input_type  == "optical_flow" else Lambda(lambda x: x),
        Lambda(lambda x: x.permute(1, 0, 2, 3)),  # Convert (T,H,W,C) -> (C,T,H,W)
        UniformTemporalSubsample(num_frames),
        Resize(224),
        Div255(),
        Lambda(lambda x: (x-0.5)*2.0),
    ])
)

[TRAIN] Loaded 3012 videos with top 100 labels
[TEST] Loaded 458 videos with top 100 labels
[VALIDATION] Loaded 815 videos with top 100 labels


In [6]:
# Wrap each dataset with the transformation.
train_dataset = TransformDataset(train_dataset, train_transform)
validation_dataset = TransformDataset(validation_dataset, test_transform)
test_dataset = TransformDataset(test_dataset, test_transform)

# Create DataLoaders.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

### Training Related Functions

In [7]:
def train_epoch(model, dataloader, criterion, optimizer, device, num_monte_carlo=10):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    tqdm_dataloader = tqdm(dataloader, desc="Training batches", leave=False)
    for videos, labels, _ in tqdm_dataloader:
        videos = videos.to(device)
        labels = labels.to(device)
        B, T, C, H, W = videos.shape
        
        optimizer.zero_grad()
        output_mc = []
        kl_loss_mc = []
        for _ in range(num_monte_carlo):
            outputs = model(videos)
            output_mc.append(outputs)
            kl_loss = get_kl_loss(model)
            kl_loss_mc.append(kl_loss)
        outputs = torch.stack(output_mc, dim=0).mean(dim=0)
        kl_loss = torch.stack(kl_loss_mc, dim=0).mean(dim=0)

        loss = criterion(outputs, labels) + kl_loss / B
        loss.backward(retain_graph=True)
        optimizer.step()

        running_loss += loss.item() * videos.size(0)
        _, preds = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()
        
        tqdm_dataloader.set_postfix(loss=loss.item(), kl_loss=kl_loss.item(), correct=correct/total)
        
    epoch_loss = running_loss / total if total > 0 else float('inf')
    epoch_acc = correct / total if total > 0 else 0
    return epoch_loss, epoch_acc

In [8]:
@torch.inference_mode()
def validate_epoch(model, dataloader, criterion, device, num_monte_carlo=10):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        tqdm_dataloader = tqdm(dataloader, desc="Validation batches", leave=False)
        for videos, labels, _ in tqdm_dataloader:
            output_mc = []
            videos = videos.to(device)
            labels = labels.to(device)
            for _ in range(num_monte_carlo):
                outputs = model(videos)
                loss = criterion(outputs, labels)
                running_loss += loss.item() * videos.size(0)
                output_mc.append(outputs)
            total += labels.size(0)
            output_mc = torch.stack(output_mc, dim=0).mean(dim=0)
            _, preds = torch.max(output_mc, 1)
            correct += (preds == labels).sum().item()
            tqdm_dataloader.set_postfix(loss=loss.item(), correct=correct/total)
            
    epoch_loss = running_loss / total if total > 0 else float('inf')
    epoch_acc = correct / total if total > 0 else 0
    return epoch_loss, epoch_acc

### Main

In [9]:
from model.I3D_bayesian import InceptionI3d

model_name = "I3D"

if input_type == "rgb":
    model = InceptionI3d(num_classes=num_classes, frozen_layers=frozen_layers, bayesian_layers=bayesian_layers)
elif input_type == "optical_flow":
    model = InceptionI3d(num_classes=num_classes, frozen_layers=frozen_layers, bayesian_layers=bayesian_layers, in_channels=2, input_type="optical_flow")
else:
    raise ValueError(f"Invalid input type for I3D: {input_type}")

Loaded pre-trained I3D weights from /opt/CS5340_Project/temp/rgb_imagenet.pt
Convert layer MaxPool3d_5a_2x2 into Bayesian Layer
Convert layer Mixed_5b into Bayesian Layer
Convert layer Mixed_5c into Bayesian Layer


In [10]:
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

model.train()

InceptionI3d(
  (avg_pool): AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1), padding=0)
  (logits): Unit3D(
    (conv3d): Conv3d(1024, 100, kernel_size=(1, 1, 1), stride=(1, 1, 1))
  )
  (Conv3d_1a_7x7): Unit3D(
    (conv3d): Conv3d(3, 64, kernel_size=(7, 7, 7), stride=(2, 2, 2), bias=False)
    (bn): BatchNorm3d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
  )
  (MaxPool3d_2a_3x3): MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (Conv3d_2b_1x1): Unit3D(
    (conv3d): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
    (bn): BatchNorm3d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
  )
  (Conv3d_2c_3x3): Unit3D(
    (conv3d): Conv3d(64, 192, kernel_size=(3, 3, 3), stride=(1, 1, 1), bias=False)
    (bn): BatchNorm3d(192, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
  )
  (MaxPool3d_3a_3x3): MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1,

In [11]:
metrics = []
best_val_acc = 0.0
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, num_monte_carlo)
    val_loss, val_acc = validate_epoch(model, val_loader, criterion, device, num_monte_carlo)
    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Save the best model based on validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), f"best_{model_name}_fl{frozen_layers}_bl{bayesian_layers}_{dataset_name}")
        print(f"New best model saved with Val Acc: {best_val_acc:.4f}")
    
    metrics.append({"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc, "best_val_acc": best_val_acc})

                                                                                                                

Epoch 1/10: Train Loss: 13.8163, Train Acc: 0.0110 | Val Loss: 23.4185, Val Acc: 0.0074
New best model saved with Val Acc: 0.0074


                                                                                                                

Epoch 2/10: Train Loss: 4.8215, Train Acc: 0.0126 | Val Loss: 23.1529, Val Acc: 0.0110
New best model saved with Val Acc: 0.0110


                                                                                                                

Epoch 3/10: Train Loss: 4.7296, Train Acc: 0.0113 | Val Loss: 23.0630, Val Acc: 0.0074


                                                                                                                 

Epoch 4/10: Train Loss: 4.6586, Train Acc: 0.0106 | Val Loss: 23.2253, Val Acc: 0.0184
New best model saved with Val Acc: 0.0184


                                                                                                                 

Epoch 5/10: Train Loss: 4.6477, Train Acc: 0.0139 | Val Loss: 23.1070, Val Acc: 0.0061


                                                                                                                 

Epoch 6/10: Train Loss: 4.6480, Train Acc: 0.0100 | Val Loss: 23.1063, Val Acc: 0.0086


                                                                                                                 

Epoch 7/10: Train Loss: 4.6561, Train Acc: 0.0093 | Val Loss: 23.0462, Val Acc: 0.0135


                                                                                                                 

Epoch 8/10: Train Loss: 4.6469, Train Acc: 0.0106 | Val Loss: 23.0543, Val Acc: 0.0086


                                                                                                                

Epoch 9/10: Train Loss: 4.6470, Train Acc: 0.0139 | Val Loss: 23.0811, Val Acc: 0.0074


                                                                                                                

Epoch 10/10: Train Loss: 4.6452, Train Acc: 0.0116 | Val Loss: 23.0825, Val Acc: 0.0061


