In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm

In [2]:
NUM_OF_CLASSES = 1000
BATCH_SIZE = 4

In [3]:
class VideoCNN(nn.Module):
    def __init__(self, num_classes=1):
        super(VideoCNN, self).__init__()
        
        # 3D Convolution Layers with Pooling after each Conv
        self.conv1 = nn.Conv3d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool3d(kernel_size=2, stride=2, padding=0)
        
        self.conv2 = nn.Conv3d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool3d(kernel_size=2, stride=2, padding=0)
        
        self.conv3 = nn.Conv3d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool3d(kernel_size=2, stride=2, padding=0)
        
        self.conv4 = nn.Conv3d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.pool4 = nn.MaxPool3d(kernel_size=2, stride=2, padding=0)
        
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 4 * 14 * 14, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        
        # Apply Conv3D layers followed by MaxPooling
        x = self.pool1(torch.relu(self.conv1(x)))
        x = self.pool2(torch.relu(self.conv2(x)))
        x = self.pool3(torch.relu(self.conv3(x)))
        x = self.pool4(torch.relu(self.conv4(x)))
        
        # Flatten the tensor to pass it through fully connected layers
        x = x.view(x.size(0), -1)  # Flatten the tensor
        
        # Fully connected layers
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)

        # return torch.sigmoid(x) if self.fc2.out_features == 1 else torch.softmax(x, dim=1)
        return x

In [4]:
# from model.gru import GRU

model = VideoCNN(num_classes=NUM_OF_CLASSES)
# model = GRU(vocab_size=NUM_OF_CLASSES)

In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Training Device: {device}')

model.to(device)

Training Device: cuda


VideoCNN(
  (conv1): Conv3d(3, 16, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool1): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv3d(16, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool2): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool3): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool4): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=100352, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=1000, bias=True)
)

In [6]:
from utils.dataset import load_msasl

label_threshold = 1000
test_dataset, train_dataset, validation_dataset = load_msasl("bin", label_threshold)

[TEST] Loaded 2842 videos with label < 1000
[TRAIN] Loaded 12839 videos with label < 1000
[VALIDATION] Loaded 3742 videos with label < 1000


In [7]:
import os
from utils.data_loader import get_video_dataloader

if __name__ == "__main__":
    data_dir = "bin"
    train_binary_file = os.path.join(data_dir, "train", "train.bin")
    train_index_file = os.path.join(data_dir, "train", "index.json")

    # train_loader = get_video_dataloader(train_binary_file, train_index_file, batch_size=BATCH_SIZE)

    # If you want to use this to show dataset, add it to the return in the dataloader
    dataset, train_loader = get_video_dataloader(train_binary_file, train_index_file, batch_size=BATCH_SIZE)

    # Iterate through the dataloader
    for videos, labels, metadata in train_loader:
        print(f"Batch of videos: {videos.shape}") # (batch_size, 64, C, H, W)
        print(f"Batch of labels: {labels.shape}") # (batch_size,)
        print(f"Metadata sample: {metadata}") # Dictionary of metadata
        break # Checking the first batch


Batch of videos: torch.Size([4, 64, 3, 224, 224])
Batch of labels: torch.Size([4])
Metadata sample: {'id': ['e4927ae7-44c1-4c5e-a6d7-6fb94410aee9', '543b0944-cedd-4523-8e8a-0503e80e45ee', '281b2abe-2fba-42af-adf1-bc9509170757', 'a8f68a1d-e306-4856-b6d6-32a35cd94b9d'], 'org_text': ['same', 'WOMAN', 'Spring', 'finish'], 'clean_text': ['same', 'woman', 'spring', 'finish'], 'signer_id': tensor([  0, 143,  12, 247]), 'signer': tensor([ 0, 13,  6, 81]), 'file': ['SAME(2)', 'WOMAN(1)', 'Mastering ASL Unit 3 Vocabulary signed by Dr Wooten', 'Basic Sign Language for Caregivers (bath time)'], 'label': tensor([70, 93, 23, 12]), 'fps': tensor([25, 25, 25, 25]), 'url': ['www.youtube.com/watch?v=cnX45VWbjmc', 'https://www.youtube.com/watch?v=i47aN_PUDvk', 'https://www.youtube.com/watch?v=K8c-np9zNT8', 'https://www.youtube.com/watch?v=IMF7K2ClfQc'], 'text': ['same', 'woman', 'spring', 'finish'], 'filename': ['same_e4927ae7-44c1-4c5e-a6d7-6fb94410aee9', 'woman_543b0944-cedd-4523-8e8a-0503e80e45ee', 's

In [None]:
# dataset.show_video(0)

In [8]:
model

VideoCNN(
  (conv1): Conv3d(3, 16, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool1): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv3d(16, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool2): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool3): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (pool4): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=100352, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=1000, bias=True)
)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epoch = 5

for epoch in range(num_epoch):
    for videos, labels, metadata in train_loader:
        videos, labels = videos.to(device), labels.to(device)

        optimizer.zero_grad()

        # videos = torch.tensor(videos, dtype=torch.uint8).permute(0, 2, 1, 3, 4)
        videos = videos.float().permute(0, 2, 1, 3, 4) / 255

        videos.requires_grad_()

        # print((videos))
        # print(labels) 
        # print(metadata)

        # outputs = model(videos/255).squeeze()
        outputs = model(videos)

        loss = criterion(outputs, labels.long())

        # print("loss: ", loss)

        loss.backward()

        optimizer.step()


    print(f"Epoch [{epoch+1}/{num_epoch}], Loss: {loss.item():.4f}")

Epoch [1/5], Loss: 4.7924
Epoch [2/5], Loss: 4.6349
Epoch [3/5], Loss: 4.6430
Epoch [4/5], Loss: 4.4729
Epoch [5/5], Loss: 4.6278


In [None]:
# VideoCNN
# Epoch [1/5], Loss: 6.9095
# Epoch [2/5], Loss: 6.9095
# Epoch [3/5], Loss: 6.9095
# Epoch [4/5], Loss: 6.9095
# Epoch [5/5], Loss: 6.9095
# around 4 mins per epoch
# P.S. after using relu, the loss actually shaking but i forget to take a photo and acidentally re-run -_-
# Answer: because for categorical, the batch size should be larger than one

# GRU
# Pretty much the same'
# Epoch [1/5], Loss: 6.9095
# Epoch [2/5], Loss: 6.9095
# When using softmax, the lost is always constant

## Code 2

In [20]:
import multiprocessing
print(multiprocessing.cpu_count())

16


In [21]:
import numpy as np
import torch
import torchvision.transforms as T
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from accelerate import Accelerator

from utils.dataset import load_msasl

In [22]:
batch_size = 1
num_workers = 2
num_frames = 8

num_epochs = 5
learning_rate = 0.001
num_classes = 10
test_dataset, train_dataset, validation_dataset = load_msasl("bin", num_classes)

[TEST] Loaded 81 videos with label < 10
[TRAIN] Loaded 337 videos with label < 10
[VALIDATION] Loaded 111 videos with label < 10


In [23]:
accelerator = Accelerator()

In [24]:
# A helper dataset wrapper that applies a transform to each sample.
class TransformDataset(Dataset):
    def __init__(self, dataset, transform):
        """
        Args:
            dataset (Dataset): Original dataset returning (video, label, metadata)
            transform (callable): Transformation to apply on a sample dict.
        """
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        video, label, metadata = self.dataset[idx]
        sample = {"video": video, "label": label, "metadata": metadata}
        if self.transform is not None:
            sample = self.transform(sample)
        # Return transformed video, label, and optionally metadata
        return sample["video"], sample["label"], sample.get("metadata", {})

In [25]:
def video_transform(sample, num_frames):
    
    video = sample["video"]
    # print(f"Original video shape: {video.shape}")

    # Step 1: Converting from (T, C, H, W) to (C, T, H, W)
    video = video.permute(1, 0, 2, 3)
    # print(f"After permute: {video.shape}")
    
    # Step 2: Slice the video to the desired frame
    total_frames = video.size(1)
    if total_frames > num_frames:
        frame_indices = np.linspace(0, total_frames - 1, num_frames).astype(int)
        video = video[:, frame_indices]
    else:
        video = video[:, :num_frames]

    # print(f"After slicing/padding: {video.shape}")

    # Step 3: Divide by 255 to normalize value to [0,1]
    video = video / 255.0

    # Step 4: Center Crop (224x224)
    crop_transform = T.CenterCrop(224)
    video = torch.stack([crop_transform(frame) for frame in video]) # Crop each frame

    # print(f"After cropping: {video.shape}")
    
    # Step 5: Normalize with mean=[0.45, 0.45, 0.45] and std=[0.225, 0.225, 0.225]
    # normalize_transform = T.Normalize(mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225])
    # for i, frame in enumerate(video):
    #     if i == 1:
    #         video = torch.stack([normalize_transform(frame)])
    #         print("oit")
    #     else:
    #         video = torch.stack([frame])

    # video = torch.stack([normalize_transform(frame) for frame in video])
    
    return {"video": video, "label": sample["label"], "metadata": sample["metadata"]}

In [26]:
transform = lambda sample: video_transform(sample, num_frames=num_frames)
transformed_dataset = TransformDataset(train_dataset, transform)
dataloader = DataLoader(transformed_dataset, batch_size=batch_size, shuffle=True)

print(type(dataloader))

# for videos, labels, metadata in dataloader:
#     print(f"Batch of videos shape: {videos.shape}")
#     print(f"Batch of labels: {labels}")

<class 'torch.utils.data.dataloader.DataLoader'>


In [27]:
train_dataset = TransformDataset(dataset=train_dataset, transform=lambda x: video_transform(x, num_frames))
validation_dataset = TransformDataset(dataset=validation_dataset, transform=lambda x: video_transform(x, num_frames))
test_dataset = TransformDataset(dataset=test_dataset, transform=lambda x: video_transform(x, num_frames))

# train_dataset = TransformDataset(dataset=train_dataset, transform=lambda x: {"video": video_transform(x["video"], num_frames), "label": x["label"], "metadata": x["metadata"]})
# validation_dataset = TransformDataset(dataset=validation_dataset, transform=lambda x: {"video": video_transform(x["video"], num_frames), "label": x["label"], "metadata": x["metadata"]})
# test_dataset = TransformDataset(dataset=test_dataset, transform=lambda x: {"video": video_transform(x["video"], num_frames), "label": x["label"], "metadata": x["metadata"]})

In [28]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [29]:
print(f"Training data size: {len(train_loader.dataset)}")

# for i, (video, label, metadata) in enumerate(train_loader.dataset):
#     print(f"Video shape: {video.shape}")
#     print(f"Label: {label}")
#     print(f"Metadata: {metadata}")
#     if i == 2:  # Limiting to 5 samples to check
#         break
#     print(video)

    

Training data size: 337


In [30]:
# import time
# for videos, labels, metadata in tqdm(train_loader, desc="Training batches", leave=False):
#     print("Batch loaded!")
#     sample_video = videos  # Get the first video
#     print(f"Video shape before transformation: {sample_video.shape}")

#     start_time = time.time()
#     video_transform(sample_video, num_frames)
#     print("Transformation took", time.time() - start_time, "seconds")


In [39]:
device = accelerator.device
print("Using device:", device)

Using device: cuda


In [40]:
from model.gru import GRU
model = GRU(vocab_size=num_classes, frame_count=num_frames, input_channels=3, image_size=224, hidden_dim=128)
model = model.to(device)

In [41]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [42]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    
    print("A")
    model.train()
    
    print("B")
    running_loss = 0.0
    correct = 0
    total = 0

    print("C")
    for videos, labels, _ in tqdm(dataloader.dataset, desc="Training batches", leave=False):

        print("D")
        # print(device)
        # print(type(videos))
        # print(type(labels))
        videos = videos.float().to(device)
        labels = torch.Tensor(labels).to(device)

        # print(type(videos))
        # print(type(labels))
        # print(videos)
        # print(labels)

        optimizer.zero_grad()

        outputs = model(videos)

        loss = criterion(outputs, labels)

        loss.backward()

        optimizer.step()

        running_loss += loss.item() * videos.size(0)

        _, preds = torch.max(outputs, 1)

        total += labels.size(0)

        correct += (preds == labels).sum().item()

    epoch_loss = running_loss / total if total > 0 else float('inf')
    epoch_acc = correct / total if total > 0 else 0

    return epoch_loss, epoch_acc
    

In [43]:
def validate_epoch(model, dataloader, criterion, device):
    
    model.eval()
    
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for videos, labels, _ in tqdm(dataloader.dataset, desc="Validation batches", leave=False):

            # videos, labels = videos.to(device), labels.to(device)

            print((videos))
            print(labels) 
            print(metadata)

            outputs = model(videos)

            loss = criterion(outputs, labels)

            running_loss += loss.item() * videos.size(0)

            _, preds = torch.max(outputs, 1)

            total += labels.size(0)

            correct += (preds == labels).sum().item()

    epoch_loss = running_loss / total if total > 0 else float('inf')
    epoch_acc = correct / total if total > 0 else 0

    return epoch_loss, epoch_acc
    

In [44]:
# Prepare with accelerator.
model, optimizer, train_loader, val_loader = accelerator.prepare(model, optimizer, train_loader, val_loader)

# print(f"model: {model}")
# print(f"optimizer: {optimizer}")
# print(f"train_loader: {train_loader}")
# print(f"val_loader: {val_loader}")

# for epoch in tqdm(range(num_epochs), desc="Epochs"):
#     train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
#     val_loss, val_acc = validate_epoch(model, val_loader, criterion, device)

#     accelerator.print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | Val loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

# accelerator.wait_for_everyone()

# model_name = "video_classification_model_conv_GRU.pth"

# torch.save(model.state_dict(), model_name)
# accelerator.print("Training complete. Model saved as ", model_name)

In [45]:
# DEBUG
for i, (video, label, metadata) in enumerate(tqdm(train_loader.dataset, desc="Checking dataset", leave=True)):
    # print(f"Video shape: {video.shape}")
    # print(f"Label: {label}")
    # print(f"Metadata: {metadata}")
    # print(video)
    pass

Checking dataset: 100%|██████████| 337/337 [00:07<00:00, 42.28it/s]


In [46]:
train_epoch(model, train_loader, criterion, optimizer, device)

A
B
C


                                                         

D
size: torch.Size([128, 8, 28, 28])


RuntimeError: shape '[128, 28, 6272]' is invalid for input of size 802816

In [None]:


for epoch in tqdm(range(num_epochs), desc="Epochs"):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate_epoch(model, val_loader, criterion, device)

    accelerator.print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | Val loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

accelerator.wait_for_everyone()

model_name = "video_classification_model_conv_GRU.pth"

torch.save(model.state_dict(), model_name)
accelerator.print("Training complete. Model saved as ", model_name)

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]