In [None]:
%mkdir dataset
%cd dataset
!gdown 1N93rb_uFqKRZ9naX8CXShFt5RJHOmjZH
!unzip -q rwf-2000.zip

In [None]:
# Step back out of ./dataset so that later relative paths such as
# "./dataset/rwf-2000" resolve from the directory the notebook started in.
%cd ..

# Import libraries

In [None]:
import os
import time

import torch
import torch.nn as nn
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from tqdm import tqdm
from transformers import VivitConfig, VivitForVideoClassification

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings

warnings.filterwarnings("ignore")

In [None]:
import logging

# --- Reproducibility --------------------------------------------------------
# Seed the CPU and CUDA generators and ask cuDNN for deterministic kernels
# (autotuning is switched off because it can pick different algorithms per run).
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Logging ----------------------------------------------------------------
# Raise the root logger to INFO and emit bare messages (no level/name prefix);
# LOGGER is the named logger used by the training loop.
logging.getLogger().setLevel(logging.INFO)
logging.basicConfig(format="%(message)s", level=logging.INFO)
LOGGER = logging.getLogger("Torch-Cls")

# Model

In [None]:
class Model(nn.Module):
    """ViViT video classifier initialised from the Kinetics-400 checkpoint.

    Wraps ``VivitForVideoClassification`` so that it accepts clips laid out as
    ``(batch, channels, frames, height, width)`` and returns the raw logits.

    Args:
        num_classes (int): Number of output classes of the classification head.
        image_size (int): Frame size written to the ViViT config.
        num_frames (int): Number of frames per clip written to the ViViT config.
    """

    def __init__(self, num_classes=2, image_size=224, num_frames=15):
        super().__init__()
        cfg = VivitConfig()
        # The Hugging Face config attribute that sizes the classifier is
        # `num_labels`. Assigning `num_classes` only creates an unused
        # attribute, leaving the head at the config's default size no matter
        # what the caller asked for.
        cfg.num_labels = num_classes
        cfg.image_size = image_size
        cfg.num_frames = num_frames

        # ignore_mismatched_sizes lets the checkpoint load even though some
        # tensors (e.g. the classifier) no longer have the checkpoint's shape;
        # those tensors are newly initialised and must be trained.
        self.vivit = VivitForVideoClassification.from_pretrained(
            "google/vivit-b-16x2-kinetics400",
            config=cfg,
            ignore_mismatched_sizes=True,
        )

    def forward(self, x_3d):
        """Return class logits for a batch of clips.

        Args:
            x_3d (torch.Tensor): Clips shaped ``(bs, C, T, H, W)``.

        Returns:
            torch.Tensor: The ``logits`` field of the ViViT output.
        """
        # (bs, C, T, H, W) -> (bs, T, C, H, W), the layout passed to ViViT
        x_3d = x_3d.permute(0, 2, 1, 3, 4)

        out = self.vivit(x_3d)

        return out.logits

In [None]:
# Example of how to use the model
model = Model(num_classes=2, num_frames=15)

# Check param
param = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model has {param} parameters")

# Test the model with a random input (batch_size, channels, frames, height, width)
inputs = torch.rand(1, 3, 15, 224, 224)

# This is only a shape check, so skip autograd bookkeeping: no activations are
# kept for a backward pass that never happens.
with torch.no_grad():
    output = model(inputs)

print(output.shape)

Some weights of VivitForVideoClassification were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized because the shapes did not match:
- vivit.embeddings.position_embeddings: found shape torch.Size([1, 3137, 768]) in the checkpoint and torch.Size([1, 1373, 768]) in the model instantiated
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model has 87293186 parameters
torch.Size([1, 2])


In [None]:
# Drop the smoke-test objects so their memory can be reclaimed; a fresh model
# is instantiated later for the actual training run.
del model, inputs, output

# Load dataset

In [None]:
class VideoDataset(Dataset):
    """Dataset of videos stored as directories of frame images.

    The directory layout read by this class is
    ``root_dir/<phase>/<class_name>/<video_name>/<frame files>``.
    Class ids are assigned by the alphabetical order of the class directory
    names, so every phase maps the same class name to the same id.
    """

    def __init__(self, root_dir, phase="train", transform=None, n_frames=None):
        """
        Args:
            root_dir (string): Directory with all the videos (each video as a subdirectory of frames).
            phase (string): Sub-directory of ``root_dir`` to read (e.g. "train" or "val").
            transform (callable, optional): Optional transform to be applied on each frame.
            n_frames (int, optional): Number of frames to sample from each video, uniformly. If None, use all frames.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.n_frames = n_frames
        self.phase = phase
        self.videos, self.labels = self._load_videos()

    @staticmethod
    def _frame_number(path):
        """Sort key for a frame path: (digits of the file name as an int, file name).

        The file name is included so that frames sharing a number are still
        ordered deterministically. Raises ValueError if the file name contains
        no digit at all.
        """
        name = os.path.basename(path)
        return int("".join(filter(str.isdigit, name))), name

    def _load_videos(self):
        """Scan the phase directory and return ``(videos, labels)``.

        ``videos`` is a list with one list of frame paths per video and
        ``labels`` holds the matching integer class ids.
        """
        videos, labels = [], []
        phase_dir = os.path.join(self.root_dir, self.phase)

        # os.listdir returns entries in arbitrary order; sort so that class ids
        # are identical across phases and runs. Non-directory entries (stray
        # files next to the class folders) are ignored.
        class_names = sorted(
            name
            for name in os.listdir(phase_dir)
            if os.path.isdir(os.path.join(phase_dir, name))
        )

        for class_id, class_name in enumerate(class_names):
            class_dir = os.path.join(phase_dir, class_name)

            # Sorted for a reproducible sample order.
            for video_name in sorted(os.listdir(class_dir)):
                video_folder = os.path.join(class_dir, video_name)
                if not os.path.isdir(video_folder):
                    continue

                frames = sorted(
                    (os.path.join(video_folder, f) for f in os.listdir(video_folder)),
                    key=self._frame_number,
                )

                if self.n_frames:
                    frames = self._uniform_sample(frames, self.n_frames)

                videos.append(frames)
                labels.append(class_id)

        return videos, labels

    def _uniform_sample(self, frames, n_frames):
        """
        Helper method to uniformly sample n_frames from the frames list.

        Picks evenly spaced positions over the whole list, so the sample spans
        the full video even when ``len(frames)`` is not a multiple of
        ``n_frames``. When the video has fewer than ``n_frames`` frames, frames
        are repeated so that exactly ``n_frames`` entries are still returned.
        Returns an empty list when there is nothing to sample.
        """
        total = len(frames)
        if total == 0 or n_frames <= 0:
            return []
        return [frames[(i * total) // n_frames] for i in range(n_frames)]

    def __len__(self):
        """Number of videos found for this phase."""
        return len(self.videos)

    def __getitem__(self, idx):
        """Load one video and return ``(data, label)``.

        ``data`` stacks the transformed frames and is permuted to (C, T, H, W);
        ``label`` is the integer class id. The transform must turn each frame
        into a tensor for the stacking to work.
        """
        video_frames = self.videos[idx]
        label = self.labels[idx]
        images = []
        for frame_path in video_frames:
            # The context manager releases the file handle even if decoding
            # fails; convert() returns an image that outlives the open file.
            with Image.open(frame_path) as frame:
                image = frame.convert("RGB")
            if self.transform:
                image = self.transform(image)
            images.append(image)

        # Stack images along new dimension (sequence length)
        data = torch.stack(images, dim=0)

        # Rearrange to have the shape (C, T, H, W)
        data = data.permute(1, 0, 2, 3)
        return data, label

In [None]:
BATCH_SIZE = 2
MAX_LEN = 15
IMAGE_SIZE = 224
# Single source of truth for the dataset location used by both splits.
DATA_ROOT = "./dataset/rwf-2000"


# Resize every frame to IMAGE_SIZE x IMAGE_SIZE and convert it to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ToTensor(),
    ]
)

# Load dataset
train_dataset = VideoDataset(
    root_dir=DATA_ROOT, phase="train", transform=transform, n_frames=MAX_LEN
)

val_dataset = VideoDataset(
    root_dir=DATA_ROOT, phase="val", transform=transform, n_frames=MAX_LEN
)

# Count number of cpus
# os.cpu_count() may return None when the count cannot be determined, and
# DataLoader does not accept num_workers=None, so fall back to one worker.
cpus = os.cpu_count() or 1
print(f"Number of cpus: {cpus}")

# Create data loaders
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, num_workers=cpus, shuffle=True
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, num_workers=cpus, shuffle=False
)

# test: fetch a single batch to confirm the tensor layout
for data, label in train_loader:
    print(data.shape, label)
    break

Number of cpus: 80
torch.Size([2, 3, 15, 224, 224]) tensor([0, 0])


# Train model

In [None]:
def colorstr(*parts):
    """Wrap text in ANSI escape sequences.

    The last positional argument is the text; every argument before it names a
    colour or style to apply, e.g. ``colorstr("red", "bold", "hi")``. When only
    one argument is given it is rendered blue and bold. The reset sequence is
    always appended.
    """
    if len(parts) > 1:
        *styles, text = parts
    else:
        styles, text = ("blue", "bold"), parts[0]

    # Style name -> numeric parameter of the "\033[<n>m" escape sequence.
    sgr = {
        # basic colors
        "black": 30,
        "red": 31,
        "green": 32,
        "yellow": 33,
        "blue": 34,
        "magenta": 35,
        "cyan": 36,
        "white": 37,
        # bright colors
        "bright_black": 90,
        "bright_red": 91,
        "bright_green": 92,
        "bright_yellow": 93,
        "bright_blue": 94,
        "bright_magenta": 95,
        "bright_cyan": 96,
        "bright_white": 97,
        # misc
        "end": 0,
        "bold": 1,
        "underline": 4,
    }

    prefix = "".join(f"\033[{sgr[name]}m" for name in styles)
    reset = f"\033[{sgr['end']}m"
    return prefix + f"{text}" + reset

In [None]:
def train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    num_epochs=25,
    device="cuda",
    return_history=False,
):
    """
    Function to train the model.

    Runs ``num_epochs`` epochs, each made of a training pass over
    ``train_loader`` followed by a validation pass over ``val_loader``, and
    logs running loss/accuracy on a progress bar.

    Parameters:
    - model: The neural network model to train.
    - train_loader: DataLoader for the training set.
    - val_loader: DataLoader for the validation set.
    - criterion: The loss function.
    - optimizer: The optimization algorithm.
    - num_epochs: Number of epochs to train for.
    - device: The device to run the training on, 'cuda' or 'cpu'.
    - return_history: When True, also return the per-epoch history dict.

    Returns:
    - model: The trained model, carrying the weights of the last epoch (not
      necessarily those of the best validation epoch).
    - history (only when ``return_history`` is True): dict with the lists
      "train_loss", "train_acc", "val_loss", "val_acc" and "lr" (one entry per
      epoch), "best_epoch" (set once a validation accuracy above 0 is seen)
      and "INFO" (a one-line summary of the run).
    """
    since = time.time()

    history = {
        "train_loss": [],
        "train_acc": [],
        "val_loss": [],
        "val_acc": [],
        "lr": [],
    }
    best_val_acc = 0.0

    # Send the model to the specified device
    model.to(device)

    # Loop over the dataset multiple times
    for epoch in range(num_epochs):
        LOGGER.info(colorstr(f"Epoch {epoch}/{num_epochs-1}:"))

        # Learning rate of the first parameter group at the start of the epoch.
        history["lr"].append(optimizer.param_groups[0]["lr"])

        # Each epoch has a training and validation phase
        for phase in ["train", "val"]:
            is_train = phase == "train"
            header_color, header_title = (
                ("bright_yellow", "Training:")
                if is_train
                else ("bright_green", "Validation:")
            )
            LOGGER.info(
                colorstr(header_color, "bold", "\n%20s" + "%15s" * 3)
                % (header_title, "gpu_mem", "loss", "acc")
            )
            # train(True) enables training mode, train(False) is eval mode.
            model.train(is_train)

            running_items = 0
            running_loss = 0.0
            running_corrects = 0
            # Defined up front so an empty loader records 0.0 instead of
            # failing on an undefined name after the loop.
            epoch_loss = 0.0
            epoch_acc = 0.0

            # Use the appropriate data loader
            data_loader = train_loader if is_train else val_loader

            progress = tqdm(
                data_loader,
                total=len(data_loader),
                bar_format="{desc} {percentage:>7.0f}%|{bar:10}{r_bar}{bar:-10b}",
                unit="batch",
            )

            # Iterate over data.
            for inputs, labels in progress:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Forward
                # Track history only in train
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only if in training phase
                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # Statistics, kept as plain Python numbers so the history and
                # the best-accuracy comparison never hold device tensors.
                batch_items = inputs.size(0)
                running_items += batch_items
                running_loss += loss.item() * batch_items
                running_corrects += (preds == labels).sum().item()

                epoch_loss = running_loss / running_items
                epoch_acc = running_corrects / running_items

                mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}GB"
                desc = ("%35s" + "%15.6g" * 2) % (
                    mem,
                    epoch_loss,
                    epoch_acc,
                )
                progress.set_description_str(desc)

            if is_train:
                history["train_loss"].append(epoch_loss)
                history["train_acc"].append(epoch_acc)
            else:
                history["val_loss"].append(epoch_loss)
                history["val_acc"].append(epoch_acc)
                if epoch_acc > best_val_acc:
                    best_val_acc = epoch_acc
                    history["best_epoch"] = epoch

                # Same logger as the rest of the output so lines stay in order.
                LOGGER.info(f"Best val Acc: {best_val_acc:.4f}")

    time_elapsed = time.time() - since
    history["INFO"] = (
        "Training complete in {:.0f}h {:.0f}m {:.0f}s with {} epochs - Best val Acc: {:.4f}".format(
            time_elapsed // 3600,
            time_elapsed % 3600 // 60,
            time_elapsed % 60,
            num_epochs,
            best_val_acc,
        )
    )
    LOGGER.info(history["INFO"])

    if return_history:
        return model, history
    return model

In [None]:
# Example usage (assuming you have defined your criterion and optimizer):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model from the same constants that configure the data pipeline, so
# the clip length and frame size the model is sized for cannot drift from what
# the loaders actually produce.
model = Model(num_classes=2, image_size=IMAGE_SIZE, num_frames=MAX_LEN)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

trained_model = train_model(
    model, train_loader, val_loader, criterion, optimizer, num_epochs=20, device=device
)

Some weights of VivitForVideoClassification were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized because the shapes did not match:
- vivit.embeddings.position_embeddings: found shape torch.Size([1, 3137, 768]) in the checkpoint and torch.Size([1, 1373, 768]) in the model instantiated
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mEpoch 0/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m
                             8.23GB       0.493009        0.77625     100%|██████████| 800/800 [03:48<00:00,  3.50batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
   

Best val Acc: 0.785000


                             8.23GB       0.284058         0.8875     100%|██████████| 800/800 [03:50<00:00,  3.46batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.429285          0.815     100%|██████████| 200/200 [00:26<00:00,  7.45batch/s]
[34m[1mEpoch 2/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.815000


                             8.23GB        0.11948           0.96     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.516186          0.825     100%|██████████| 200/200 [00:26<00:00,  7.51batch/s]
[34m[1mEpoch 3/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0669346       0.975625     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB        0.64795            0.8     100%|██████████| 200/200 [00:26<00:00,  7.52batch/s]
[34m[1mEpoch 4/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0407663         0.9875     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.736764         0.7975     100%|██████████| 200/200 [00:26<00:00,  7.44batch/s]
[34m[1mEpoch 5/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0392338         0.9875     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.761056         0.7875     100%|██████████| 200/200 [00:26<00:00,  7.45batch/s]
[34m[1mEpoch 6/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0455792       0.985625     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.959702            0.8     100%|██████████| 200/200 [00:26<00:00,  7.50batch/s]
[34m[1mEpoch 7/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0329469        0.98875     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.788719         0.7975     100%|██████████| 200/200 [00:27<00:00,  7.40batch/s]
[34m[1mEpoch 8/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB     0.00339674              1     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.817275            0.8     100%|██████████| 200/200 [00:26<00:00,  7.44batch/s]
[34m[1mEpoch 9/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0503576        0.98375     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.648725         0.8175     100%|██████████| 200/200 [00:26<00:00,  7.41batch/s]
[34m[1mEpoch 10/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0308527           0.99     100%|██████████| 800/800 [03:50<00:00,  3.48batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.767059         0.7925     100%|██████████| 200/200 [00:26<00:00,  7.51batch/s]
[34m[1mEpoch 11/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0384502       0.985625     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.796227          0.805     100%|██████████| 200/200 [00:26<00:00,  7.41batch/s]
[34m[1mEpoch 12/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0174867         0.9925     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.973774          0.785     100%|██████████| 200/200 [00:26<00:00,  7.43batch/s]
[34m[1mEpoch 13/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB     0.00625249       0.998125     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.833754         0.8125     100%|██████████| 200/200 [00:26<00:00,  7.47batch/s]
[34m[1mEpoch 14/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0454068       0.985625     100%|██████████| 800/800 [03:50<00:00,  3.46batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.830386         0.8075     100%|██████████| 200/200 [00:26<00:00,  7.51batch/s]
[34m[1mEpoch 15/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0167301          0.995     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.806146         0.7975     100%|██████████| 200/200 [00:26<00:00,  7.48batch/s]
[34m[1mEpoch 16/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0180276          0.995     100%|██████████| 800/800 [03:50<00:00,  3.48batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.935627            0.8     100%|██████████| 200/200 [00:26<00:00,  7.47batch/s]
[34m[1mEpoch 17/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.825000


                             8.23GB      0.0208631       0.993125     100%|██████████| 800/800 [03:49<00:00,  3.48batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.837694         0.8475     100%|██████████| 200/200 [00:26<00:00,  7.52batch/s]
[34m[1mEpoch 18/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.847500


                             8.23GB      0.0268184         0.9925     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB       0.843478            0.8     100%|██████████| 200/200 [00:26<00:00,  7.41batch/s]
[34m[1mEpoch 19/19:[0m
[93m[1m
           Training:        gpu_mem           loss            acc[0m


Best val Acc: 0.847500


                             8.23GB     0.00483894        0.99875     100%|██████████| 800/800 [03:50<00:00,  3.47batch/s]
[92m[1m
         Validation:        gpu_mem           loss            acc[0m
                             8.23GB         1.2326         0.7725     100%|██████████| 200/200 [00:27<00:00,  7.38batch/s]

Best val Acc: 0.847500



