# Training Notebook

We first need to build torchvision from source, because a source build lets us use FFmpeg and the `video_reader` backend.

In [None]:
import shutil

# Look up the ffmpeg binary on PATH (displays its path, or nothing if it is missing).
# `shutil.which` replaces `distutils.spawn.find_executable`: a bare `import distutils`
# does not load the `spawn` submodule, and distutils was removed in Python 3.12.
shutil.which('ffmpeg')

In [None]:
# Install the FFmpeg development packages with apt (-y answers the prompts).
!apt install libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libswscale-dev libavresample-dev libavfilter-dev -y
# Remove the torchvision package that is currently installed.
!pip uninstall torchvision -y
# Delete any existing vision/ checkout, then clone the torchvision repository.
# NOTE(review): `rm -r` prints an error when vision/ does not exist yet; execution still continues.
!rm -r vision/
!git clone https://github.com/pytorch/vision.git
# Run the source install from inside the cloned repository.
%cd vision
!python setup.py install
# Return to /kaggle/working.
# NOTE(review): hard-coded absolute path - presumably this notebook targets Kaggle; confirm.
%cd /kaggle/working

## Restart the kernel (but do not reset the session)

In [None]:
# Show the metadata (version, location) of the installed torchvision package.
!pip show torchvision

In [8]:
import torchvision
import torch
from torch import nn, Tensor
from torchvision import models
import math
import gc
import random


# Seed both random number generators used below (PyTorch and Python's `random`).
torch.manual_seed(0)
random.seed(0)

# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()

# Select torchvision's `video_reader` backend for video decoding.
torchvision.set_video_backend('video_reader')

## Modelling Pipeline

In [46]:
class PatchEmbedding(nn.Module):
    """Prepend a learnable class token and add sinusoidal positional encodings.

    Args:
        d_model: size of the embedding dimension (even and odd sizes both work).
        dropout: dropout probability applied to the output.
        max_len: number of positions in the precomputed encoding table. Because
            the class token occupies one position, the input may contain at
            most ``max_len - 1`` frames.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Learnable token that is placed in front of every sequence.
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd column than even columns, so trim the
        # angles to match; for an even d_model the slice keeps every column.
        pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Registered as a buffer: it follows .to(device) and is stored in the
        # state dict, but it is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, n_frames, embedding_dim]

        Returns:
            Tensor, shape [batch_size, n_frames + 1, embedding_dim]; index 0 on
            dim 1 is the class token.

        Raises:
            RuntimeError: if ``n_frames + 1`` exceeds the number of precomputed
                positions.
        """
        b, _, _ = x.shape
        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)

        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Fail with a readable message instead of an opaque broadcast error.
            raise RuntimeError(
                f"sequence length {seq_len} (frames + class token) exceeds "
                f"max_len={self.pe.size(1)} of the positional encoding"
            )

        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)

class FeatureExtractor(nn.Module):
    """Per-frame CNN backbone whose classification layer is replaced by a
    linear projection to ``d_model`` features.

    Args:
        d_model: size of the feature vector produced for every frame.
        model_name: name of a torchvision model builder; must be one of
            ``SUPPORTED_MODELS``.
        model_weights: value forwarded to the builder's ``weights`` argument.
    """

    # Backbones whose head is stored in `.classifier`; the others use `.fc`.
    EFFICIENTNET_MODELS = ('efficientnet_v2_s', 'efficientnet_v2_m', 'efficientnet_v2_l')
    SUPPORTED_MODELS = EFFICIENTNET_MODELS + ('inception_v3', 'wide_resnet50_2', 'wide_resnet101_2')

    def __init__(self, d_model: int, model_name: str, model_weights: str = 'DEFAULT'):
        super().__init__()
        assert model_name in self.SUPPORTED_MODELS, (
            f"unsupported feature extractor {model_name!r}; expected one of {self.SUPPORTED_MODELS}"
        )
        self.model = getattr(models, model_name)(weights=model_weights)

        if model_name in self.EFFICIENTNET_MODELS:
            # The input width is read from classifier[1]; the whole classifier is
            # then replaced by a single linear layer.
            self.model.classifier = nn.Linear(in_features=self.model.classifier[1].in_features,
                                              out_features=d_model)
        else:
            self.model.fc = nn.Linear(in_features=self.model.fc.in_features, out_features=d_model)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, n_frames, channels, height, width]

        Returns:
            Tensor, shape [batch_size, n_frames, ...] where the trailing
            dimensions are whatever the backbone outputs per frame.
        """
        b, f, _, _, _ = x.shape
        # Fold frames into the batch dimension so the 2-D backbone sees images.
        # reshape (unlike view) also accepts non-contiguous input, e.g. a
        # permuted or transposed clip tensor.
        x = x.reshape(b * f, *x.shape[2:])
        x = self.model(x)
        # Some backbones return a tuple whose first element is the main output
        # (torchvision's inception_v3 returns (logits, aux_logits) in training
        # mode); keep only the main output.
        if isinstance(x, tuple):
            x = x[0]
        # Restore the [batch, frames] leading dimensions.
        x = x.reshape(b, f, *x.shape[1:])

        return x


class ConvAcTransformer(nn.Module):
    """Video classifier: per-frame CNN features -> class token + positional
    encoding -> Transformer encoder -> linear classification head.

    Args:
        d_model: feature size shared by the CNN projection and the Transformer.
        attention_heads: number of attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        num_classes: number of output classes.
        feature_extractor_name: backbone name passed to ``FeatureExtractor``.
    """

    def __init__(self, d_model: int,
                 attention_heads: int,
                 num_layers: int,
                 num_classes: int,
                 feature_extractor_name: str):
        super().__init__()
        self.d_model = d_model
        self.feature_extractor_name = feature_extractor_name
        self.attention_heads = attention_heads
        self.num_classes = num_classes
        self.num_layers = num_layers

        # The backbone is built with weights=None.
        self.feature_extract = FeatureExtractor(self.d_model, self.feature_extractor_name, model_weights=None)
        self.patch_embed = PatchEmbedding(self.d_model)

        # batch_first=True is required: the tokens arrive as
        # [batch, n_frames + 1, d_model]. With the default (batch_first=False)
        # the encoder would treat dim 0 as the sequence, attend across the
        # samples of a batch, and the class token would never see the frames
        # of its own video.
        transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=self.d_model,
                                                                nhead=self.attention_heads,
                                                                norm_first=True,
                                                                activation='gelu',
                                                                batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(transformer_encoder_layer,
                                                         self.num_layers,
                                                         norm=nn.LayerNorm(self.d_model))

        self.classification_head = nn.Linear(self.d_model, self.num_classes)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, n_frames, channels, height, width]

        Returns:
            Tensor, shape [batch_size, num_classes] with unnormalised class scores.
        """
        # extract features from all frames -> [batch, n_frames, d_model]
        x = self.feature_extract(x)

        # prepend the class token and add positions -> [batch, n_frames + 1, d_model]
        x = self.patch_embed(x)

        # Transformer encoder over the frame tokens of each sample
        x = self.transformer_encoder(x)

        # select first token/classifier token
        x = x[:, 0, :]

        # classification head
        x = self.classification_head(x)

        return x


# Smoke test: run one forward and one backward pass on random input to check
# that the shapes line up. NOTE: `model` created here is the instance that the
# training cell further down trains.
test_tensor = torch.randn(16, 15, 3, 128, 128, dtype=torch.float32).to(DEVICE)
model = ConvAcTransformer(d_model=256, attention_heads=2, num_layers=2, num_classes=101, feature_extractor_name='efficientnet_v2_s')
model = model.to(device=DEVICE)
out = model(test_tensor)
print(out.size())
# backward() returns None, so there is nothing worth binding to a name.
out.mean().backward()
print("done")
# Drop the gradients produced by the smoke test so they do not occupy memory
# until training starts, then release the test tensors.
model.zero_grad(set_to_none=True)
del test_tensor, out
gc.collect()

## Data Pipeline

#### Download the data with the next cell (comment it out if the data is already there)

In [None]:
# Install unrar, which is used below to unpack the .rar archive.
!apt install unrar 
# Download the UCF101 archive and the train/test split lists.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for these downloads.
!wget https://www.crcv.ucf.edu/data/UCF101/UCF101.rar --no-check-certificate
!wget https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip --no-check-certificate
# Unpack both archives into the current directory.
!unrar  x UCF101.rar -idq
!unzip UCF101TrainTestSplits-RecognitionTask.zip
# Delete the downloaded archives once they are unpacked.
! rm UCF101.rar UCF101TrainTestSplits-RecognitionTask.zip

In [9]:
from torchvision.datasets import UCF101
from torchvision import transforms

# The pipeline is bound to `video_transforms` rather than `transforms`:
# rebinding `transforms` would shadow the imported torchvision module, and
# re-running this cell would then fail on `transforms.Compose`.
video_transforms = transforms.Compose([
    # Move the last dimension to position 1
    # (presumably [T, H, W, C] -> [T, C, H, W]; confirm against the dataset output).
    transforms.Lambda(lambda x: x.permute(0, 3, 1, 2)),
    transforms.Lambda(lambda x: x[::2]), # keep every second frame
    transforms.RandomHorizontalFlip(p=0.5),
    # transforms.RandomResizedCrop(256, scale=(0.8, 1.0)),
    # transforms.CenterCrop(60),
    # transforms.RandomRotation(degrees=10, interpolation=transforms.InterpolationMode.BILINEAR),
    # transforms.Grayscale(),
    # transforms.GaussianBlur(kernel_size=3),
    # transforms.ColorJitter(brightness=.2, hue=.1),
    # transforms.RandomPerspective(distortion_scale=0.1),
    # transforms.AugMix(),
    # transforms.RandAugment(),

    # Scale by 1/255 (presumably uint8 pixel values -> [0, 1]) and cast to float32.
    transforms.Lambda(lambda x: x / 255.),
    transforms.Lambda(lambda x: x.float()),
])

# 30-frame clips at 128x128; after the `[::2]` transform each clip has 15 frames.
train_dataset = UCF101(root = './UCF-101/', annotation_path = './ucfTrainTestlist/', transform=video_transforms ,_video_width=128, _video_height=128, train=True, frames_per_clip=30)
# valtest_dataset = UCF101(root = './UCF-101/', annotation_path = './ucfTrainTestlist/', transform=video_transforms ,_video_width=128, _video_height=128, train=False, frames_per_clip=30)

In [10]:
# del model, x, y, loss
# gc.collect()

In [40]:

# Work on a random 1/SUBSAMPLE_FACTOR of the clips; the last N_VAL + N_TEST
# sampled indices are held out for validation and testing.
SUBSAMPLE_FACTOR = 30
N_VAL = 1000
N_TEST = 1000

indices = random.sample(range(len(train_dataset)), len(train_dataset) // SUBSAMPLE_FACTOR)
if len(indices) <= N_VAL + N_TEST:
    # Without this check the negative slices below would silently yield an
    # empty training subset.
    raise ValueError(
        f"only {len(indices)} clips sampled; need more than {N_VAL + N_TEST} "
        "to build train/val/test subsets"
    )

# Wrap into Subsets
# NOTE(review): all three subsets are drawn from the `train=True` dataset, so
# validation/test clips may come from the same videos as training clips.
train_subset = torch.utils.data.Subset(train_dataset, indices[:-(N_VAL + N_TEST)])
test_subset = torch.utils.data.Subset(train_dataset, indices[-N_TEST:])
val_subset = torch.utils.data.Subset(train_dataset, indices[-(N_VAL + N_TEST):-N_TEST])

In [41]:
def custom_collate(batch):
    """Collate (video, audio, label) samples into a batch without the audio.

    Each sample must be a 3-tuple; the middle element (audio) is discarded and
    the remaining (video, label) pairs are handed to PyTorch's default collate.
    """
    video_label_pairs = [(clip, target) for clip, _audio, target in batch]
    return torch.utils.data.dataloader.default_collate(video_label_pairs)

BATCH_SIZE = 16

# Only the training loader shuffles. Validation and test loaders keep a fixed
# order so that evaluation (including runs that stop early) is repeatable.
train_loader = torch.utils.data.DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True,
                                           num_workers=2, pin_memory=True,
                                           collate_fn=custom_collate)
val_loader = torch.utils.data.DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False,
                                         num_workers=2, pin_memory=True,
                                         collate_fn=custom_collate)
test_loader = torch.utils.data.DataLoader(test_subset, batch_size=BATCH_SIZE, shuffle=False,
                                          num_workers=2, pin_memory=True,
                                          collate_fn=custom_collate)

print(f"Total number of train samples: {len(train_subset)}")
print(f"Total number of test samples: {len(test_subset)}")
print(f"Total number of val samples: {len(val_subset)}")
print(f"Total number of (train) batches: {len(train_loader)}")
print(f"Total number of (test) batches: {len(test_loader)}")
print(f"Total number of (val) batches: {len(val_loader)}")

### Visualization

In [42]:
!pip install imageio-ffmpeg
import imageio
from IPython.display import Video
import numpy as np

# Take the first training batch, report its shapes/dtypes and write its first
# clip to ./test.mp4 so it can be played inline.
for frames, labels in train_loader:
    print("Dataset batch shape", frames.size(), frames.dtype, labels.size(),  labels.dtype )

    # Channels-last layout for the video writer, rescaled from floats to uint8.
    first_clip = frames[0].permute(0, 2, 3, 1).numpy()
    first_clip = (first_clip * 255.0).astype(np.uint8)
    class_name = train_dataset.classes[labels[0]]

    print("Example:", class_name)
    imageio.mimwrite('./test.mp4', first_clip, fps=30)
    break  # one batch is enough for the preview

Video('./test.mp4', width=256, height=256, embed=True)

## Training Pipeline

In [43]:
# Release unused GPU memory cached by PyTorch, then print the current GPU usage.
torch.cuda.empty_cache()
!nvidia-smi

In [None]:
from torch import optim
from tqdm import tqdm
import copy

# Defining model and training options
# NOTE: `model` is the instance created in the modelling cell above.
model = model.to(device=DEVICE)

N_EPOCHS = 100
LR = 0.1  # NOTE(review): looks very large for AdamW - confirm this is intended
MODEL_PATH = 'best_model_ucf101_x128_sub_30'
best_val_acc = 0
best_model = None
pretrain = False  # set to True to resume from ./{MODEL_PATH}.pt

optimizer = optim.AdamW(model.parameters(), lr=LR) #, weight_decay=0.1)
criterion = nn.CrossEntropyLoss(reduction='mean')

if pretrain:
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    checkpoint = torch.load(f"./{MODEL_PATH}.pt", map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # Resume from the stored best accuracy so that a worse epoch does not
    # overwrite the checkpoint (checkpoints without the key fall back to 0).
    best_val_acc = checkpoint.get('best_val_acc', best_val_acc)

# Training loop
for epoch in tqdm(range(N_EPOCHS), desc="Total Training Done:", leave=False):
    # TRAINING
    model.train()
    train_loss = 0.0
    t_correct, t_total = 0, 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch + 1}", position=0):
        x, y = x.to(DEVICE), y.to(DEVICE)
        y_hat = model(x)
        loss = criterion(y_hat, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        t_correct += (y_hat.argmax(dim=1) == y).sum().item()
        t_total += len(x)

    # Report the mean per-batch loss so that train and validation losses are
    # comparable even though the loaders have different numbers of batches.
    train_loss /= max(len(train_loader), 1)
    train_acc = t_correct / max(t_total, 1) * 100

    # TODO: LR DECAY

    # VALIDATION
    model.eval()
    val_loss = 0.0
    v_correct, v_total = 0, 0
    with torch.no_grad():
        for x, y in tqdm(val_loader, desc="Validating", position=0):
            x, y = x.to(DEVICE), y.to(DEVICE)
            y_hat = model(x)
            loss = criterion(y_hat, y)

            val_loss += loss.item()
            v_correct += (y_hat.argmax(dim=1) == y).sum().item()
            v_total += len(x)

    val_loss /= max(len(val_loader), 1)
    val_acc = v_correct / max(v_total, 1) * 100

    if best_val_acc < val_acc:
        best_val_acc = val_acc
        print(f"Best accuracy: {best_val_acc}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_val_acc': best_val_acc,
        }, f"./{MODEL_PATH}.pt")

        # Keep a copy of the best model on the CPU so that it does not occupy
        # GPU memory during the remaining epochs.
        best_model = copy.deepcopy(model).cpu()
        torch.cuda.empty_cache()

    print(f"Epoch {epoch + 1}/{N_EPOCHS} train loss: {train_loss:.2f}, train acc {train_acc:.2f} val loss: {val_loss:.2f}")
    print(f"Val accuracy: {val_acc:.2f}%")
        



In [45]:
# Free the per-batch temporaries left over from training. `model` and
# `best_model` are deliberately kept: the test cells below evaluate both, and
# deleting them here makes a top-to-bottom run fail with NameError.
for _name in ('x', 'y', 'y_hat', 'loss'):
    # pop() instead of `del` so the cell can be re-run without errors.
    globals().pop(_name, None)
gc.collect()
torch.cuda.empty_cache()
!nvidia-smi

In [None]:
# val_acc, val_loss, train_loss

In [None]:
# import gc
# # del model
# del best_model
# gc.collect()


In [None]:
# Test loop on latest model
# torch.cuda.empty_cache()
# Evaluate the model as it is after the last training epoch.
model.eval()
test_loss = 0.0
correct = total = 0
with torch.no_grad():
    for x, y in tqdm(test_loader, desc="Testing", position=0, leave=True):
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        y_hat = model(x)
        loss = criterion(y_hat, y)

        # Running sums: per-batch loss, correct predictions, samples seen.
        test_loss += loss.item()
        correct += (y_hat.argmax(dim=1) == y).sum().item()
        total += x.size(0)

        # Stop once more than 1000 samples have been evaluated.
        if total > 1000:
            break

print("Latest Model")
print(f"Test loss: {test_loss:.2f}")
print(f"Test accuracy: {correct / total * 100:.2f}%")

In [None]:
# Test loop on best model
# torch.cuda.empty_cache()
# test_model = ConvAcTransformer(d_model=512, attention_heads=4, num_layers=4, num_classes=101, feature_extractor_name='efficientnet_v2_s')
# test_model = test_model.to(device=DEVICE)
# MODEL_PATH = 'best_model_ucf101_x128'

# criterion = nn.CrossEntropyLoss()
# checkpoint = torch.load(f"./{MODEL_PATH}.pt")
# test_model.load_state_dict(checkpoint['model_state_dict'])

correct, total = 0, 0
test_loss = 0.0
# The training cell moves `best_model` to the CPU after saving the checkpoint;
# bring it to the device the batches are sent to, otherwise the forward pass
# fails with a device mismatch on GPU machines.
best_model = best_model.to(DEVICE)
best_model.eval()
# Evaluation needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing", position=0, leave=True):
        x, y = batch
        x, y = x.to(DEVICE), y.to(DEVICE)
        y_hat = best_model(x)
        loss = criterion(y_hat, y) # / len(x)
        test_loss += loss.item()

        correct += (y_hat.argmax(dim=1) == y).sum().item()
        total += len(x)

        # Same sample cap as the latest-model test cell so that both results
        # are computed over the same amount of data.
        if total > 1000:
            break

print("Best Model")
print(f"Test loss: {test_loss:.2f}")
print(f"Test accuracy: {correct / total * 100:.2f}%")