In [1]:
#Library import
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import ViTModel
from transformers import ViTImageProcessor
import math
import os
import imageio
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path
import logging
from datetime import datetime
!pip install tensorboard

2024-03-11 12:04:46.630423: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-11 12:04:46.630541: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-11 12:04:46.736275: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered




In [2]:
#Kaggle requirement
!pip install imageio[ffmpeg]

Collecting imageio-ffmpeg (from imageio[ffmpeg])
  Downloading imageio_ffmpeg-0.4.9-py3-none-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading imageio_ffmpeg-0.4.9-py3-none-manylinux2010_x86_64.whl (26.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.9/26.9 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: imageio-ffmpeg
Successfully installed imageio-ffmpeg-0.4.9


In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, seq_len, temperature = 10000):
        '''
        d_model: feature dimension (default = 768)
        seq_len: sequence length
        '''
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.temperature = temperature


    def forward(self):
        pos = torch.arange(self.seq_len, dtype=torch.float32).unsqueeze(1)              # pos = [[0], [1], ..., [seq_len-1]]
        i = torch.arange(self.d_model // 2, dtype=torch.float32).unsqueeze(0)           # i = [[0, 1, ..., d_model/2 - 1]]

        # Compute the positional encodings
        angle_rates = 1 / (self.temperature ** (2 * i / self.d_model))
        pos_encoding = torch.zeros(self.seq_len, self.d_model, dtype=torch.float32)
        pos_encoding[:, 0::2] = torch.sin(pos * angle_rates)
        pos_encoding[:, 1::2] = torch.cos(pos * angle_rates)

        # Add a dimension for batch size
        pos_encoding = pos_encoding.unsqueeze(0)

        # Disable gradient because PE are not learnable parameters
        pos_encoding.requires_grad_(False)

        return pos_encoding.to(get_device_available())     # pos_encoding = [1, seq_len, 768]

class InputEmbeddings(nn.Module):
    def __init__(self):
        super().__init__()
        model_name = 'google/vit-base-patch16-224'
        self.emb_model = ViTModel.from_pretrained(model_name)
        self.emb_processor = ViTImageProcessor.from_pretrained(model_name)
        self.emb_model.to(get_device_available())

    def forward(self, frames):
        '''
        frames: sequence of PIL Image (batch, seq_len, channel, width, height)
        input_embed: (batch, seq_len, d_model)
        '''
        #input_embeds holds embedded frames of the batch (input_embeds = [batch, seq_len, d_model])
        input_embeds = []
        for i in range(frames.size(dim=0)):
          #input_embed holds embedded frames of one sample
          input_embed = []
          single_sample = frames[i]
          for frame in single_sample:
              #Embed frames
              inputs = self.emb_processor(images=frame, return_tensors='pt')
              pixel_values = inputs.pixel_values.to(get_device_available())                      # pixel_values = [1, 3, 224, 224]
              with torch.no_grad():
                  output = self.emb_model(pixel_values)
                  # Get the representation of the entire frame
                  output = output.last_hidden_state.mean(dim=1)       # shape = [d_model]
                  input_embed.append(output)
          input_embed = torch.cat(input_embed, dim=0)                 # input_embed = [seq_len, d_model]
          input_embeds.append(input_embed)
        input_embeds = torch.stack(input_embeds)
        d_model = input_embed.shape[-1]
        # Scale the embeddings
        input_embeds = input_embeds * math.sqrt(d_model)

        return input_embeds        # input_embed = [batch, seq_len, d_model]

class LayerNormalization(nn.Module):
    def __init__(self, d_model, epsilon=10**-6):
        super().__init__()
        self.epsilon = epsilon
        self.alpha = nn.Parameter(torch.ones(d_model))      # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(d_model))      # bias is a learnable parameter

    def forward(self, x):
        '''
        Args:
            x: (batch, seq_len, d_model)
            return: normalized x (batch, seq_len, d_model)
        '''
        mean = x.mean(dim=-1, keepdim=True)         # (batch, seq_len, 1)
        std = x.std(dim=-1, keepdim=True)           # (batch, seq_len, 1)
        return self.alpha * (x-mean) / (std + self.epsilon) + self.bias     # (batch, seq_len, d_model)

class FeedForwardBlock(nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.linear_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''
        x: (batch, seq_len, d_model)
        '''
        output = self.linear_1(x)       # output: (batch, seq_len, d_ff)
        output = torch.relu(output)
        output = self.dropout(output)
        output = self.linear_2(output)  # output: (batch, seq_len, d_model)

        return output

class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads

        w_q = [nn.Linear(d_model, d_model) for _ in range(num_heads)]      # w_q = (num_heads, d_model, d_model)
        w_k = [nn.Linear(d_model, d_model) for _ in range(num_heads)]      # w_k = (num_heads, d_model, d_model)
        w_v = [nn.Linear(d_model, d_model) for _ in range(num_heads)]      # w_v = (num_heads, d_model, d_model)

        self.w_q = nn.ModuleList(w_q)
        self.w_k = nn.ModuleList(w_k)
        self.w_v = nn.ModuleList(w_v)
        self.w_o = nn.Linear(num_heads * d_model, d_model, bias=False)      # w_o = (num_heads * d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''
        x: (batch, seq_len, d_model)
        '''
        q = [layer(x) for layer in self.w_q]        # q[i] = (batch, seq_len, d_model)
        k = [layer(x) for layer in self.w_k]        # k[i] = (batch, seq_len, d_model)
        v = [layer(x) for layer in self.w_v]        # v[i] = (batch, seq_len, d_model)
        q, k, v = torch.stack(q), torch.stack(k), torch.stack(v)        # q, k, v = (num_heads, batch, seq_len, d_model)
        q, k, v = q.permute(1, 0, 2, 3), k.permute(1, 0, 2, 3), v.permute(1, 0, 2, 3)                              # q, k, v = (batch, num_heads, seq_len, d_model)

        k_transpose = k.transpose(-2, -1)           # k_transpose = (batch, num_heads, d_model, seq_len)
        attention_scores = q @ k_transpose          # attention_score = (batch, num_heads, seq_len, seq_len)

        # Normalise the attention scores
        attention_scores = attention_scores / self.d_model      # attention_scores = (batch, num_heads, seq_len, seq_len)
        # Apply softmax to attention scores
        attention_scores = attention_scores.softmax(dim=-1)     # attention_scores = (batch, num_heads, seq_len, seq_len)

        # Dropout
        if self.dropout is not None:
            attention_scores = self.dropout(attention_scores)   # attention_scores = (batch, num_heads, seq_len, seq_len)

        # Calculate all heads
        heads = attention_scores @ v                            # heads = (batch, num_heads, seq_len, d_model)

        # Concatenate heads along the seq_len dimension
        heads = heads.transpose(1, 2)                                           # heads = (batch, seq_len, num_heads, d_model)
        #heads_values is a copy of heads (Keep the original head before output calculation)
        heads_values = heads.clone()
        heads = heads.contiguous().view(heads.shape[0], heads.shape[1], -1)     # heads = (batch, seq_len, num_heads * d_model)

        # Linear transform with output weights
        output = self.w_o(heads)                                # output = (batch, seq_len, d_model)

        # Compute SCL loss
        scl_module = SCL()
        scl_value = scl_module(heads_values)
        return output, scl_value

class AddNormBlock(nn.Module):
    def __init__(self, d_model, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(d_model)

    def forward(self, x, sublayer, *args, **kwargs):
        """
        Apply residual connection to any sublayer with the same size.
        x: Input tensor
        sublayer: A function representing the sublayer (e.g., multi-head attention, feed-forward)
        args: Additional positional arguments for the sublayer
        kwargs: Additional keyword arguments for the sublayer
        """
        output = sublayer(x)
        if isinstance(output, tuple):
            #If output not a single tensor -> MHSA (SCL loss is calculated along MHSA output)
            output, _ = output
        return self.norm(x + self.dropout(output))

class EncoderBlock(nn.Module):
    def __init__(self, mhsa_block: MultiHeadAttentionBlock,
                 feed_forward_block: FeedForwardBlock,
                 d_model: int,
                 dropout: float):
        super().__init__()
        self.mhsa_block = mhsa_block
        self.feed_forward_block = feed_forward_block
        self.add_norm_block = nn.ModuleList([AddNormBlock(d_model, dropout) for _ in range(2)])

    def forward(self, x):
        '''
        Args:
            x: input [batch, seq_len, d_model]
        '''
        #Get SCL loss for input x
        _, loss = self.mhsa_block(x)
        x = self.add_norm_block[0](x, lambda x: self.mhsa_block(x))
        x = self.add_norm_block[1](x, self.feed_forward_block)
        return x, loss

class Encoder(nn.Module):
    def __init__(self, d_model: int, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(d_model)

    def forward(self, x):
        loss = None               #Sum up all losses as final loss => Update whole model
        for layer in self.layers:
          if loss == None:
            #Init loss
            x, loss = layer(x)
          else:
            #Add up tmp_loss to loss
            x, tmp_loss = layer(x)
            loss += tmp_loss

        return self.norm(x), loss

class PredictionLayer(nn.Module):
    def __init__(self, d_model):
        '''
        Args:
            d_model: feature dimension of an input embedding
        '''
        super().__init__()
        self.fc = nn.Linear(d_model, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        '''
        Take the Transformer Encoder's output of the last frame in the sequence to predict the embedding of the next frame
        Args:
            x: (batch, seq_len, d_model)
            return: (batch, d_model)
        '''
        # Get the last row, which is the Attention encoded representation of the last frame
        x = x[:, -1, :]                 # x = (batch, d_model)
        x = self.relu(self.fc(x))       # x = (batch, d_model)

        return x

class SCPModel(nn.Module):
    def __init__(self, encoder: Encoder, pred_layer: PredictionLayer, src_embed: InputEmbeddings, src_pos: PositionalEncoding):
        super().__init__()
        self.encoder = encoder
        self.pred_layer = pred_layer
        self.src_embed = src_embed
        self.src_pos = src_pos

    def forward(self, src):
        '''
        Args:
            src: 'n' frames
        '''

        # Get input embedding
        src = self.src_embed(src)        # src = (batch, seq_len, d_model)

        # Get positional encoding
        pos_encoding = self.src_pos()       # pos_encoding = (1, seq_len, d_model)

        # Add input embedding + positional encoding to generate the complete input
        input = src + pos_encoding          # input = (batch, seq_len, d_model)

        # Get output and loss from the encoder module
        output, loss = self.encoder(input)        # output = (batch, seq_len, d_model), SCL_value

        # Get output from the prediction layer module
        output = self.pred_layer(output)    # output = (batch, embed_num_features * d_model)

        return output, loss

def build_model(d_model, seq_len, N = 6, h = 8, dropout = 0.1, d_ff = 2048, device='cpu'):
    '''
    d_model: feature dimension of an input embedding
    seq_len: length of the input sequence
    N: number of encoder blocks in the model
    h: number of heads for multi-head self-attention
    d_ff: the dimension of the hidden layer of Feed Forward Block
    '''
    # Input embedding layer
    src_embed = InputEmbeddings().to(device)

    # Positional encoding layer
    pos_enc = PositionalEncoding(d_model, seq_len).to(device)

    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        mhsa_block = MultiHeadAttentionBlock(d_model, h, dropout).to(device)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout).to(device)
        encoder_block = EncoderBlock(mhsa_block, feed_forward_block, d_model, dropout).to(device)
        encoder_blocks.append(encoder_block)

    # Create the encoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks)).to(device)

    # Create the prediction layer
    pred_layer = PredictionLayer(d_model).to(device)

    # Create the Semantic Concentration Encoder
    model = SCPModel(encoder, pred_layer, src_embed, pos_enc).to(device)

    # Initialise the parameters of the model
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model

In [4]:
class MSE(nn.Module):
    def __init__(self, norm_dim = None):
        """
        Args:
            norm_dim: the dimension that we want to normalize the ground truth and the prediction
        """
        super().__init__()
        self.norm_dim = norm_dim

    def forward(self, pred, gt):
        """
        Args:
            pred: prediction made by model, tensor with shape [batch, d_model]
            gt: ground truth value, tensor with shape [batch, d_model]
        """
        if self.norm_dim is not None:
            pred = F.normalize(pred, p=2, dim=self.norm_dim)
            gt = F.normalize(gt, p=2, dim=self.norm_dim)

        squared_error = torch.square(pred - gt)     # squared_error = [batch, d_model]
        mse = torch.mean(squared_error)             # mse = [] (scalar tensor)
        return mse

# ---- Semantic Concentration Loss ----
class SCL(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, heads):
        '''
        Compute the  dissimilarity between each head in each batch and then calculate the average across all batches.
        Args:
            heads = (batch, num_heads, seq_len, d_model)
        Return:
            Average cosine similarity between batches
        '''
        num_batches, num_heads, _, _ = heads.shape
        if num_heads <= 1:
            raise ValueError("There must be at least 2 heads to compute SCL.")

        loss = 0
        for batch in range(num_batches):
            batch_scl = count = 0
            for head1 in range(num_heads - 1):
                for head2 in range(head1 + 1, num_heads):
                    # Flatten the head tensors
                    head_tensor1 = heads[batch, head1].flatten()        # head_tensor1 = (seq_len * d_model)
                    head_tensor2 = heads[batch, head2].flatten()        # head_tensor2 = (seq_len * d_model)

                    # Compute SCL loss for the current pair of heads
                    similarity = F.cosine_similarity(head_tensor1.unsqueeze(0), head_tensor2.unsqueeze(0))
                    scl = 1 - similarity
                    # Calculate the total Semantic Concentration Loss for the current batch
                    batch_scl += scl
                    count += 1

            batch_scl /= count
            loss += batch_scl

        loss /= num_batches
        return loss

In [5]:
class KTHDataset(Dataset):
    def __init__(self, root_dir, N):
        '''
        root_dir: directory of the dataset
        N: number of past frames
        '''
        self.root_dir = root_dir
        self.N = N
        self.samples = []

        #Init embedding model when create instance of dataset -> Avoid model retrieval loops
        model_name = 'google/vit-base-patch16-224'
        self.emb_model = ViTModel.from_pretrained(model_name)
        self.emb_processor = ViTImageProcessor.from_pretrained(model_name)
        self.emb_model.to(get_device_available())

        # Iterate through each category folder and collect video paths
        for category in os.listdir(root_dir):
            category_folder = os.path.join(root_dir, category)
            if os.path.isdir(category_folder):
                for video_file in os.listdir(category_folder):
                    video_path = os.path.join(category_folder, video_file)
                    # Get the total number of frames of the current video
                    num_frames = self.get_total_frames(video_path)
                    # Add each possible sequence in this video
                    for i in range(num_frames - N):
                        self.samples.append((video_path, i))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        video_path, start_frame = self.samples[idx]

        # Get N+1 frames of the video_path, starting from the "start_frame" index
        frames = self.read_frames(video_path, start_frame, self.N + 1)

        # Process each frame to have the shape [channels, width, height]
        processed_frames = [self.process_frame(frame) for frame in frames]

        # Stack N frames for input data
        data = torch.stack(processed_frames[:-1])   # data = [N, channels, width, height]

        # Output data (label) is the ViT processed features of the last frame
        last_frame = processed_frames[-1]                # last_frame = [channels, width, height]
        inputs = self.emb_processor(images=last_frame, return_tensors='pt')
        pixel_values = inputs.pixel_values.to(get_device_available())
        with torch.no_grad():
            label = self.emb_model(pixel_values)
            # Representation of the entire frame
            label = label.last_hidden_state.mean(dim=1)     # label = [1, 768]
            label = label.squeeze()                         # label = [768]

        return data, label

    def read_frames(self, video_path, start_frame, num_frames):
        frames = []
        try:
            reader = imageio.get_reader(video_path)

            # Skipping to the start frame
            for _ in range(start_frame):
                _ = reader.get_next_data()

            # Reading the required number of frames
            for _ in range(num_frames):
                frame = reader.get_next_data()
                frames.append(frame)

        except Exception as e:
            print(f"Error reading frames from {video_path}: {e}")

        return frames

    def get_total_frames(self, video_path):
        num_frames = 0
        try:
            reader = imageio.get_reader(video_path)
            for _ in reader:
                num_frames += 1
        except Exception as e:
            print(f"Error counting frames in video file {video_path}: {e}")

        return num_frames

    def process_frame(self, frame):
        '''
        Process a frame to have the shape of [channels, width, height]
        Args:
            frame: the frame to be processed
        '''
        # Convert the frame to a PyTorch tensor
        frame_tensor = torch.from_numpy(frame)

        # The frame is originally in (H, W, C) format => convert to (C, W, H)
        frame_tensor = frame_tensor.permute(2, 0, 1)

        return frame_tensor

def get_dataloader(dataset, batch_size, train_split=0.7, val_split=0.15, test_split=0.15, shuffle_train=True):
    """
    Create DataLoaders for training, validation, and testing.
    Args:
        dataset: the processed dataset. Each data instance is a tuple of (data, label).
        batch_size: Batch size for the DataLoaders.
        train_split: Proportion of data to use for training.
        val_split: Proportion of data to use for validation.
        test_split: Proportion of data to use for testing.
        shuffle_train: Whether to shuffle the training dataset.
    return:
        A tuple of DataLoaders (train_loader, val_loader, test_loader).
    """
    # Calculate the sizes of each split
    dataset_size = len(dataset)
    train_size = int(dataset_size * train_split)
    val_size = int(dataset_size * val_split)
    test_size = dataset_size - (train_size + val_size)

    # Split the dataset
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

    # Create DataLoaders for each split
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle_train)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

In [6]:
def get_device_available():
    ''' Detect available training device'''
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    return device

def set_seed(seed):
    ''' Set random seed '''
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    device = get_device_available()
    if device == torch.device("mps"):
        torch.mps.manual_seed(seed)
    elif device == torch.device("cuda"):
        torch.cuda.manual_seed(seed)

In [7]:
class Loss_tuple(object):
    def __init__(self):
        self.train = []
        self.val = []

def init_loss_dict(loss_name_list, history_loss_dict = None):
    loss_dict = {}
    for name in loss_name_list:
        loss_dict[name] = Loss_tuple()
    loss_dict['epochs'] = 0

    if history_loss_dict is not None:
        for k, v in history_loss_dict.items():
            loss_dict[k] = v

        for k, v in loss_dict.items():
            if k not in history_loss_dict:
                lt = Loss_tuple()
                lt.train = [0] * history_loss_dict['epochs']
                lt.val = [0] * history_loss_dict['epochs']
                loss_dict[k] = lt

    return loss_dict

def load_ckpt(ckpt_path, model, optimizer, loss_dict):
    ckpt = torch.load(ckpt_path)

    # Retrieve the training parameters
    epoch = ckpt['epoch']
    loss_dict = ckpt['loss_dict']
    model_state_dict = ckpt['model_state_dict']
    optimizer_state_dict = ckpt['optimizer_state_dict']

    return epoch, loss_dict, model_state_dict, optimizer_state_dict

def save_ckpt(model, optimizer, epoch, loss_dict, save_dir):
    if not Path(save_dir).exists():
      Path(save_dir).mkdir(parents=True, exist_ok=True)
    ckpt_file = Path(save_dir).joinpath(f"epoch_{epoch}.tar")

    model_state = model.state_dict()
    optimizer_state = optimizer.state_dict()

    torch.save({
        'epoch': epoch,
        'loss_dict': loss_dict,
        'model_state_dict': model_state,
        'optimizer_state_dict': optimizer_state
    }, ckpt_file.absolute().as_posix())

In [8]:
def single_iter(model, optimizer, sample, device, task_loss, lambda_scl_loss=1, train_flag=True):
  '''
  Inputs:
  model: SCP model
  optimizer: Optimizer used
  sample: A single batch from loader
  device: Operating device (cuda/mps/cpu) -> Can be optimized by using get_available_device
  task_loss: Loss function for specific task
  lambda_scl_loss: Weight of SCL loss
  train_flag: True if training, False if evaluating
  '''
  #Get frames and label to the device
  past_frames, label = sample
  past_frames.to(device)
  label.to(device)

  #Train phase
  if train_flag:
    #Reset gradients of model
    model.zero_grad(set_to_none=True)
    #Forward pass
    pred_frame, scl_loss = model(past_frames)

    if optimizer is not None:
      #All parameters need to update have been set to require_grad = True earlier => Skip this modify step
      task_loss = task_loss(pred_frame, label)
      #Apply loss formula (total loss = task_loss + scl_loss * lambda)
      total_loss = task_loss + torch.mul(scl_loss, lambda_scl_loss)
      total_loss.backward()
      optimizer.step()

  #Evaluate phase
  else:
    #Reset gradients of model
    model.zero_grad(set_to_none=True)
    #Forward pass
    pred_frame, scl_loss = model(past_frames)

    if optimizer is not None:
      #All parameters need to update have been set to require_grad = True earlier => Skip this modify step
      task_loss = task_loss(pred_frame, label)
      #Apply loss formula (total loss = task_loss + scl_loss * lambda)
      total_loss = task_loss + torch.mul(scl_loss, lambda_scl_loss)

  iter_loss_dict = {'Total': total_loss, 'MSE': task_loss, 'SCL': scl_loss}
  return iter_loss_dict

def write_summary(summary_writer, loss_dict, train_flag=True):
  curr_loss = loss_dict.copy()
  if (train_flag):
    for k, v in curr_loss.items():
      #Exclude k = epochs when writing to tensorboard
      if (k != 'epochs'):
        for i in range(len(v.train)):
          summary_writer.add_scalars(k, {'train': v.train[i]}, i+1)
  else:
    for k, v in curr_loss.items():
      #Exclude k = epochs when writing to tensorboard
      if (k != 'epochs'):
        for i in range(len(v.val)):
          summary_writer.add_scalars(k, {'val': v.val[i]}, i+1)

In [9]:

        set_seed(2023)
        ckpt_save_dir = Path.cwd().joinpath('checkpoint')
        tensorboard_save_dir = Path.cwd().joinpath('tensorboard')
        resume_ckpt = None

        if not Path(ckpt_save_dir).exists():
                Path(ckpt_save_dir).mkdir(parents=True, exist_ok=True)
        logging.basicConfig(level=logging.INFO,
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        format='[%(levelname)s] %(message)s - (%(filename)s)',      # e.g., [INFO] Log message 1 - (main.py)
                        filename=ckpt_save_dir.joinpath('train_log.log').absolute().as_posix(),
                        filemode='a')

        start_epoch = 0
        summary_writer = SummaryWriter(tensorboard_save_dir.absolute().as_posix())
        num_past_frames = 10
        epochs = 10
        lr = 1e-4
        dropout = 0.1
        device = get_device_available()         # device = cuda/mps if available. Otherwise, device = cpu

        ##################### Init Dataset ###########################
        root_dir = '/kaggle/input'
        seq_len = 5               # Number of previous frames
        batch_size = 32
        train_split = 0.001
        val_split = 0.0005
        test_split = 0.0005
        val_per_epoch = 4
        # Initialise the dataset
        full_dataset = KTHDataset(root_dir, seq_len)
        # Get the dataloader
        train_loader, val_loader, test_loader = get_dataloader(full_dataset, batch_size, train_split, val_split, test_split)

        ##################### Init Loss Function ###########################
        loss_name_list = ['MSE', 'SCL', 'Total']
        loss_dict = init_loss_dict(loss_name_list)
        mse = MSE()

        ##################### Resume training from checkpoint ###########################
        if resume_ckpt is not None:
                ckpt = torch.load(resume_ckpt)
                # Restore the model and optimizer state
                model.load_state_dict(ckpt['model_state_dict'])
                optimizer.load_state_dict(ckpt['optimizer_state_dict'])
                # Load other training parameters
                start_epoch = ckpt["epoch"]
                loss_dict = ckpt["loss_dict"]


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [None]:
##################### Init Model ###########################
d_model = 768               # feature dimension of an input embedding
N = 6                       # Number of encoder blocks in the model
h = 8                       # Number of heads
model = build_model(d_model, seq_len, N, h, device=device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
model_num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total number of model's parameters: {model_num_params}")

##################### Train ###########################
for epoch in range(start_epoch + 1, epochs +  1):
  loss_dict['epochs'] = epoch
  #Init temporary loss dict (Individual loss of each iteration)
  iters_loss_dict = init_loss_dict(loss_name_list)

  #Get current date/time
  epoch_time = datetime.now()
  epoch_loss_dict = init_loss_dict(loss_name_list)
  #Train phase
  iter_count = len(train_loader.dataset)
  for idx, sample in enumerate(train_loader, 0):
    iter_loss_dict = single_iter(model, optimizer, sample, device, mse)
    #Add loss to epoch loss
    for k, v in iter_loss_dict.items():
      iters_loss_dict[k].train.append(iter_loss_dict[k])
  #Take average epoch loss
  for k, v in iters_loss_dict.items():
    if (k != 'epochs'):
      loss_concat = torch.stack(v.train)
      loss_dict[k].train.append(torch.mean(loss_concat))
  write_summary(summary_writer, loss_dict)

  if (epoch % val_per_epoch == 0):
    #Evaluation phase
    for idx, sample in enumerate(val_loader, 0):
      iter_loss_dict = single_iter(model, optimizer, sample, device, mse, train_flag = False)
    #Add loss to epoch loss
      for k, v in iter_loss_dict.items():
        iters_loss_dict[k].val.append(iter_loss_dict[k])
    #Take average epoch loss
    for k, v in iters_loss_dict.items():
      if (k != 'epochs'):
        loss_concat = torch.stack(v.val)
        loss_dict[k].val.append(torch.mean(loss_concat))
    write_summary(summary_writer, loss_dict, train_flag=False)

  #Save checkpoint
    
  epoch_time_used = datetime.now() - epoch_time

  logging.info(f"epoch {epoch}, {epoch_loss_dict['Total']}")
  logging.info(f"Estimated remaining training time: {epoch_time_used.total_seconds()/3600. * (start_epoch + epochs - epoch)} Hours")

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total number of model's parameters: 219247872
{'Total': tensor([1.1392], device='cuda:0', grad_fn=<AddBackward0>), 'MSE': tensor(1.0057, device='cuda:0', grad_fn=<MeanBackward0>), 'SCL': tensor([0.1335], device='cuda:0', grad_fn=<AddBackward0>)}
{'Total': tensor([0.7853], device='cuda:0', grad_fn=<AddBackward0>), 'MSE': tensor(0.6550, device='cuda:0', grad_fn=<MeanBackward0>), 'SCL': tensor([0.1303], device='cuda:0', grad_fn=<AddBackward0>)}
{'Total': tensor([0.6462], device='cuda:0', grad_fn=<AddBackward0>), 'MSE': tensor(0.5093, device='cuda:0', grad_fn=<MeanBackward0>), 'SCL': tensor([0.1369], device='cuda:0', grad_fn=<AddBackward0>)}
{'Total': tensor([0.5840], device='cuda:0', grad_fn=<AddBackward0>), 'MSE': tensor(0.4425, device='cuda:0', grad_fn=<MeanBackward0>), 'SCL': tensor([0.1415], device='cuda:0', grad_fn=<AddBackward0>)}
{'Total': tensor([0.5844], device='cuda:0', grad_fn=<AddBackward0>), 'MSE': tensor(0.4493, device='cuda:0', grad_fn=<MeanBackward0>), 'SCL': tensor([0.135

In [None]:
!tensorboard --logdir=tensorboard