In [1]:
from video_utils import *
from text_utils import *
import mediapy as media

# Testing video loader: load a single clip with augmentation switched on
# (use_aug=True) and preview it inline via mediapy.
# NOTE(review): load_video is not imported by name; presumably it comes from
# the `from video_utils import *` star import above — confirm.
video = load_video('videos/1.mov', use_aug=True)
media.show_video(video)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\masoud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0
This browser does not support the video tag.


## Let's build a Torch DataLoader that generates both video samples and text samples

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

class VideoCaptionDataset(Dataset):
    """Map-style dataset pairing each video with the vector of its caption.

    Every item is a dict with two entries:
        'video':   whatever ``load_video`` returns for the item's path, using
                   ``n_samples`` frames and the configured augmentation flag.
        'caption': the item's row of the caption matrix, made dense with
                   ``.toarray()`` (so it keeps a leading row dimension).
    """

    def __init__(self, csv_file: str, augment: bool = False, n_samples: int = 30):
        """
        Args:
            csv_file (str): Path to the csv file containing the video
                paths and captions.
            augment (bool): Whether to augment the videos when they are
                loaded; forwarded to ``load_video`` as ``use_aug``.
                Defaults to False.
            n_samples (int): How many frames to sample from each video.
                Defaults to 30.
        """
        # Read the video paths and the corresponding caption vectors.
        # NOTE(review): the second value is presumably a sparse TF-IDF matrix
        # with one row per video (it is indexed by row and densified with
        # .toarray() in __getitem__) — confirm against load_df.
        self.video_paths, self.tf_idf_matrix = load_df(csv_file)
        self.n_samples = n_samples  # how many frames to sample
        self.augment = augment  # augment the videos or not

    def __len__(self) -> int:
        """Return the number of videos listed in the csv file."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Load the video and caption vector stored at position ``idx``.

        Args:
            idx: Index of the item; a tensor index is converted to plain
                Python values first.

        Returns:
            dict: ``{'video': ..., 'caption': ...}`` as described on the class.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        video_path = self.video_paths[idx]
        vector = self.tf_idf_matrix[idx]
        # Sample N frames randomly from each video
        # (batched input shape: Batch*N*H*W*C).
        video = load_video(video_path, n_frames=self.n_samples, use_aug=self.augment)

        sample = {
            'video': video,
            'caption': vector.toarray()
        }

        return sample

# Testing DataLoader
#
# Build the dataset with augmentation enabled and walk through it once in
# shuffled mini-batches of 4, printing each batch's tensor sizes and
# previewing its clips inline.
dataset = VideoCaptionDataset(csv_file='dataset.csv', augment=True)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch, sample_batched['video'].size(), sample_batched['caption'].size())
    # Use Tensor.numpy() instead of np.array(): `np` is never imported by name
    # in this notebook (presumably it only leaked in through a star import),
    # so this removes the hidden dependency.
    media.show_videos(sample_batched['video'].numpy())


0 torch.Size([4, 30, 240, 320, 3]) torch.Size([4, 1, 22])


0,1,2,3
This browser does not support the video tag.,This browser does not support the video tag.,This browser does not support the video tag.,This browser does not support the video tag.


1 torch.Size([4, 30, 240, 320, 3]) torch.Size([4, 1, 22])


0,1,2,3
This browser does not support the video tag.,This browser does not support the video tag.,This browser does not support the video tag.,This browser does not support the video tag.


2 torch.Size([2, 30, 240, 320, 3]) torch.Size([2, 1, 22])


0,1
This browser does not support the video tag.,This browser does not support the video tag.


## Q: Why did you design the video dataloader in this way?

The dataloader loads two parts. 
- Part 1: Samples N frames from each video and returns a video tensor of shape Batch\*N\*H\*W\*C, which is used to train a 3D CNN, a ConvLSTM, or an attention-based network to extract spatiotemporal features.
- Part 2: Reads the captions and, after preprocessing the text, converts the sentences into vectors with the `tf-idf Vectorizer`; these vectors are then fed to an LSTM-style network.



## Q. What are the weaknesses of your video loader?

- 1: One of the bottlenecks of the dataloader is using `opencv` to iterate through the video and sample the frames. This makes the process very slow.
- 2: It is good practice to cache the data in numpy format and read it with torch functions to speed up the process.
- 3: If we have a large amount of text data, it is not good to fit the Vectorizer every time we initialize the `dataloader`. It is also not good to keep all the vectors in RAM, which could be costly for the training process.