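# Before running the other notebooks, make sure the following items are prepared.
- A directory named `videos` containing the videos `1.mp4` through `10.mp4`.
- A `dataset.csv` file containing the video paths and captions (with a `caption` column); it is read by the cells below.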

In [1]:
# `np` is used directly in later cells, so import it explicitly instead of relying
# on it leaking in through the star import below. Original import order is kept so
# that any names shadowed by the star import resolve exactly as before.
import numpy as np
import pandas as pd
from video_utils import *
import mediapy as media
from text_utils import preprocess_text, load_df2, load_sents2, pad2, tokenize2


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\masoud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Testing preprocess

In [2]:
# Load the caption table and run every caption through the shared text preprocessor.
df = pd.read_csv('dataset.csv')

sentence_arr = list(map(preprocess_text, df.caption))
sentence_arr

['batsman fails hit ball',
 'batsman fails run due lack ball control',
 'batsman hits ball',
 'guy bowling ball',
 'ball hits wicket',
 'batsman hits',
 'batsman hits ball catcher fails recieve',
 'catch ball gets far',
 'wicket hit ball',
 'wicket keeper keep watching whole time']

In [3]:
# Open question: should stopwords be removed or not? -> https://stackoverflow.com/a/37330543/6118987

text_tokenized, text_tokenizer = tokenize2(sentence_arr)
print(text_tokenizer.word_index)

# Print each cleaned sentence next to its integer-encoded form (1-based numbering).
for seq_no, (clean_sentence, encoded) in enumerate(zip(sentence_arr, text_tokenized), start=1):
    print('Sequence {} in x'.format(seq_no))
    print('  Input:  {}'.format(clean_sentence))
    print('  Output: {}'.format(encoded))

{'ball': 1, 'batsman': 2, 'hits': 3, 'fails': 4, 'wicket': 5, 'hit': 6, 'run': 7, 'due': 8, 'lack': 9, 'control': 10, 'guy': 11, 'bowling': 12, 'catcher': 13, 'recieve': 14, 'catch': 15, 'gets': 16, 'far': 17, 'keeper': 18, 'keep': 19, 'watching': 20, 'whole': 21, 'time': 22}
Sequence 1 in x
  Input:  batsman fails hit ball
  Output: [2, 4, 6, 1]
Sequence 2 in x
  Input:  batsman fails run due lack ball control
  Output: [2, 4, 7, 8, 9, 1, 10]
Sequence 3 in x
  Input:  batsman hits ball
  Output: [2, 3, 1]
Sequence 4 in x
  Input:  guy bowling ball
  Output: [11, 12, 1]
Sequence 5 in x
  Input:  ball hits wicket
  Output: [1, 3, 5]
Sequence 6 in x
  Input:  batsman hits
  Output: [2, 3]
Sequence 7 in x
  Input:  batsman hits ball catcher fails recieve
  Output: [2, 3, 1, 13, 4, 14]
Sequence 8 in x
  Input:  catch ball gets far
  Output: [15, 1, 16, 17]
Sequence 9 in x
  Input:  wicket hit ball
  Output: [5, 6, 1]
Sequence 10 in x
  Input:  wicket keeper keep watching whole time
  Output: [5, 18, 19, 20, 21, 22]

In [4]:
# Testing padding: run every tokenized caption through pad2 and print the
# unpadded sequence next to its padded counterpart.
# `np` comes from the explicit `import numpy as np` in the imports cell.
test_pad = pad2(text_tokenized)

# zip() would silently drop rows if the two collections differed in length,
# so make that failure loud instead of printing a truncated comparison.
assert len(test_pad) == len(text_tokenized), "pad2 must return one row per input sequence"

for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad), start=1):
    print('Sequence {} in x'.format(sample_i))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [2 4 6 1]
  Output: [2 4 6 1 0 0 0]
Sequence 2 in x
  Input:  [ 2  4  7  8  9  1 10]
  Output: [ 2  4  7  8  9  1 10]
Sequence 3 in x
  Input:  [2 3 1]
  Output: [2 3 1 0 0 0 0]
Sequence 4 in x
  Input:  [11 12  1]
  Output: [11 12  1  0  0  0  0]
Sequence 5 in x
  Input:  [1 3 5]
  Output: [1 3 5 0 0 0 0]
Sequence 6 in x
  Input:  [2 3]
  Output: [2 3 0 0 0 0 0]
Sequence 7 in x
  Input:  [ 2  3  1 13  4 14]
  Output: [ 2  3  1 13  4 14  0]
Sequence 8 in x
  Input:  [15  1 16 17]
  Output: [15  1 16 17  0  0  0]
Sequence 9 in x
  Input:  [5 6 1]
  Output: [5 6 1 0 0 0 0]
Sequence 10 in x
  Input:  [ 5 18 19 20 21 22]
  Output: [ 5 18 19 20 21 22  0]


## Creating DataLoader

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class VideoCaptionDataset(Dataset):
    """Map-style dataset that pairs each video with its vectorized caption.

    Video paths and caption vectors are loaded once in ``__init__`` via
    ``load_df2``; the video itself is loaded on demand by ``load_video``
    every time an item is requested.
    """

    def __init__(self, csv_file: str, augment: bool = False, n_samples: int = 30):
        """
        Args:
            csv_file (str): Path to the csv file containing the video
                paths and captions.
            augment (bool): Forwarded to ``load_video`` as ``use_aug``
                (presumably toggles augmentation of the loaded frames).
                Defaults to False.
            n_samples (int): Forwarded to ``load_video`` as ``n_frames``,
                i.e. the number of frames requested per video.
                Defaults to 30.
        """
        # Both collections are indexed with the same position in __getitem__.
        self.video_paths, self.vectorized_sents = load_df2(csv_file)
        self.n_samples = n_samples
        self.augment = augment

    def __len__(self) -> int:
        """Return the number of entries in ``video_paths``."""
        return len(self.video_paths)

    def __getitem__(self, idx) -> dict:
        """Load the sample stored at position ``idx``.

        Args:
            idx: Position of the sample; a tensor index is converted to a
                plain Python value first.

        Returns:
            dict: ``{'video': <result of load_video>, 'caption': <caption vector>}``.
        """
        # An index may arrive as a tensor; convert it before indexing.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        video_path = self.video_paths[idx]
        vector = self.vectorized_sents[idx]
        video = load_video(video_path, n_frames=self.n_samples, use_aug=self.augment)

        return {
            'video': video,
            'caption': vector
        }

# Smoke-test the loader: build the dataset with augment=True, iterate it in
# shuffled mini-batches of four, print the batch sizes and render each batch's videos.
dataset = VideoCaptionDataset(csv_file='dataset.csv', augment=True)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

for batch_no, batch in enumerate(dataloader):
    clips = batch['video']
    captions = batch['caption']
    print(batch_no, clips.size(), captions.size())
    media.show_videos(np.array(clips))

0 torch.Size([4, 30, 240, 320, 3]) torch.Size([4, 7])


0,1,2,3
This browser does not support the video tag.,This browser does not support the video tag.,This browser does not support the video tag.,This browser does not support the video tag.


1 torch.Size([4, 30, 240, 320, 3]) torch.Size([4, 7])


0,1,2,3
This browser does not support the video tag.,This browser does not support the video tag.,This browser does not support the video tag.,This browser does not support the video tag.


2 torch.Size([2, 30, 240, 320, 3]) torch.Size([2, 7])


0,1
This browser does not support the video tag.,This browser does not support the video tag.


## Q: Why did you design the video dataloader this way?

The dataloader returns two parts for every sample:
- Part 1: It samples N frames from each video and returns a video tensor of shape Batch\*N\*H\*W\*C, which can be used to train 3D CNNs, ConvLSTMs, or attention-based networks that extract spatiotemporal features.
- Part 2: It reads the captions and, after preprocessing the text, converts each sentence into a vector that can be fed into transformer-like networks.



## Q. What are the weaknesses of your video loader?

- 1: One of the bottlenecks of the dataloader is that it uses `opencv` to iterate through each video and sample the frames, which makes loading very slow.
- 2: The data is not cached. It is good practice to cache the data in NumPy format and read it with torch functions to speed up the process.
- 3: With a large amount of text data, it is not good to train the vectorizer every time the `dataloader` is initialized. It is also not good to keep all the vectors in RAM, which could be costly during training.