## VIDEO CAPTIONING WITH MSR-VTT DATASET
* ### The full dataset is available here: https://www.mediafire.com/folder/h14iarbs62e7p/shared

* ### A trained version of the model can be found here: <https://drive.google.com/file/d/1-GfiBd_CfW0IcUNOMpGDwUIReXA_z0sn/view?usp=sharing>. It was trained using 5 frames per video and a batch size of 128.

* ### This notebook adapts code originally written for image captioning in Assignment 3. Thanks to Ziyan Yang (zy3cx@virginia.edu)!

## Creating a Dataloader

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; force_remount re-mounts even if a
# previous mount is still present.
drive.mount('/content/drive',  force_remount=True)

# Single root for the project's files on Drive. Edit only this line if the
# project folder is moved or renamed.
PROJECT_DIR = "/content/drive/My Drive/Colab Notebooks/Vision Project"

# Annotation JSON and video directory handed to DatasetManager below.
json_file = PROJECT_DIR + "/train_val_annotation/train_val_videodatainfo.json"
# The trailing slash is kept so the value is identical to the original literal.
video_file = PROJECT_DIR + "/TrainValVideo/"

Mounted at /content/drive


In [13]:
import json
import albumentations as alb
import torchtext

%run DatasetManager.ipynb
tokenizer = torchtext.data.Field(sequential = True,
                                     init_token = "<start>", eos_token = "<end>", 
                                     pad_token = "<pad>", unk_token = "<unk>",
                                     batch_first = True)

transform = alb.Compose([
    alb.Resize(300, 300, always_apply=True),
    alb.CenterCrop(224, 224, always_apply=True),
    alb.Normalize(mean = [0.43216, 0.394666, 0.37645],
                std = [0.22803, 0.22145, 0.216989], 
                always_apply=True)
])

frames = 5

# Create dataset classes for training and validation.
train_dataset = DatasetManager(tokenizer, 'train', json_file, video_file, 
                                    12000, video_transform =  transform, frames = frames)

train_data_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size = 128, 
                                           num_workers = 5,
                                           shuffle = True,
                                           collate_fn = train_dataset.create_batch)

val_dataset = DatasetManager(tokenizer, 'validation', json_file, video_file, 
                                    12000, video_transform =  transform, frames = frames)

val_data_loader = torch.utils.data.DataLoader(val_dataset, 
                                           batch_size = 128, 
                                           num_workers = 5,
                                           shuffle = False,
                                           collate_fn = val_dataset.create_batch)

100%|██████████| 20000/20000 [00:00<00:00, 68114.45it/s]
100%|██████████| 5000/5000 [00:00<00:00, 47653.23it/s]


In [14]:
# Pull a single validation batch to sanity-check what the loader yields.
(videos, texts, text_lengths) = next(iter(val_data_loader))
print(videos.shape, texts.shape, text_lengths)

# Map the first caption of the batch back to its tokens (padding included).
token_id = texts[0].tolist()

# The loop variable is deliberately not called `id`, which would shadow the
# Python builtin for the rest of the kernel session.
for vocab_index in token_id:
    print(tokenizer.vocab.itos[vocab_index], end= ' ')

torch.Size([128, 5, 3, 224, 224]) torch.Size([128, 27]) [7, 9, 9, 9, 13, 11, 12, 6, 10, 8, 8, 7, 8, 12, 13, 9, 9, 12, 11, 18, 8, 11, 18, 12, 11, 6, 7, 11, 8, 15, 6, 13, 11, 9, 14, 8, 10, 14, 24, 15, 12, 18, 27, 12, 15, 11, 13, 12, 12, 7, 11, 10, 10, 9, 8, 7, 12, 20, 9, 7, 19, 8, 16, 8, 8, 10, 6, 13, 12, 9, 9, 9, 13, 13, 11, 6, 14, 6, 10, 10, 8, 12, 11, 16, 14, 9, 14, 5, 10, 9, 7, 8, 17, 17, 6, 9, 12, 16, 14, 24, 7, 8, 18, 9, 16, 9, 11, 27, 9, 6, 14, 17, 14, 12, 8, 18, 12, 12, 15, 15, 10, 7, 13, 10, 8, 8, 17, 21]
<start> scene from a basketball game <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

## Checking the Video Encoder

In [15]:
%run VideoEncoder.ipynb

encoding_size = 300

video_encoder = VideoEncoder(encoding_size = encoding_size)
video_encoder.eval()
encoded_videos = video_encoder(videos)

print(encoded_videos.shape)

decoder_state_size = encoded_videos.shape[1]

torch.Size([128, 1500])


## Checking the Text Decoder

In [16]:
%run TextDecoder.ipynb

embedding_size = 300

text_decoder = TextDecoder(input_size = embedding_size, state_size = decoder_state_size, vocab_size = len(tokenizer.vocab))
sample_input_token = texts[:, 1]
batch_size = texts.shape[0]
initial_state = text_decoder.dummy_input_state(batch_size)

prediction, output_state = text_decoder(initial_state, sample_input_token)

print("next token prediction: ", prediction.shape)

next token prediction:  torch.Size([128, 12004])


## Checking the Untrained Video Captioner

In [17]:
!pip install pytorch-lightning --quiet

%run VideoCaptioner.ipynb

def generate_caption(model, video, max_length = 128, vocab = None):
    """Greedily decode a caption for one video.

    Args:
        model: captioner exposing `video_encoder` and `text_decoder`; it is
            switched to eval mode as a side effect.
        video: a single, un-batched video tensor. A leading batch axis of
            size 1 is added before it is encoded.
        max_length: maximum number of tokens generated after "<start>".
        vocab: object with `stoi` (token -> index) and `itos` (index -> token)
            lookups. Defaults to the notebook-level `tokenizer.vocab`.

    Returns:
        str: the generated tokens joined by single spaces, beginning with
        "<start>" and ending with "<end>" when the decoder emits it within
        `max_length` steps.
    """
    if vocab is None:
        vocab = tokenizer.vocab

    model.eval()

    caption_tokens = ["<start>"]
    token_id = vocab.stoi["<start>"]

    # Inference only: no autograd graph is needed while decoding.
    with torch.no_grad():
        encoded_video = model.video_encoder(video.unsqueeze(0))

        # The encoded video seeds both elements of the initial decoder state.
        state = (encoded_video, encoded_video)

        for _ in range(max_length):
            # Create the input token on the encoder output's device so decoding
            # also works when the model and video are not on the CPU.
            input_token = torch.tensor([token_id], dtype = torch.long,
                                       device = encoded_video.device)

            token_scores, state = model.text_decoder(state, input_token)

            # Greedy choice: index of the highest-scoring token, as a plain int.
            _scores, best_index = token_scores.squeeze(0).max(0)
            token_id = int(best_index)

            token_text = vocab.itos[token_id]
            caption_tokens.append(token_text)

            if token_text == '<end>':
                break

    return " ".join(caption_tokens)

# Build a captioner that has not been trained yet and caption the first
# training video with it; the output is expected to be meaningless.
video_captioner = VideoCaptioner(
    tokenizer,
    embedding_size=embedding_size,
    state_size=decoder_state_size,
    encoding_size=encoding_size,
)

# The second element of the dataset item is not needed here.
video, _ = train_dataset[0]
caption = generate_caption(video_captioner, video)

print(caption)

<start> blossom blossom person person sauce murrey spectacular tune tune champions kitty expanse gonna plain returns statement gonna gonna plain expect mothers mothers dodges tune nacho gonna gonna plain returns statement backed gonna gonna yellow gonna plain expect mothers dodges tune nacho gonna gonna plain expect returns gonna statement gonna plain expect mothers dodges tune nacho gonna gonna plain expect returns gonna statement gonna plain expect mothers dodges tune nacho gonna gonna plain expect returns gonna statement gonna plain expect mothers dodges tune nacho gonna gonna plain expect returns gonna statement gonna plain expect mothers dodges tune nacho gonna gonna plain expect returns gonna statement gonna plain expect mothers dodges tune nacho gonna gonna plain expect returns gonna statement gonna plain expect mothers dodges tune nacho gonna gonna plain


## Training the Video Captioner

In [12]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger


%run VideoCaptioner.ipynb


logger = TensorBoardLogger(save_dir="/content/drive/My Drive/Colab Notebooks/Vision Project/Log/", name="my_model")

checkpoint_callback = ModelCheckpoint(
    filepath = '/content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_{epoch:03d}-{val_loss:.2f}',
    verbose = True, monitor = 'val_loss', mode = 'min', save_top_k = 6)

# # Create image captioner.
video_captioner = VideoCaptioner(tokenizer, embedding_size = embedding_size, 
                                 state_size = decoder_state_size, encoding_size = encoding_size)

# # Simple thanks to Pytorch Lighting.
trainer = pl.Trainer(gradient_clip_val = 1.0, gpus = 1, 
                     min_epochs = 2, max_epochs = 5,
                     checkpoint_callback = checkpoint_callback,
                     weights_summary = None, logger=logger) 

# # Train the model.
trainer.fit(video_captioner, 
            train_dataloader = train_data_loader, 
            val_dataloaders = [val_data_loader]) 

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: False, using: 0 TPU cores
INFO:lightning:TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Validation loss 9.38




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0: val_loss reached 9.30402 (best 9.30402), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=000-val_loss=9.30.ckpt as top 6
INFO:lightning:Epoch 0: val_loss reached 9.30402 (best 9.30402), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=000-val_loss=9.30.ckpt as top 6


Validation loss 9.30
Training loss 9.35


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1: val_loss reached 9.19623 (best 9.19623), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=001-val_loss=9.20.ckpt as top 6
INFO:lightning:Epoch 1: val_loss reached 9.19623 (best 9.19623), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=001-val_loss=9.20.ckpt as top 6


Validation loss 9.20
Training loss 9.22


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2: val_loss reached 8.91473 (best 8.91473), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=002-val_loss=8.91.ckpt as top 6
INFO:lightning:Epoch 2: val_loss reached 8.91473 (best 8.91473), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=002-val_loss=8.91.ckpt as top 6


Validation loss 8.91
Training loss 9.02


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3: val_loss reached 7.98364 (best 7.98364), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=003-val_loss=7.98.ckpt as top 6
INFO:lightning:Epoch 3: val_loss reached 7.98364 (best 7.98364), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=003-val_loss=7.98.ckpt as top 6


Validation loss 7.98
Training loss 8.43


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4: val_loss reached 6.95662 (best 6.95662), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=004-val_loss=6.96.ckpt as top 6
INFO:lightning:Epoch 4: val_loss reached 6.95662 (best 6.95662), saving model to /content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=004-val_loss=6.96.ckpt as top 6


Validation loss 6.96
Training loss 7.15



1

## Checking the trained Video Captioner

In [None]:
trained_video_captioner = VideoCaptioner(tokenizer, embedding_size = embedding_size, state_size = decoder_state_size, encoding_size = encoding_size)

# NOTE(review): the training log in this notebook reports epoch 4 at
# val_loss=6.96, so this file presumably comes from a different run --
# confirm it exists before running this cell.
checkpoint_path = '/content/drive/My Drive/Colab Notebooks/Vision Project/checkpoints/video_captioner_weights_epoch=004-val_loss=6.75.ckpt'

# map_location='cpu' lets a checkpoint written during GPU training load on a
# runtime without a GPU; the freshly built model above lives on the CPU anyway.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location = 'cpu')
trained_video_captioner.load_state_dict(checkpoint['state_dict'])

# Caption validation videos 50..99 and print each next to its video id.
for vidId in range(50,100):
    video, _ = val_dataset[vidId]
    caption = generate_caption(trained_video_captioner, video)
    print(val_dataset.videos[vidId], caption)

video5757 <start> a man is singing a song <end>
video2170 <start> a man is walking down a street <end>
video1449 <start> a man is talking to a crowd <end>
video4348 <start> a man is talking to a man <end>
video5799 <start> a man cuts the wood and the ingredients for the <unk> <end>
video2487 <start> a guy is riding a bike on a railroad road <end>
video5437 <start> a man is sketching a cartoon <end>
video3931 <start> a man is talking to a man <end>
video2920 <start> a woman in a violet is catwalking in a room <end>
video1447 <start> a savanna elephant jumps and a zebra <end>
video4100 <start> a woman is singing a song <end>
video5164 <start> a group of boys are singing and playing the guitar <end>
video6075 <start> the girl wore the classroom <end>
video1861 <start> a woman is talking about a woman in a stage <end>
video5061 <start> a girl is waiting for a window <end>
video1558 <start> a woman is talking about a boy <end>
video376 <start> an elephant cartoon dog talking to other cartoo