In [1]:
import torch
from torch import nn
import numpy as np
from collections import OrderedDict
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as transforms_video
from transformers import AutoTokenizer
from moviepy.editor import *
from tqdm import tqdm
import argparse

from src.data.video_transforms import Permute
from src.models.video_recap import VideoRecap
from src.data.datasets import VideoCaptionDataset, CaptionDataCollator
from src.models.timesformer import SpaceTimeTransformer
from src.models.openai_model import QuickGELU
from src.configs.defaults import defaultConfigs



# Visualize the Video

In [2]:
# Embed the example video in the cell output so it can be previewed inline.
from IPython.display import Video
Video("assets/example.mp4")

# Clip Captions

In [3]:
# Create the clip-caption model, its tokenizer and the frame preprocessing pipeline.
ckpt_path = 'pretrained_models/videorecap/videorecap_clip.pt'
# NOTE(review): torch.load unpickles arbitrary Python objects (this checkpoint
# carries an `args` object), so only load checkpoints from a trusted source.
ckpt = torch.load(ckpt_path, map_location='cpu')
old_args = ckpt['args']
old_args.video_feature_type = 'pixel'
old_args.num_video_feat = 4                     # number of frames per clip caption
crop_size = 224
transform = transforms.Compose([
        Permute([3, 0, 1, 2]),  # T H W C -> C T H W
        transforms.Resize(crop_size),
        transforms.CenterCrop(crop_size),
        transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]),
    ])

tokenizer = AutoTokenizer.from_pretrained(old_args.decoder_name)

# Strip the 'module.' fragment from parameter names so the keys match the bare
# model (presumably the checkpoint was saved from a DataParallel/DDP wrapper).
state_dict = OrderedDict()
for k, v in ckpt['state_dict'].items():
    state_dict[k.replace('module.', '')] = v

print("=> Creating model")
model = VideoRecap(old_args)
model = model.cuda()
model.load_state_dict(state_dict, strict=True)
# The notebook only runs inference: switch to eval mode so dropout (and any
# other train-only behaviour) is disabled and generation is deterministic.
model.eval()
print("=> loaded resume checkpoint '{}' (epoch {})".format(ckpt_path, ckpt['epoch']))

=> Creating model
######USING ATTENTION STYLE:  frozen-in-time
=> Loading CLIP (ViT-B/16) weights
Loading Video encoder from /data/mmiemon/LaVila/pretrained_models/clip_openai_timesformer_base.baseline.ep_0003.pth


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertLMHeadModel: ['distilbert.transformer.layer.4.ffn.lin1.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.0.output_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.3.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.weight', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.tra

Freeze the pretrained parts in Bert: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'ber

In [4]:
# Create dataset from the video
# Only the duration is needed here, so open the clip in a context manager and
# release moviepy's reader right away instead of leaking it.
with VideoFileClip('assets/example.mp4') as video:
    video_length = video.duration
print('Video length', video_length, 'seconds')

caption_duration = 4                              # Extract clip caption at each 4 seconds
old_args.video_loader_type='moviepy'
old_args.chunk_len = -1                           # load from raw video
old_args.video_feature_path = 'assets'            # path to the video folder

# One metadata row per clip: [video id, start second, end second].
# NOTE(review): when video_length is not a multiple of caption_duration the
# last clip can be extremely short (708.0-708.03 s for this video), and the
# loader reports "Error loading" for it - consider handling that clip explicitly.
metadata = []
for i in np.arange(0, video_length, caption_duration):
    metadata.append(['example', i, min(i + caption_duration, video_length)])    # video name is example.mp4 so assuming video id=example
print('number of captions', len(metadata))

old_args.metadata = metadata
dataset = VideoCaptionDataset(old_args, transform=transform)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False,
                                        num_workers=8, pin_memory=True, drop_last=False)
print(len(dataset), len(data_loader))

Video length 708.03 seconds
number of captions 178
178 23


In [5]:
# Caption decoding function
def decode_one(generated_ids, tokenizer):
    """Decode one generated token sequence into text.

    The first token is always skipped and decoding stops just before the first
    EOS token. When the tokenizer uses the same id for BOS and EOS, the EOS
    search starts at position 1 so the leading token is not mistaken for EOS.
    If no EOS is present, everything except the first and the last token is
    decoded.

    Args:
        generated_ids: 1-D sequence of token ids exposing ``.tolist()``.
        tokenizer: object providing ``bos_token_id``, ``eos_token_id`` and
            ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the selected ids.
    """
    token_list = generated_ids.tolist()
    eos_token = tokenizer.eos_token_id
    search_from = 1 if eos_token == tokenizer.bos_token_id else 0
    try:
        eos_id = token_list.index(eos_token, search_from)
    except ValueError:
        # No EOS in the sequence: fall back to the index of the last token.
        eos_id = len(token_list) - 1
    return tokenizer.decode(token_list[1:eos_id])

In [6]:
# Generate one caption per clip and store it keyed by the clip's start second.
# Each stored value is the metadata row with the caption appended:
# [video id, start second, end second, caption].
captions = {}
num_return = old_args.caption_num_return_sequences

# Only this decoding path is implemented; fail loudly instead of hitting an
# undefined (or stale) `generated_text_ids` further down.
if old_args.caption_sample != 'multinomial_sample':
    raise NotImplementedError(
        f"Unsupported caption_sample {old_args.caption_sample!r}; "
        "this cell only implements 'multinomial_sample'."
    )

with torch.no_grad():
    for samples in data_loader:
        indices = samples['index']
        if hasattr(model, "vision_model"):
            image = samples["video_features"].permute(0, 2, 1, 3, 4).contiguous().cuda()  # BCTHW -> BTCHW
            samples["video_features"] = model.vision_model.forward_features(image, use_checkpoint=old_args.use_checkpoint, cls_at_last=False)  # NLD

        queries = model.map_features(samples)

        generated_text_ids, _ppls = model.generate(
            queries,
            tokenizer,
            do_sample = False,
            max_text_length=old_args.max_gen_tokens,
            num_return_sequences=num_return,
        )

        # The jj indexing below treats the generated rows as `num_return`
        # consecutive rows per input sample, so iterate over samples rather
        # than over every generated row (which would overrun `indices`
        # whenever num_return > 1).
        for j in range(generated_text_ids.shape[0] // num_return):
            sample = dataset.samples[indices[j].item()]
            start_sec = sample[1]
            for k in range(num_return):
                jj = j * num_return + k
                generated_text_str = decode_one(generated_text_ids[jj], tokenizer).strip()
                # With num_return > 1 only the last sequence is kept per clip.
                captions[start_sec] = sample + [generated_text_str]
                print(sample[1], sample[2], generated_text_str)



0.0 4.0 #C C picks up the nylon of cabbage from the
4.0 8.0 #C C puts the cabbage in the bowl
8.0 12.0 #C C holds the cabbage with her hands
12.0 16.0 #C C puts the nylon on the cabinet
16.0 20.0 #C C picks the nylon from the cabinet
20.0 24.0 #C C moves her hand towards the counter
24.0 28.0 #C C holds the cabbage with both her hands
28.0 32.0 #C C closes the fridge.
32.0 36.0 #C C picks a nylon from the fridge
36.0 40.0 #C C picks the cabbage from the drawer
40.0 44.0 #C C puts the nylon in the drawer
44.0 48.0 #C C picks up a nylon on the cabinet with her left
48.0 52.0 #C C picks the bowl
52.0 56.0 #C C cleans the kitchen table with the napkin.
56.0 60.0 #C C picks up the bowl from the slab.
60.0 64.0 #C C wipes the countertop with the tissue paper
64.0 68.0 #C C drops the chopping board on the kitchen slab.
68.0 72.0 #C C wipes the countertop with the tissue paper
72.0 76.0 #C C picks the bowl from the countertop.
76.0 80.0 #C C puts the bowl in the utensil rack
80.0 84.0 #C C pic

# Extract Features

In [7]:
# Run every clip through the vision encoder and keep one feature vector per
# clip, keyed by the clip's start second.
all_features = {}
with torch.no_grad():
    for batch in tqdm(data_loader):
        frames = batch["video_features"].permute(0, 2, 1, 3, 4).contiguous().cuda()  # BCTHW -> BTCHW
        clip_feats = model.vision_model.forward_features(frames, cls_at_last=True)
        for feat, sample_idx in zip(clip_feats, batch['index']):
            start_sec = dataset.samples[sample_idx.item()][1]
            all_features[start_sec] = feat.detach().cpu().numpy()

# Stack the per-clip vectors in chronological order and save them to disk.
seconds = sorted(all_features)
features = np.stack([all_features[sec] for sec in seconds])
print(features.shape)
np.save('assets/example.npy', features)

 70%|████████████████████████████████████████████████████████████████▋                            | 16/23 [00:19<00:02,  2.75it/s]

Error loading example 708.0 708.03


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:27<00:00,  1.22s/it]

(178, 768)





# Segment Descriptions

In [8]:
# Create the segment-description model, its tokenizer and the batch collator.
ckpt_path = 'pretrained_models/videorecap/videorecap_segment.pt'
# NOTE(review): torch.load unpickles arbitrary Python objects (this checkpoint
# carries an `args` object), so only load checkpoints from a trusted source.
ckpt = torch.load(ckpt_path, map_location='cpu')
old_args = ckpt['args']
old_args.video_feature_type = 'cls'
old_args.video_feature_path = 'assets'

tokenizer = AutoTokenizer.from_pretrained(old_args.decoder_name)
collator = CaptionDataCollator(tokenizer, max_gen_tokens = old_args.max_gen_tokens,
                                add_bos = True, add_eos = True, pad_token_id = 0)

# Strip the 'module.' fragment from parameter names so the keys match the bare
# model (presumably the checkpoint was saved from a DataParallel/DDP wrapper).
state_dict = OrderedDict()
for k, v in ckpt['state_dict'].items():
    state_dict[k.replace('module.', '')] = v

print("=> Creating model")
model = VideoRecap(old_args)
model = model.cuda()
model.load_state_dict(state_dict, strict=True)
# The notebook only runs inference: switch to eval mode so dropout (and any
# other train-only behaviour) is disabled and generation is deterministic.
model.eval()
print("=> loaded resume checkpoint '{}' (epoch {})".format(ckpt_path, ckpt['epoch']))

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


=> Creating model


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertLMHeadModel: ['distilbert.transformer.layer.4.ffn.lin1.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.0.output_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.3.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.weight', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.transformer.layer.5.attention.q_lin.bias', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.1.sa_layer_norm.bias', 'distil

Freeze the pretrained parts in Bert: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'ber

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertLMHeadModel: ['distilbert.transformer.layer.4.ffn.lin1.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.0.output_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.3.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.weight', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.tra

Freeze the pretrained parts in Bert: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'ber

In [9]:
# Build one metadata entry per segment and attach the clip captions that fall
# entirely inside that segment.
# Only the duration is needed here, so open the clip in a context manager and
# release moviepy's reader right away instead of leaking it.
with VideoFileClip('assets/example.mp4') as video:
    video_length = video.duration
print('Video length', video_length, 'seconds')

old_args.video_feature_path = 'assets'
segment_step = 180        # Extract one segment description every 180 seconds
metadata = []
for i in np.arange(0, video_length, segment_step):
    seg_start = i
    seg_end = min(i + segment_step, video_length)
    metadata.append({
        'vid': 'example',
        'start_sec': seg_start,
        'end_sec': seg_end,
        # Each caption row is [video id, start second, end second, caption];
        # keep the rows whose time span lies fully within this segment.
        'captions_pred': [
            c for c in captions.values()
            if c[1] >= seg_start and c[2] <= seg_end
        ],
    })
print('Number of segments', len(metadata))

old_args.metadata = metadata
dataset = VideoCaptionDataset(old_args, transform=None)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=False,
                    collate_fn=collator, num_workers=4, pin_memory=True, drop_last=False)
print(len(dataset), len(data_loader))

Video length 708.03 seconds
Number of segments 4
4 1


In [10]:
# Generate one description per segment, keyed by the segment's start second.
segment_descriptions = {}
num_return = old_args.caption_num_return_sequences

# Only this decoding path is implemented; fail loudly instead of silently
# reusing a `generated_text_ids` left over from an earlier cell.
if old_args.caption_sample != 'multinomial_sample':
    raise NotImplementedError(
        f"Unsupported caption_sample {old_args.caption_sample!r}; "
        "this cell only implements 'multinomial_sample'."
    )

with torch.no_grad():
    for samples in data_loader:
        indices = samples['indices']
        if hasattr(model, "vision_model"):
            image = samples["video_features"].permute(0, 2, 1, 3, 4).contiguous().cuda()  # BCTHW -> BTCHW
            samples["video_features"] = model.vision_model.forward_features(image, use_checkpoint=old_args.use_checkpoint, cls_at_last=False)  # NLD

        queries = model.map_features(samples)

        generated_text_ids, _ppls = model.generate(
            queries,
            tokenizer,
            do_sample = False,
            max_text_length=old_args.max_gen_tokens,
            num_return_sequences=num_return,
        )

        # The jj indexing below treats the generated rows as `num_return`
        # consecutive rows per input sample, so iterate over samples rather
        # than over every generated row (which would overrun `indices`
        # whenever num_return > 1).
        for j in range(generated_text_ids.shape[0] // num_return):
            sample = dataset.samples[indices[j].item()]
            start_sec = sample['start_sec']
            for k in range(num_return):
                jj = j * num_return + k
                generated_text_str = decode_one(generated_text_ids[jj], tokenizer).strip()
                # With num_return > 1 only the last sequence is kept per segment.
                segment_descriptions[start_sec] = generated_text_str
                print(sample['start_sec'], sample['end_sec'], generated_text_str)
                

0.0 180.0 C was in a kitchen. C washed a plate in a wash basin. C arranged the plates in a plate rack
180.0 360.0 C was in a kitchen. C mixed a mixture of cheese and flour. C poured the mixture into a plate.
360.0 540.0 C was in a kitchen. C poured oil into a plate. C poured oil into a plate.
540.0 708.03 C was in a kitchen. C fried fish in a frying pan. C wiped her hands with a tissue paper.


# Video Summary

In [11]:
# Create the video-summary model, its tokenizer and the batch collator.
ckpt_path = 'pretrained_models/videorecap/videorecap_video.pt'
# NOTE(review): torch.load unpickles arbitrary Python objects (this checkpoint
# carries an `args` object), so only load checkpoints from a trusted source.
ckpt = torch.load(ckpt_path, map_location='cpu')
old_args = ckpt['args']
old_args.video_feature_type = 'cls'
old_args.video_feature_path = 'assets'

tokenizer = AutoTokenizer.from_pretrained(old_args.decoder_name)
collator = CaptionDataCollator(tokenizer, max_gen_tokens = old_args.max_gen_tokens,
                                add_bos = True, add_eos = True, pad_token_id = 0)

# Strip the 'module.' fragment from parameter names so the keys match the bare
# model (presumably the checkpoint was saved from a DataParallel/DDP wrapper).
state_dict = OrderedDict()
for k, v in ckpt['state_dict'].items():
    state_dict[k.replace('module.', '')] = v

print("=> Creating model")
model = VideoRecap(old_args)
model = model.cuda()
model.load_state_dict(state_dict, strict=True)
# The notebook only runs inference: switch to eval mode so dropout (and any
# other train-only behaviour) is disabled and generation is deterministic.
model.eval()
print("=> loaded resume checkpoint '{}' (epoch {})".format(ckpt_path, ckpt['epoch']))

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


=> Creating model


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertLMHeadModel: ['distilbert.transformer.layer.4.ffn.lin1.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.0.output_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.3.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.weight', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.transformer.layer.5.attention.q_lin.bias', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.1.sa_layer_norm.bias', 'distil

Freeze the pretrained parts in Bert: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'ber

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertLMHeadModel: ['distilbert.transformer.layer.4.ffn.lin1.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.0.output_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.3.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.weight', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.transformer.layer.5.attention.q_lin.bias', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.1.sa_layer_norm.bias', 'distil

Freeze the pretrained parts in Bert: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'ber

In [12]:
# Build a single metadata entry covering the whole video, carrying all
# previously generated segment descriptions as input for the summary model.
# Only the duration is needed here, so open the clip in a context manager and
# release moviepy's reader right away instead of leaking it.
with VideoFileClip('assets/example.mp4') as video:
    video_length = video.duration
print('Video length', video_length, 'seconds')

old_args.video_feature_path = 'assets'
metadata = [{
    'vid': 'example',
    'start_sec': 0,
    'end_sec': video_length,
    # Descriptions are taken in the order they were inserted into the dict.
    'segment_descriptions_pred': list(segment_descriptions.values()),
}]
print('Number of segments', len(metadata))

old_args.metadata = metadata
dataset = VideoCaptionDataset(old_args, transform=None)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False,
                    collate_fn=collator, num_workers=1, pin_memory=True, drop_last=False)
print(len(dataset), len(data_loader))

Video length 708.03 seconds
Number of segments 1
1 1


In [15]:
# Generate the video-level summary, keyed by the start second of the entry.
video_summary = {}
num_return = old_args.caption_num_return_sequences

# Only this decoding path is implemented; fail loudly instead of silently
# reusing a `generated_text_ids` left over from an earlier cell.
if old_args.caption_sample != 'multinomial_sample':
    raise NotImplementedError(
        f"Unsupported caption_sample {old_args.caption_sample!r}; "
        "this cell only implements 'multinomial_sample'."
    )

with torch.no_grad():
    for samples in data_loader:
        indices = samples['indices']
        if hasattr(model, "vision_model"):
            image = samples["video_features"].permute(0, 2, 1, 3, 4).contiguous().cuda()  # BCTHW -> BTCHW
            samples["video_features"] = model.vision_model.forward_features(image, use_checkpoint=old_args.use_checkpoint, cls_at_last=False)  # NLD

        queries = model.map_features(samples)

        generated_text_ids, _ppls = model.generate(
            queries,
            tokenizer,
            do_sample = False,
            max_text_length=old_args.max_gen_tokens,
            num_return_sequences=num_return,
        )

        # The jj indexing below treats the generated rows as `num_return`
        # consecutive rows per input sample, so iterate over samples rather
        # than over every generated row (which would overrun `indices`
        # whenever num_return > 1).
        for j in range(generated_text_ids.shape[0] // num_return):
            sample = dataset.samples[indices[j].item()]
            start_sec = sample['start_sec']
            for k in range(num_return):
                jj = j * num_return + k
                generated_text_str = decode_one(generated_text_ids[jj], tokenizer).strip()
                # With num_return > 1 only the last sequence is kept.
                video_summary[start_sec] = generated_text_str
                print(generated_text_str)
                

In a kitchen, C prepared a meal. C cut vegetables into a plate, and cooked the meal
