# Import necessary libraries


In [1]:
import torch

from config import TrainConfig as C
from loader.MSVD import MSVD
from models.abd_transformer import ABDTransformer
from utils import dict_to_cls

# Load checkpoint and config


In [None]:
# Read the saved training state onto the CPU, then convert its stored
# 'config' entry with dict_to_cls so later cells can use attribute access
# (config.feat.size, config.transformer.d_model, ...).
# NOTE(review): depending on the torch version, torch.load may unpickle
# arbitrary objects -- only open checkpoints that come from a trusted source.
ckpt_path = "checkpoints/best.ckpt"
checkpoint = torch.load(ckpt_path, map_location="cpu")
config = dict_to_cls(checkpoint['config'])

In [3]:
# Build the MSVD corpus from the configuration restored out of the checkpoint.
# The corpus exposes the vocabulary (`corpus.vocab`) that the model cell needs.
# (MSVD is imported in the notebook's top import cell.)
corpus = MSVD(config)

# Build Models


In [4]:
# Build Models
vocab = corpus.vocab

# Positional constructor arguments shared by both construction attempts below.
model_args = (
    vocab,
    config.feat.size,
    config.transformer.d_model,
    config.transformer.d_ff,
    config.transformer.n_heads,
    config.transformer.n_layers,
    config.transformer.dropout,
    config.feat.feature_mode,
)

try:
    # Preferred path: pass `select_num` along as well.
    model = ABDTransformer(*model_args,
                           n_heads_big=config.transformer.n_heads_big,
                           select_num=config.transformer.select_num)
except (AttributeError, KeyError, TypeError):
    # Fall back only when `select_num` cannot be looked up on the config or
    # is not accepted by the constructor. Any other failure (and Ctrl-C)
    # now propagates instead of being silently masked by a second attempt.
    model = ABDTransformer(*model_args,
                           n_heads_big=config.transformer.n_heads_big)

# Restore the trained weights stored under the 'abd_transformer' key.
model.load_state_dict(checkpoint['abd_transformer'])
model.device = "cpu"

# Move model to cpu
model = model.to("cpu")

# Load extracted features


In [None]:
# Read the four pre-extracted feature tensors from disk, mapped onto the CPU.
image_feats = torch.load('features/image_feats.pt', map_location="cpu")
motion_feats = torch.load('features/motion_feats.pt', map_location="cpu")
obect_feats = torch.load('features/object_feats.pt', map_location="cpu")
rel_feats = torch.load('features/rel_feats.pt', map_location="cpu")

# Note: an earlier revision of this cell added a leading dimension of 1 to
# every tensor with `.unsqueeze(0)` and additionally sliced the motion
# features with `[:, :, ::2]`. That step is disabled here.

# Report the shape of each loaded tensor as a quick sanity check.
for shape_label, feat_tensor in (
    ("Image features shape:", image_feats),
    ("Motion features shape:", motion_feats),
    ("Object features shape:", obect_feats),
    ("Relation features shape:", rel_feats),
):
    print(shape_label, feat_tensor.shape)

Image features shape: torch.Size([1, 50, 1536])
Motion features shape: torch.Size([1, 50, 1024])
Object features shape: torch.Size([1, 50, 1028])
Relation features shape: torch.Size([1, 50, 300])


# Inference with beam search


In [6]:
%%time
model.eval()
beam_size = config.beam_size
max_len = config.loader.max_caption_len
feature_mode = config.feat.feature_mode
feats = (image_feats, motion_feats, obect_feats, rel_feats)
with torch.no_grad():
    r2l_captions, l2r_captions = model.beam_search_decode(feats, beam_size, max_len)
    # r2l_captions = [idxs_to_sentence(caption, vocab.idx2word, BOS_idx) for caption in r2l_captions]
    l2r_captions = [" ".join(caption[0].value) for caption in l2r_captions]
    r2l_captions = [" ".join(caption[0].value) for caption in r2l_captions]
    
    print(f"Left to Right Captions: {l2r_captions}")

Left to Right Captions: ['the person is doing the something']
CPU times: user 6.62 s, sys: 0 ns, total: 6.62 s
Wall time: 1.67 s
