In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from dataset import ImageCaption
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from models import *
import numpy as np
from torchtext.data.metrics import bleu_score
from utils import *
from torchtext.data.utils import get_tokenizer
import imageio


# Side length (in pixels) of the square crops produced by both transform pipelines.
input_size = 224
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-channel mean / std handed to Normalize. Defined once so the train and
# validation pipelines cannot drift apart.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

data_transforms = {
    # Training: random crop + random horizontal flip as augmentation.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(NORM_MEAN, NORM_STD)
    ]),
    # Validation: deterministic resize + centre crop.
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize(NORM_MEAN, NORM_STD)
    ]),
}

# Locations of the image-name list and the caption file (same files for both splits).
CAPTION_DIR = "image_captions/image_mapping and captions/12"
IMAGE_NAMES_FILE = CAPTION_DIR + "/image_names.txt"
CAPTIONS_FILE = CAPTION_DIR + "/captions.txt"

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
captions_vocab = build_vocab(CAPTIONS_FILE, tokenizer)

image_cap_dataset_train = ImageCaption(IMAGE_NAMES_FILE, CAPTIONS_FILE,
                                       captions_vocab, data_transforms["train"], "train")
image_cap_dataset_test = ImageCaption(IMAGE_NAMES_FILE, CAPTIONS_FILE,
                                      captions_vocab, data_transforms["val"], "val")

# DataLoader settings shared by both splits.
BATCH_SIZE = 16
NUM_WORKERS = 12

image_cap_train_dataloader = DataLoader(image_cap_dataset_train, batch_size=BATCH_SIZE,
                                        num_workers=NUM_WORKERS, shuffle=True)
image_capt_test_dataloader = DataLoader(image_cap_dataset_test, batch_size=BATCH_SIZE,
                                        num_workers=NUM_WORKERS, shuffle=False)

# Index -> token lookup used to turn predicted ids back into words.
itos = captions_vocab.get_itos()

embed_size = 300
hidden_size = 128
lr = 3e-4
MAX_EPOCHS = 500

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine,
# and .to(device) guarantees the models live on the same device the batches are moved to.
encoder = torch.load("weights/encoder.pth", map_location=device).to(device)
decoder = torch.load("weights/decoder.pth", map_location=device).to(device)


In [10]:
with torch.no_grad():
    # Switch to evaluation mode so that any train-only behaviour inside the
    # models (e.g. dropout, batch-norm statistics updates) is disabled.
    encoder.eval()
    decoder.eval()

    # Each batch yields four items; only the images and the token ids are needed here.
    for images, _, token_numbers, _ in image_capt_test_dataloader:
        images = images.to(device)
        token_numbers = token_numbers.to(device)

        features = encoder(images)
        # The decoder is given the ground-truth token ids alongside the image features.
        outputs = decoder(features, token_numbers)
        print(outputs.shape)

        # Highest-scoring vocabulary index along dim 2 for every position.
        predicted = outputs.argmax(dim=2)

        # .tolist() converts to plain Python ints once per batch instead of
        # indexing `itos` with one tensor element at a time.
        for pred_ids, gt_ids in zip(predicted.tolist(), token_numbers.tolist()):
            caption = [itos[idx] for idx in pred_ids]
            gt_caption = [itos[idx] for idx in gt_ids]

            print("Predicted Caption: ", ' '.join(caption))
            print("Ground Truth Caption: ", ' '.join(gt_caption))

torch.Size([16, 20, 9213])
Predicted Caption:  Three group of people in sitting in front of a building . <eos> <eos> <eos> of of of of of
Ground Truth Caption:  <bos> A group of women are standing in front of a building . <eos> <pad> <pad> <pad> <pad> <pad> <pad>
Predicted Caption:  A people on riders a hill . <eos> <eos> <eos> of of of of of of of of of of
Ground Truth Caption:  <bos> Two friends bike down a hill . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predicted Caption:  A man with watched up a woman . <eos> <eos> <eos> <eos> <eos> . . . . of of of
Ground Truth Caption:  <bos> Eldery man being interviewed after a race . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predicted Caption:  A person dog is running on catch a ball . <eos> <eos> <eos> <eos> . . of of of of
Ground Truth Caption:  <bos> A black dog is poised to catch a Frisbee . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predicted Caption:  A man in a white shirt

In [4]:
# Look up the integer index that the vocabulary assigns to the "<bos>" token.
captions_vocab["<bos>"]

2

In [7]:
with torch.no_grad():
    # set the evaluation mode
    encoder.eval()
    decoder.eval()

    # Each batch yields four items; only the images and the ground-truth token ids are used.
    for images, _, token_numbers, _ in image_capt_test_dataloader:
        images = images.to(device)

        # Forward pass only: encode the images, then let the decoder generate
        # token ids through its sample() method (no ground-truth tokens are fed in).
        features = encoder(images)
        outputs = decoder.sample(features)

        # Convert both id collections to plain Python ints once, then map ids to tokens.
        for sampled_ids, gt_ids in zip(outputs.tolist(), token_numbers.tolist()):
            caption = [itos[idx] for idx in sampled_ids]
            gt_caption = [itos[idx] for idx in gt_ids]
            print(caption)
            print(gt_caption)

        # Only the first batch is inspected.
        break

['<bos>', 'A', 'woman', 'in', 'a', 'black', 'shirt', 'and', 'a', 'woman', 'are', 'sitting', 'on', 'a', 'bench', '.', '<eos>', '<eos>', '<eos>', '.', '<eos>']
['<bos>', 'A', 'group', 'of', 'women', 'are', 'standing', 'in', 'front', 'of', 'a', 'building', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'A', 'man', 'in', 'a', 'red', 'shirt', 'is', 'jumping', 'off', 'a', 'rock', '.', '<eos>', '<eos>', '<eos>', '.', '<eos>', '<eos>', '<eos>', '.']
['<bos>', 'Man', 'and', 'woman', 'cyclists', 'ride', 'pass', 'signs', 'on', 'rural', 'road', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'A', 'man', 'in', 'a', 'red', 'shirt', 'and', 'a', 'woman', 'are', 'sitting', 'on', 'a', 'bench', '.', '<eos>', '<eos>', '<eos>', '.', '<eos>']
['<bos>', 'One', 'older', 'man', 'interviews', 'another', 'in', 'a', 'racing', 'suit', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'A', 'man', 'in'