In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append(".")

In [11]:
from torchvision import transforms
from torch.utils.data import DataLoader
from src.data_loader import get_data_loader
from src.utils import Config

from PIL import Image
import numpy as np

import os
import torch
from src.model import EncoderCNN, DecoderRNN

In [18]:
def predict_image_caption(image_file: str, 
                          transform_image: transforms, 
                          test_data_loader: DataLoader,
                          model_encoder: EncoderCNN,
                          model_decoder: DecoderRNN, 
                          device):
    
    assert os.path.exists(image_file), f"Image file: '{image_file}' doesn't not exist."
    PIL_image = Image.open(image_file).convert("RGB")
    transformed_image = transform_image(PIL_image)  
    transformed_image = transformed_image.to(device)
    transformed_image = transformed_image.unsqueeze(dim=0) # convert size [3, 224, 224] -> [1, 3, 224, 224]
    features = model_encoder(transformed_image).unsqueeze(1)
    output = model_decoder.predict_token_ids(features)    
    sentence = process_predicted_tokens(output, test_data_loader)
    
    return sentence

def process_predicted_tokens(output:list, test_data_loader: DataLoader):
    """Map list of token ids to list of corresponding words/tokens
       using the vocabulary dictionary idx2word. 

    :param output: list of predicted token ids
    :type output: list
    :return: list of tokens
    :rtype: list
    """
    words_sequence = []
    
    for i in output:
        if (i == 1):
            continue
        words_sequence.append(test_data_loader.dataset.vocab.idx2word[i])
    
    # words_sequence = words_sequence[1:-1] 
    sentence = ' '.join(words_sequence) 
    # sentence = sentence.capitalize()
    
    return sentence

def main():

    transform_test = transforms.Compose([transforms.Resize(256),
                                        transforms.CenterCrop(224),
                                        transforms.ToTensor(),
                                        transforms.Normalize((0.485, 0.456, 0.406),
                                                            (0.229, 0.224, 0.225))
                                        ])

    config = Config("config.yaml")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    test_data_loader = get_data_loader(transform=transform_test,
                                   caption_file=config.CAPTION_FILE,
                                   image_id_file=config.IMAGE_ID_FILE_TEST, 
                                   image_folder=config.IMAGE_DATA_DIR, 
                                   config=config,
                                   mode='test')

    # TODO #2: Specify the saved models to load.
    encoder_file = f"{config.MODEL_DIR}encoder-10.pkl"
    decoder_file = f"{config.MODEL_DIR}decoder-10.pkl"
    
    assert os.path.exists(encoder_file), f"Encoder model: '{encoder_file}' doesn't not exist."
    assert os.path.exists(decoder_file), f"Decoder model: '{decoder_file}' doesn't not exist."

    # TODO #3: Select appropriate values for the Python variables below.
    embed_size = config.IMG_EMBED_SIZE
    hidden_size = config.HIDDEN_SIZE
    vocab_size = len(test_data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()

    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    # map location helps in save and load accross devices (gpu/cpu)
    #torch.load('my_file.pt', map_location=lambda storage, loc: storage)
    encoder.load_state_dict(torch.load(encoder_file, map_location=lambda storage, loc: storage), strict=False)
    decoder.load_state_dict(torch.load(decoder_file, map_location=lambda storage, loc: storage), strict=False)
    print("Model loaded...")

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)
    
    test_image_file = "asset/test_image/man_run.png"
    # test_image_file = "data/test_images/3234115903_f4dfc8fc75.jpg"
    # test_image_file = "../data/test_images/241347760_d44c8d3a01.jpg"
    pred_caption = predict_image_caption(test_image_file, 
                                         transform_image=transform_test, 
                                         test_data_loader=test_data_loader,
                                         model_encoder=encoder, 
                                         model_decoder=decoder, 
                                         device=device)
    
    print(f"Predicted caption: {pred_caption}")

In [19]:
    
main()

Vocabulary successfully loaded from external file output/vocab.pkl...


RuntimeError: Error(s) in loading state_dict for DecoderRNN:
	size mismatch for embedding_layer.weight: copying a param with shape torch.Size([777, 300]) from checkpoint, the shape in current model is torch.Size([2549, 300]).
	size mismatch for linear.weight: copying a param with shape torch.Size([777, 512]) from checkpoint, the shape in current model is torch.Size([2549, 512]).
	size mismatch for linear.bias: copying a param with shape torch.Size([777]) from checkpoint, the shape in current model is torch.Size([2549]).

# Pick random test image

In [36]:
from src.utils import get_training_data
import numpy as np
import shutil

In [21]:
config = Config("config.yaml")

  config = yaml.load(f,) #Loader=yaml.FullLoader


In [22]:
df = get_training_data(config.IMAGE_ID_FILE_TEST, config.CAPTION_FILE)

In [50]:
def pick_random_test_image(df):
    idx = np.random.randint(low=0, high=len(df))
    image_id = df.iloc[idx]["IMAGE_ID"]
    caption = df.iloc[idx]["CAPTION"]
    return image_id, caption


def copy_file_to_correct_folder(image_id: str):
    file_src_path = f"{config.IMAGE_DATA_DIR}{image_id}"
    file_destination_path = f"asset/test_image/{image_id}"
    shutil.copy(file_src_path, file_destination_path)

In [51]:
image_id, caption = pick_random_test_image(df)

In [52]:
copy_file_to_correct_folder(image_id)

In [53]:
caption

'An elderly woman rides a bicycle along a city street .'

# TODO:

- [ ] Pick random 10-20 test image and available `ground truth` caption.
- [ ] Generate prediction and create a simple table:
        IMAGE_ID | TRUE_CAPTION | PRED_CAPTION