## Image Captioning with Pytorch

The following contents are modified from MDS DSCI 575 lecture 8 demo

In [1]:
import os, sys, json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.model_selection import KFold

sys.path.append('../../scr/evaluation')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.usc_sim.usc_sim import usc_sim
import subprocess


START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True


In [2]:
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"
        
if AWS:
    root_captioning = "../../data"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [5]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [6]:
# get img path and caption list
# # only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
valid_paths, valid_descriptions, max_length_valid = get_img_info('valid')
test_paths, test_descriptions, max_length_test = get_img_info('test')
sydney_paths, sydney_descriptions, max_length_sydney = get_img_info('sydney')


In [7]:
train_paths.extend(valid_paths.copy())
train_paths = np.array(train_paths)

train_descriptions.extend(valid_descriptions.copy())
train_descriptions = np.array(train_descriptions)

captions = train_descriptions.copy()
max_length_all = max(max_length_train, max_length_valid)
max_length = max_length_all + 2
      
lex = set()
for sen in train_descriptions:
    [lex.update(d.split()) for d in sen]
    
# add a start and stop token at the beginning/end
for v in train_descriptions:
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'
        
print(f'There are {len(train_paths)} images for training') 
print(f'There are {len(lex)} unique words (vocab)')
print(f'The maximum length of captions with start and stop token is {max_length}.')
print(f'The maximum length of captions with start and stop token in test is {max_length_test}.')
print(f'The maximum length of captions with start and stop token in the sydney dataset is {max_length_sydney}.')


There are 10416 images for training
There are 2912 unique words (vocab)
The maximum length of captions with start and stop token is 36.
The maximum length of captions with start and stop token in test is 30.
The maximum length of captions with start and stop token in the sydney dataset is 20.


In [8]:
train_paths[-1]

'../../s3/valid/rsicd_park_33.jpg'

In [9]:
train_descriptions[-1]

array(['startseq a vast artificial lake was built in the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq'],
      dtype='<U184')

### Loading Glove Embeddings

In [10]:
embeddings_index = {} 
path = os.path.join(root_captioning, 'glove.6B.200d.txt') if AWS\
else os.path.join(root_captioning, 'raw', 'glove.6B.200d.txt')

f = open(
    path, 
    encoding="utf-8"
)

for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print(f'Found {len(embeddings_index)} word vectors.')

400000it [00:22, 17988.47it/s]

Found 400000 word vectors.





In [11]:
def get_vocab(descriptions, word_count_threshold=10):

    captions = []
    for val in descriptions:
        for cap in val:
            captions.append(cap)
    print(f'There are {len(captions)} captions')
    
    word_counts = {}
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
    return vocab

def get_word_dict(vocab):
    
    idxtoword = {}
    wordtoidx = {}

    ix = 1
    for w in vocab:
        wordtoidx[w] = ix
        idxtoword[ix] = w
        ix += 1

    return idxtoword, wordtoidx

def get_vocab_size(idxtoword):
    
    print(f'The vocabulary size is {len(idxtoword) + 1}.')
    return len(idxtoword) + 1


def get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx):

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count =0

    for word, i in wordtoidx.items():

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            count += 1
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
            
    print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')            
    print(f'The size of embedding_matrix is {embedding_matrix.shape}')
    return embedding_matrix

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
class CNNModel(nn.Module):

    def __init__(self, cnn_type, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        if cnn_type == 'vgg16':
            self.model = models.vgg16(pretrained=pretrained)

            # remove the last two layers in classifier
            self.model.classifier = nn.Sequential(
              *list(self.model.classifier.children())[:-2]
            )
            self.input_size = 224     

        # inception v3 expects (299, 299) sized images
        elif cnn_type == 'inception_v3':
            self.model = models.inception_v3(pretrained=pretrained)
            # remove the classification layer
            self.model.fc = nn.Identity()

            # turn off auxiliary output
            self.model.aux_logits = False
            self.input_size = 299

        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
            # set the model to evaluation model
            self.model.eval()

        return self.model(img_input)

In [14]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
              'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train

        self.dropout = nn.Dropout(p=0.5)

        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
 

    def forward(self, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # embed the captions
        embedding = self.dropout(self.embedding(captions))

        outputs, (h, c) = self.lstm(embedding)

        return outputs, (h, c)



In [15]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        feature_size: int
            the number of features in the image matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        if cnn_type == 'vgg16':
            self.feature_size = 4096
        elif cnn_type == 'inception_v3':
            self.feature_size = 2048
        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")  

        self.decoder = RNNModel(
            vocab_size, 
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )
        
        self.dropout = nn.Dropout(p=0.5)
        self.dense1 = nn.Linear(self.feature_size, hidden_size) 
        self.relu1 = nn.ReLU()
          
        self.dense2 = nn.Linear(hidden_size, hidden_size) 
        self.relu2 = nn.ReLU()
        self.dense3 = nn.Linear(hidden_size, vocab_size) 

    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor
            the image feature matrix
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        img_features =\
        self.relu1(
            self.dense1(
                self.dropout(
                    img_features
                )
            )
        )

        decoder_out, _ = self.decoder(captions)

        # add up decoder outputs and image features
        outputs =\
        self.dense3(
            self.relu2(
                self.dense2(
                    decoder_out.add(
                        (img_features.view(img_features.size(0), 1, -1))\
                        .repeat(1, decoder_out.size(1), 1)
                    )
                )
            )
        )

        return outputs

### Train the Neural Network

In [16]:
def train(model, iterator, optimizer, criterion, clip, vocab_size):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs = model(
            img_features.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        )
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [17]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
        ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img = self.imgs[idx]
        img_features, captions = [], []
        for desc in self.descriptions[idx]:
            # convert each word into a list of sequences.
            seq = [self.wordtoidx[word] for word in desc.split(' ')
                  if word in self.wordtoidx]
            # pad the sequence with 0 on the right side
            in_seq = np.pad(
                seq, 
                (0, max_length - len(seq)),
                mode='constant',
                constant_values=(0, 0)
                )

            img_features.append(img)
            captions.append(in_seq)
    
        return (img_features, captions)


In [18]:
def my_collate(batch):
    """
    Processes the batch to return from the dataloader

    Parameters:
    -----------
    batch: tuple
      a batch from the Dataset

    Return:
    --------
    list
        [image feature matrix, captions, the length of each caption]
    """  

    img_features = [item[0] for item in batch]
    captions = [item[1] for item in batch]

    img_features = torch.FloatTensor(list(chain(*img_features)))
    captions = torch.LongTensor(list(chain(*captions)))

    return [img_features, captions]

In [19]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [20]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape)
    # Call model to extract the smaller feature set for the image.
    x = model(img.to(device), False) 
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.squeeze(x)
    return x

In [21]:
def extract_img_features(img_paths, model):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    start = time()
    img_features = []

    for image_path in img_paths:
        img_features.append(
            encode_image(model, image_path).cpu().data.numpy()
    )

    print(f"\nGenerating set took: {hms_string(time()-start)}")

    return img_features

In [22]:
def get_train_test(
    encoder,
    train_paths,
    test_paths,
    sydney_paths
):

    train_img_features = extract_img_features(
        train_paths,
        encoder
    )

    test_img_features = extract_img_features(
        test_paths,
        encoder
    )
    
    sydney_img_features = extract_img_features(
        sydney_paths,
        encoder
    )
    
    return train_img_features, test_img_features, sydney_img_features

def get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
):
    train_dataset = SampleDataset(
        train_descriptions,
        train_img_features,
        wordtoidx,
        max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size,
        collate_fn=my_collate
    )
    
    return train_loader

def train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    cnn_type='inception_v3',
    hidden_size=256,
):

    caption_model = CaptionModel(
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=hidden_size,
        embedding_matrix=embedding_matrix, 
        embedding_train=True
    )

    init_weights(
        caption_model,
        embedding_pretrained=True
    )

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(
        caption_model.parameters(), 
        lr=0.01
    )

    clip = 1
    start = time()

    for i in tqdm(range(EPOCHS * 6)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)

    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 6)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)
    return caption_model

In [23]:
def generateCaption(
    model, 
    img_features,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    in_text = START

    for i in range(max_length):

        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = np.pad(sequence, (0, max_length - len(sequence)),
                          mode='constant', constant_values=(0, 0))
        model.eval()
        yhat = model(
            torch.FloatTensor(img_features)\
            .view(-1, model.feature_size).to(device),
            torch.LongTensor(sequence).view(-1, max_length).to(device)
        )

        yhat = yhat.view(-1, vocab_size).argmax(1)
        word = idxtoword[yhat.cpu().data.numpy()[i]]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1 : -1]
    final = ' '.join(final)
    return final

### Evaluation

In [24]:

def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    """
    # download stanford nlp library
    subprocess.call(['../../scr/evaluation/get_stanford_models.sh'])
    
    # format the inputs
    gts = {}
    res = {}

    for imgId in range(len(ref_data)):
        caption_list_sel = []
        for i in range(5):
            lst = {}
            lst['caption'] = ref_data[imgId][i]
            lst['image_id'] = imgId
            lst['id'] = i
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        res[imgId] = [{'caption': results[imgId]}]
        
    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts  = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    
    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),  
        ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...'%(scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
        else:
            score_dict[method] = score
            
    return score_dict


In [25]:
def evaluate_results(
    test_img_features, 
    model,
    ref,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    # generate results
    print('Generating captions...')
    results = {}
    for n in range(len(test_img_features)):
        img_features = test_img_features[n]
        generated = generateCaption(
            model, 
            img_features,
            max_length,
            vocab_size,
            wordtoidx,
            idxtoword
        )
        results[n] = generated
        
    model_score = eval_model(ref, results)

    return model_score

### Cross validation

In [26]:
cnn_type = 'vgg16'
encoder = CNNModel(cnn_type, pretrained=True)
encoder.to(device)

CNNModel(
  (model): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      (16):

In [27]:
print(f'{len(train_paths)} images for training and {len(test_paths)} images for testing.')

vocab = get_vocab(train_descriptions, word_count_threshold=10)
idxtoword, wordtoidx = get_word_dict(vocab)
vocab_size = get_vocab_size(idxtoword)
embedding_dim = 200
embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

print(f'Preparing dataloader...')
train_img_features, test_img_features, sydney_img_features = get_train_test(encoder, train_paths, test_paths, sydney_paths)

train_loader = get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=1000
)

print(f'Training...')
caption_model = train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    cnn_type=cnn_type
)


10416 images for training and 2605 images for testing.
There are 52080 captions
preprocessed words 2917 ==> 991
The vocabulary size is 992.
890 out of 992 words are found in the pre-trained matrix.
The size of embedding_matrix is (992, 200)
Preparing dataloader...

Generating set took: 0:01:11.37

Generating set took: 0:00:17.95


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:04.41
Training...


  2%|▏         | 1/60 [00:28<28:27, 28.94s/it]

6.986705910075795


  3%|▎         | 2/60 [00:58<28:10, 29.14s/it]

5.641460332003507


  5%|▌         | 3/60 [01:28<27:56, 29.42s/it]

4.622280207547274


  7%|▋         | 4/60 [01:59<27:47, 29.77s/it]

4.134577642787587


  8%|▊         | 5/60 [02:29<27:19, 29.81s/it]

3.300137064673684


 10%|█         | 6/60 [02:58<26:49, 29.80s/it]

2.649696566841819


 12%|█▏        | 7/60 [03:27<26:08, 29.59s/it]

2.2037516940723765


 13%|█▎        | 8/60 [03:56<25:29, 29.41s/it]

1.883839563889937


 15%|█▌        | 9/60 [04:26<25:03, 29.48s/it]

1.6877753951332786


 17%|█▋        | 10/60 [04:56<24:42, 29.65s/it]

1.5638676448301836


 18%|█▊        | 11/60 [05:27<24:35, 30.11s/it]

1.4690549265254627


 20%|██        | 12/60 [05:57<24:03, 30.07s/it]

1.3992008404298262


 22%|██▏       | 13/60 [06:26<23:20, 29.79s/it]

1.346172571182251


 23%|██▎       | 14/60 [06:56<22:40, 29.57s/it]

1.305475722659718


 25%|██▌       | 15/60 [07:24<22:00, 29.35s/it]

1.2694279497320002


 27%|██▋       | 16/60 [07:53<21:25, 29.21s/it]

1.2390934228897095


 28%|██▊       | 17/60 [08:22<20:52, 29.13s/it]

1.2136272517117588


 30%|███       | 18/60 [08:51<20:20, 29.05s/it]

1.1878531737761064


 32%|███▏      | 19/60 [09:20<19:48, 29.00s/it]

1.1667547876184636


 33%|███▎      | 20/60 [09:49<19:19, 28.99s/it]

1.149034245447679


 35%|███▌      | 21/60 [10:18<18:55, 29.13s/it]

1.1306906992738897


 37%|███▋      | 22/60 [10:49<18:48, 29.69s/it]

1.1128575151616877


 38%|███▊      | 23/60 [11:23<19:00, 30.82s/it]

1.0956899144432761


 40%|████      | 24/60 [11:53<18:25, 30.71s/it]

1.0793440829623828


 42%|████▏     | 25/60 [12:22<17:36, 30.18s/it]

1.0670805194161155


 43%|████▎     | 26/60 [12:51<16:51, 29.76s/it]

1.0533976609056646


 45%|████▌     | 27/60 [13:19<16:07, 29.32s/it]

1.0436375086957759


 47%|████▋     | 28/60 [13:47<15:21, 28.78s/it]

1.0344782146540554


 48%|████▊     | 29/60 [14:14<14:39, 28.37s/it]

1.0258034846999429


 50%|█████     | 30/60 [14:42<14:02, 28.08s/it]

1.0181232365694912


 52%|█████▏    | 31/60 [15:09<13:30, 27.95s/it]

1.008523328737779


 53%|█████▎    | 32/60 [15:38<13:05, 28.05s/it]

0.9980599338358099


 55%|█████▌    | 33/60 [16:06<12:39, 28.13s/it]

0.9863438931378451


 57%|█████▋    | 34/60 [16:35<12:18, 28.41s/it]

0.976820474321192


 58%|█████▊    | 35/60 [17:04<11:52, 28.51s/it]

0.9674455740235068


 60%|██████    | 36/60 [17:31<11:16, 28.19s/it]

0.9628600803288546


 62%|██████▏   | 37/60 [17:58<10:42, 27.94s/it]

0.9600408023053949


 63%|██████▎   | 38/60 [18:26<10:11, 27.80s/it]

0.9547066363421354


 65%|██████▌   | 39/60 [18:53<09:41, 27.68s/it]

0.9472402496771379


 67%|██████▋   | 40/60 [19:21<09:12, 27.63s/it]

0.9426986250010404


 68%|██████▊   | 41/60 [19:48<08:43, 27.55s/it]

0.9340921477837996


 70%|███████   | 42/60 [20:16<08:14, 27.50s/it]

0.9216468442570079


 72%|███████▏  | 43/60 [20:43<07:47, 27.49s/it]

0.9150134758515791


 73%|███████▎  | 44/60 [21:10<07:19, 27.45s/it]

0.9088022167032416


 75%|███████▌  | 45/60 [21:38<06:51, 27.46s/it]

0.9053664261644537


 77%|███████▋  | 46/60 [22:08<06:34, 28.20s/it]

0.9021829475056041


 78%|███████▊  | 47/60 [22:38<06:15, 28.92s/it]

0.8977626941420815


 80%|████████  | 48/60 [23:07<05:47, 28.95s/it]

0.8896228725259955


 82%|████████▏ | 49/60 [23:36<05:18, 28.91s/it]

0.8817734068090265


 83%|████████▎ | 50/60 [24:04<04:45, 28.52s/it]

0.8767599029974504


 85%|████████▌ | 51/60 [24:32<04:14, 28.25s/it]

0.8740483305670999


 87%|████████▋ | 52/60 [24:59<03:44, 28.05s/it]

0.8734971176494252


 88%|████████▊ | 53/60 [25:27<03:16, 28.04s/it]

0.8719697377898477


 90%|█████████ | 54/60 [25:55<02:48, 28.06s/it]

0.8736356496810913


 92%|█████████▏| 55/60 [26:23<02:20, 28.02s/it]

0.8732447570020502


 93%|█████████▎| 56/60 [26:51<01:51, 27.98s/it]

0.8702158656987277


 95%|█████████▌| 57/60 [27:19<01:23, 27.93s/it]

0.8657121387394991


 97%|█████████▋| 58/60 [27:47<00:55, 27.92s/it]

0.855675301768563


 98%|█████████▊| 59/60 [28:15<00:27, 27.93s/it]

0.8466085845773871


100%|██████████| 60/60 [28:43<00:00, 28.72s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

0.8382724902846597


  2%|▏         | 1/60 [00:27<27:25, 27.88s/it]

0.8248949213461443


  3%|▎         | 2/60 [00:55<27:00, 27.95s/it]

0.8073273030194369


  5%|▌         | 3/60 [01:25<26:51, 28.27s/it]

0.7950521924278953


  7%|▋         | 4/60 [01:53<26:31, 28.42s/it]

0.7894403771920637


  8%|▊         | 5/60 [02:23<26:27, 28.86s/it]

0.7857033772902056


 10%|█         | 6/60 [02:53<26:19, 29.24s/it]

0.7833429195664146


 12%|█▏        | 7/60 [03:21<25:29, 28.85s/it]

0.7809681729836897


 13%|█▎        | 8/60 [03:49<24:46, 28.59s/it]

0.7803738442334262


 15%|█▌        | 9/60 [04:17<24:07, 28.38s/it]

0.7794982465830717


 17%|█▋        | 10/60 [04:45<23:33, 28.27s/it]

0.778536704453555


 18%|█▊        | 11/60 [05:13<22:58, 28.14s/it]

0.778239515694705


 20%|██        | 12/60 [05:41<22:27, 28.08s/it]

0.7764569304206155


 22%|██▏       | 13/60 [06:09<21:56, 28.02s/it]

0.7769675254821777


 23%|██▎       | 14/60 [06:37<21:28, 28.01s/it]

0.7758445902304216


 25%|██▌       | 15/60 [07:05<20:57, 27.95s/it]

0.775017191063274


 27%|██▋       | 16/60 [07:32<20:26, 27.86s/it]

0.7747237411412325


 28%|██▊       | 17/60 [08:00<19:59, 27.90s/it]

0.7739710699428212


 30%|███       | 18/60 [08:28<19:29, 27.84s/it]

0.7735469558022239


 32%|███▏      | 19/60 [08:56<18:58, 27.77s/it]

0.772324724630876


 33%|███▎      | 20/60 [09:23<18:28, 27.72s/it]

0.7729588367722251


 35%|███▌      | 21/60 [09:51<18:01, 27.73s/it]

0.7703528350049799


 37%|███▋      | 22/60 [10:19<17:37, 27.83s/it]

0.7718556414950978


 38%|███▊      | 23/60 [10:47<17:10, 27.85s/it]

0.7712763710455461


 40%|████      | 24/60 [11:15<16:43, 27.89s/it]

0.7721500830216841


 42%|████▏     | 25/60 [11:43<16:17, 27.93s/it]

0.7701637853275646


 43%|████▎     | 26/60 [12:11<15:49, 27.93s/it]

0.7703721794215116


 45%|████▌     | 27/60 [12:39<15:20, 27.90s/it]

0.7690233371474526


 47%|████▋     | 28/60 [13:07<14:53, 27.91s/it]

0.7690224972638217


 48%|████▊     | 29/60 [13:34<14:25, 27.90s/it]

0.768367740240964


 50%|█████     | 30/60 [14:02<13:54, 27.82s/it]

0.7688786441629584


 52%|█████▏    | 31/60 [14:30<13:24, 27.74s/it]

0.7681280482899059


 53%|█████▎    | 32/60 [14:57<12:56, 27.72s/it]

0.7683824788440358


 55%|█████▌    | 33/60 [15:25<12:31, 27.84s/it]

0.7667859792709351


 57%|█████▋    | 34/60 [15:53<12:05, 27.90s/it]

0.7676401896910234


 58%|█████▊    | 35/60 [16:22<11:38, 27.95s/it]

0.767101379958066


 60%|██████    | 36/60 [16:49<11:09, 27.89s/it]

0.765054009177468


 62%|██████▏   | 37/60 [17:17<10:39, 27.80s/it]

0.7665565447373823


 63%|██████▎   | 38/60 [17:45<10:11, 27.80s/it]

0.766265099698847


 65%|██████▌   | 39/60 [18:13<09:44, 27.84s/it]

0.7651804252104326


 67%|██████▋   | 40/60 [18:41<09:17, 27.88s/it]

0.7663730653849515


 68%|██████▊   | 41/60 [19:08<08:49, 27.87s/it]

0.7655451460318132


 70%|███████   | 42/60 [19:36<08:22, 27.91s/it]

0.7641839710148898


 72%|███████▏  | 43/60 [20:04<07:54, 27.91s/it]

0.7639652436429804


 73%|███████▎  | 44/60 [20:32<07:26, 27.93s/it]

0.7638815424659036


 75%|███████▌  | 45/60 [21:00<06:58, 27.92s/it]

0.7637413523413918


 77%|███████▋  | 46/60 [21:28<06:31, 27.95s/it]

0.763420126654885


 78%|███████▊  | 47/60 [21:56<06:03, 27.93s/it]

0.7635381872003729


 80%|████████  | 48/60 [22:24<05:35, 27.92s/it]

0.7632543661377647


 82%|████████▏ | 49/60 [22:52<05:07, 27.99s/it]

0.7635138847611167


 83%|████████▎ | 50/60 [23:20<04:39, 28.00s/it]

0.763086969202215


 85%|████████▌ | 51/60 [23:48<04:11, 27.97s/it]

0.7639603614807129


 87%|████████▋ | 52/60 [24:16<03:43, 27.94s/it]

0.7627694390036843


 88%|████████▊ | 53/60 [24:44<03:15, 27.87s/it]

0.7620602412657305


 90%|█████████ | 54/60 [25:12<02:47, 27.90s/it]

0.7619082060727206


 92%|█████████▏| 55/60 [25:40<02:19, 27.90s/it]

0.7615184350447222


 93%|█████████▎| 56/60 [26:08<01:51, 27.94s/it]

0.7610311345620588


 95%|█████████▌| 57/60 [26:36<01:23, 27.95s/it]

0.7615412961352955


 97%|█████████▋| 58/60 [27:03<00:55, 27.93s/it]

0.761596983129328


 98%|█████████▊| 59/60 [27:31<00:27, 27.85s/it]

0.7612750476056879


100%|██████████| 60/60 [27:59<00:00, 27.99s/it]

0.75925213098526





In [28]:
model_score = {}

In [29]:
model_score['test'] = evaluate_results(
    test_img_features, 
    caption_model,
    test_descriptions,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
)

Generating captions...
tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [30]:
model_score['sydney'] = evaluate_results(
    sydney_img_features, 
    caption_model,
    sydney_descriptions,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
)

Generating captions...
tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [31]:
model_score

{'test': {'Bleu_1': 0.6476930512696343,
  'Bleu_2': 0.5231829864964733,
  'Bleu_3': 0.4404955902262122,
  'Bleu_4': 0.38087148045821484,
  'METEOR': 0.30011708153642863,
  'ROUGE_L': 0.5526081116162016,
  'CIDEr': 2.125320201509738,
  'SPICE': 0.3995357754273313,
  'USC_similarity': 0.611530009988631},
 'sydney': {'Bleu_1': 0.4526575944638088,
  'Bleu_2': 0.21959057943509988,
  'Bleu_3': 0.11705216703768609,
  'Bleu_4': 0.07174433505019233,
  'METEOR': 0.14502371321555516,
  'ROUGE_L': 0.28994892087109475,
  'CIDEr': 0.2103049305235536,
  'SPICE': 0.11863333344610753,
  'USC_similarity': 0.45795351105998916}}

In [32]:
tag = '9.1.3.2'
with open(f'{root_captioning}/fz_notebooks/final_results_n{tag}.json', 'w') as fp:
    json.dump(model_score, fp)