## Image Captioning with Pytorch

The following contents are modified from MDS DSCI 575 lecture 8 demo

In [1]:
import os, sys, json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.model_selection import KFold

sys.path.append('../../scr/evaluation')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.usc_sim.usc_sim import usc_sim
import subprocess

START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True


In [2]:
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"
        
if AWS:
    root_captioning = "../../data"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [5]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [6]:
# get img path and caption list
# # only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
valid_paths, valid_descriptions, max_length_valid = get_img_info('valid')
test_paths, test_descriptions, max_length_test = get_img_info('test')
sydney_paths, sydney_descriptions, max_length_sydney = get_img_info('sydney')


In [7]:
train_paths.extend(valid_paths.copy())
train_paths = np.array(train_paths)

train_descriptions.extend(valid_descriptions.copy())
train_descriptions = np.array(train_descriptions)

captions = train_descriptions.copy()
max_length_all = max(max_length_train, max_length_valid)
max_length = max_length_all + 2
      
lex = set()
for sen in train_descriptions:
    [lex.update(d.split()) for d in sen]
    
# add a start and stop token at the beginning/end
for v in train_descriptions:
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'
        
print(f'There are {len(train_paths)} images for training') 
print(f'There are {len(lex)} unique words (vocab)')
print(f'The maximum length of captions with start and stop token is {max_length}.')
print(f'The maximum length of captions with start and stop token in test is {max_length_test}.')
print(f'The maximum length of captions with start and stop token in the sydney dataset is {max_length_sydney}.')


There are 10416 images for training
There are 2912 unique words (vocab)
The maximum length of captions with start and stop token is 36.
The maximum length of captions with start and stop token in test is 30.
The maximum length of captions with start and stop token in the sydney dataset is 20.


In [8]:
train_paths[-1]

'../../s3/valid/rsicd_park_33.jpg'

In [9]:
train_descriptions[-1]

array(['startseq a vast artificial lake was built in the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq'],
      dtype='<U184')

### Loading Wikipedia2vec Embeddings

In [10]:
# read the embedding matrix 
with open(f'{root_captioning}/enwiki_20180420_2338_words_500d.json', 'r', encoding='utf-8') as file:
    embeddings_index = json.load(file)

In [11]:
def get_vocab(descriptions, word_count_threshold=10):

    captions = []
    for val in descriptions:
        for cap in val:
            captions.append(cap)
    print(f'There are {len(captions)} captions')
    
    word_counts = {}
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
    return vocab

def get_word_dict(vocab):
    
    idxtoword = {}
    wordtoidx = {}

    ix = 1
    for w in vocab:
        wordtoidx[w] = ix
        idxtoword[ix] = w
        ix += 1

    return idxtoword, wordtoidx

def get_vocab_size(idxtoword):
    
    print(f'The vocabulary size is {len(idxtoword) + 1}.')
    return len(idxtoword) + 1


def get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx):

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count =0

    for word, i in wordtoidx.items():

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            count += 1
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
            
    print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')            
    print(f'The size of embedding_matrix is {embedding_matrix.shape}')
    return embedding_matrix

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
class CNNModel(nn.Module):

    def __init__(self, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        # inception v3 expects (299, 299) sized images
        self.model = models.inception_v3(pretrained=pretrained, aux_logits=False)
        # remove the classification layer
        self.model =\
        nn.Sequential(
            *(list(self.model.children())[: 3]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[3: 5]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[5: -1])
        )

        self.input_size = 299

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
          # set the model to evaluation model
          self.model.eval()

        # N x 3 x 299 x 299
        features = self.model(img_input)
        # N x 2048 x 8 x 8

        return features

In [14]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
              'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train

        self.dropout = nn.Dropout(p=0.5)

        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
 

    def forward(self, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # embed the captions
        embedding = self.dropout(self.embedding(captions))

        outputs, (h, c) = self.lstm(embedding)

        return outputs, (h, c)



In [15]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        feature_size: int
            the number of features in the image matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        if cnn_type == 'vgg16':
            self.feature_size = 4096
        elif cnn_type == 'inception_v3':
            self.feature_size = 2048
        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")  

        self.decoder = RNNModel(
            vocab_size, 
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )
        
        self.dropout = nn.Dropout(p=0.5)
        self.dense1 = nn.Linear(self.feature_size, hidden_size) 
        self.relu1 = nn.ReLU()
          
        self.dense2 = nn.Linear(hidden_size, hidden_size) 
        self.relu2 = nn.ReLU()
        self.dense3 = nn.Linear(hidden_size, vocab_size) 

    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor
            the image feature matrix
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        img_features =\
        self.relu1(
            self.dense1(
                self.dropout(
                    img_features
                )
            )
        )

        decoder_out, _ = self.decoder(captions)

        # add up decoder outputs and image features
        outputs =\
        self.dense3(
            self.relu2(
                self.dense2(
                    decoder_out.add(
                        (img_features.view(img_features.size(0), 1, -1))\
                        .repeat(1, decoder_out.size(1), 1)
                    )
                )
            )
        )

        return outputs

### Train the Neural Network

In [16]:
def train(model, iterator, optimizer, criterion, clip, vocab_size):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs = model(
            img_features.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        )
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [17]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
    ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        # return len(self.descriptions)
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img = self.imgs[idx // 5]
        # convert each word into a list of sequences.
        seq = [self.wordtoidx[word] for word 
               in self.descriptions[idx // 5][idx % 5].split(' ')
               if word in self.wordtoidx]
        # pad the sequence with 0 on the right side
        in_seq = np.pad(
            seq, 
            (0, max_length - len(seq)),
            mode='constant',
            constant_values=(0, 0)
            )

        return img, in_seq


In [18]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [19]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape)
    # Call model to extract the smaller feature set for the image.
    x = model(img.to(device), False) 
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.squeeze(x)
    return x

In [20]:
def extract_img_features(img_paths, model):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    start = time()
    img_features = []

    for image_path in img_paths:
        img_features.append(
            F.adaptive_avg_pool2d(
                (encode_image(model, image_path).cpu()), 
                (1, 1)
            ).squeeze().data.numpy()
        )

    print(f"\nGenerating set took: {hms_string(time()-start)}")

    return img_features

In [21]:
def get_train_test(
    encoder,
    train_paths,
    test_paths,
    sydney_paths
):

    train_img_features = extract_img_features(
        train_paths,
        encoder
    )

    test_img_features = extract_img_features(
        test_paths,
        encoder
    )
    
    sydney_img_features = extract_img_features(
        sydney_paths,
        encoder
    )
    
    return train_img_features, test_img_features, sydney_img_features

def get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
):
    train_dataset = SampleDataset(
        train_descriptions,
        train_img_features,
        wordtoidx,
        max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size
    )
    
    return train_loader

def train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    cnn_type='inception_v3',
    hidden_size=256,
):

    caption_model = CaptionModel(
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=hidden_size,
        embedding_matrix=embedding_matrix, 
        embedding_train=True
    )

    init_weights(
        caption_model,
        embedding_pretrained=True
    )

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(
        caption_model.parameters(), 
        lr=0.01
    )

    clip = 1
    start = time()

    for i in tqdm(range(EPOCHS * 7)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)

    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 7)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)
    return caption_model

In [22]:
def generateCaption(
    model, 
    img_features,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    in_text = START

    for i in range(max_length):

        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = np.pad(sequence, (0, max_length - len(sequence)),
                          mode='constant', constant_values=(0, 0))
        model.eval()
        yhat = model(
            torch.FloatTensor(img_features)\
            .view(-1, model.feature_size).to(device),
            torch.LongTensor(sequence).view(-1, max_length).to(device)
        )

        yhat = yhat.view(-1, vocab_size).argmax(1)
        word = idxtoword[yhat.cpu().data.numpy()[i]]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1 : -1]
    final = ' '.join(final)
    return final

### Evaluation

In [23]:


def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    """
    # download stanford nlp library
    subprocess.call(['../../scr/evaluation/get_stanford_models.sh'])
    
    # format the inputs
    gts = {}
    res = {}

    for imgId in range(len(ref_data)):
        caption_list_sel = []
        for i in range(5):
            lst = {}
            lst['caption'] = ref_data[imgId][i]
            lst['image_id'] = imgId
            lst['id'] = i
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        res[imgId] = [{'caption': results[imgId]}]
        
    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts  = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    
    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),  
        ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...'%(scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
        else:
            score_dict[method] = score
            
    return score_dict


In [24]:
def evaluate_results(
    test_img_features, 
    model,
    ref,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    # generate results
    print('Generating captions...')
    results = {}
    for n in range(len(test_img_features)):
        img_features = test_img_features[n]
        generated = generateCaption(
            model, 
            img_features,
            max_length,
            vocab_size,
            wordtoidx,
            idxtoword
        )
        results[n] = generated
        
    model_score = eval_model(ref, results)

    return model_score

### Cross validation

In [25]:
cnn_type = 'inception_v3'
encoder = CNNModel(pretrained=True)
encoder.to(device)

CNNModel(
  (model): Sequential(
    (0): BasicConv2d(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicConv2d(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): BasicConv2d(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): BasicConv2d(
      (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (5): BasicConv2d(
      (conv): Conv2d(80, 192, kernel_size=(3, 3), stride=(1, 1

In [26]:
print(f'{len(train_paths)} images for training and {len(test_paths)} images for testing.')

vocab = get_vocab(train_descriptions, word_count_threshold=10)
idxtoword, wordtoidx = get_word_dict(vocab)
vocab_size = get_vocab_size(idxtoword)
embedding_dim = 500
embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

print(f'Preparing dataloader...')
train_img_features, test_img_features, sydney_img_features = get_train_test(encoder, train_paths, test_paths, sydney_paths)

train_loader = get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=1000
)

print(f'Training...')
caption_model = train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix
)


10416 images for training and 2605 images for testing.
There are 52080 captions
preprocessed words 2917 ==> 991
The vocabulary size is 992.
887 out of 992 words are found in the pre-trained matrix.
The size of embedding_matrix is (992, 500)
Preparing dataloader...

Generating set took: 0:04:04.58

Generating set took: 0:01:00.61


  0%|          | 0/70 [00:00<?, ?it/s]


Generating set took: 0:00:14.38
Training...


  1%|▏         | 1/70 [00:01<01:12,  1.05s/it]

6.144771792671897


  3%|▎         | 2/70 [00:02<01:11,  1.05s/it]

4.717282338575884


  4%|▍         | 3/70 [00:03<01:11,  1.07s/it]

4.155626036904075


  6%|▌         | 4/70 [00:04<01:10,  1.07s/it]

3.489454312758012


  7%|▋         | 5/70 [00:05<01:09,  1.07s/it]

2.9226139241998847


  9%|▊         | 6/70 [00:06<01:08,  1.07s/it]

2.5616491491144355


 10%|█         | 7/70 [00:07<01:07,  1.06s/it]

2.2833782109347256


 11%|█▏        | 8/70 [00:08<01:05,  1.06s/it]

2.080265088514848


 13%|█▎        | 9/70 [00:09<01:05,  1.07s/it]

1.9154169234362515


 14%|█▍        | 10/70 [00:10<01:03,  1.06s/it]

1.7813363725488836


 16%|█▌        | 11/70 [00:11<01:02,  1.06s/it]

1.6867456978017634


 17%|█▋        | 12/70 [00:12<01:01,  1.05s/it]

1.5684838403354993


 19%|█▊        | 13/70 [00:13<00:57,  1.01s/it]

1.4671277891505847


 20%|██        | 14/70 [00:14<00:54,  1.03it/s]

1.3899084004488858


 21%|██▏       | 15/70 [00:15<00:51,  1.07it/s]

1.318784074349837


 23%|██▎       | 16/70 [00:16<00:49,  1.08it/s]

1.2536017352884465


 24%|██▍       | 17/70 [00:17<00:47,  1.11it/s]

1.2146759954365818


 26%|██▌       | 18/70 [00:18<00:46,  1.11it/s]

1.1883321458643132


 27%|██▋       | 19/70 [00:18<00:45,  1.12it/s]

1.1444437503814697


 29%|██▊       | 20/70 [00:19<00:44,  1.13it/s]

1.1011364622549578


 30%|███       | 21/70 [00:20<00:43,  1.14it/s]

1.0729825063185259


 31%|███▏      | 22/70 [00:21<00:42,  1.14it/s]

1.033360773866827


 33%|███▎      | 23/70 [00:22<00:43,  1.08it/s]

1.0014315084977583


 34%|███▍      | 24/70 [00:23<00:42,  1.08it/s]

0.9732392755421725


 36%|███▌      | 25/70 [00:24<00:41,  1.08it/s]

0.9526470032605258


 37%|███▋      | 26/70 [00:25<00:42,  1.05it/s]

0.9202336668968201


 39%|███▊      | 27/70 [00:26<00:41,  1.02it/s]

0.8834622610699047


 40%|████      | 28/70 [00:27<00:41,  1.01it/s]

0.8540808124975725


 41%|████▏     | 29/70 [00:28<00:41,  1.00s/it]

0.8327025391838767


 43%|████▎     | 30/70 [00:29<00:40,  1.01s/it]

0.8062650723890825


 44%|████▍     | 31/70 [00:30<00:39,  1.02s/it]

0.7919392639940436


 46%|████▌     | 32/70 [00:31<00:39,  1.04s/it]

0.7730204354632985


 47%|████▋     | 33/70 [00:32<00:38,  1.04s/it]

0.7699098153547808


 49%|████▊     | 34/70 [00:33<00:37,  1.05s/it]

0.748785522851077


 50%|█████     | 35/70 [00:34<00:36,  1.04s/it]

0.73697260834954


 51%|█████▏    | 36/70 [00:35<00:35,  1.04s/it]

0.7207985357804731


 53%|█████▎    | 37/70 [00:36<00:34,  1.05s/it]

0.7100134166804227


 54%|█████▍    | 38/70 [00:37<00:33,  1.06s/it]

0.7067332430319353


 56%|█████▌    | 39/70 [00:39<00:33,  1.06s/it]

0.7017256698825143


 57%|█████▋    | 40/70 [00:40<00:31,  1.06s/it]

0.6803824549371545


 59%|█████▊    | 41/70 [00:41<00:30,  1.06s/it]

0.6645869395949624


 60%|██████    | 42/70 [00:42<00:29,  1.07s/it]

0.6530081331729889


 61%|██████▏   | 43/70 [00:43<00:28,  1.06s/it]

0.6402730562470176


 63%|██████▎   | 44/70 [00:44<00:27,  1.06s/it]

0.6376361467621543


 64%|██████▍   | 45/70 [00:45<00:26,  1.04s/it]

0.6211198757995259


 66%|██████▌   | 46/70 [00:46<00:23,  1.00it/s]

0.61204240267927


 67%|██████▋   | 47/70 [00:47<00:22,  1.03it/s]

0.6109416647390886


 69%|██████▊   | 48/70 [00:48<00:21,  1.02it/s]

0.6186069737781178


 70%|███████   | 49/70 [00:49<00:20,  1.05it/s]

0.6243420161984183


 71%|███████▏  | 50/70 [00:49<00:18,  1.06it/s]

0.6201976293867285


 73%|███████▎  | 51/70 [00:50<00:17,  1.09it/s]

0.6146498674696143


 74%|███████▍  | 52/70 [00:51<00:16,  1.10it/s]

0.601224498315291


 76%|███████▌  | 53/70 [00:52<00:15,  1.11it/s]

0.5993887456980619


 77%|███████▋  | 54/70 [00:53<00:14,  1.12it/s]

0.6036182343959808


 79%|███████▊  | 55/70 [00:54<00:13,  1.12it/s]

0.6002743488008325


 80%|████████  | 56/70 [00:55<00:12,  1.12it/s]

0.5983415462753989


 81%|████████▏ | 57/70 [00:56<00:11,  1.12it/s]

0.5980363379825245


 83%|████████▎ | 58/70 [00:57<00:10,  1.13it/s]

0.6029578826644204


 84%|████████▍ | 59/70 [00:57<00:09,  1.13it/s]

0.6091209839690815


 86%|████████▌ | 60/70 [00:58<00:08,  1.13it/s]

0.5928441882133484


 87%|████████▋ | 61/70 [00:59<00:07,  1.13it/s]

0.575571290471337


 89%|████████▊ | 62/70 [01:00<00:07,  1.12it/s]

0.5637543580748818


 90%|█████████ | 63/70 [01:01<00:06,  1.10it/s]

0.5574734021316875


 91%|█████████▏| 64/70 [01:02<00:05,  1.11it/s]

0.5589107735590502


 93%|█████████▎| 65/70 [01:03<00:04,  1.12it/s]

0.559681464325298


 94%|█████████▍| 66/70 [01:04<00:03,  1.13it/s]

0.5563299005681818


 96%|█████████▌| 67/70 [01:05<00:02,  1.12it/s]

0.5500753792849454


 97%|█████████▋| 68/70 [01:05<00:01,  1.12it/s]

0.5534602403640747


 99%|█████████▊| 69/70 [01:06<00:00,  1.11it/s]

0.5585416149009358


100%|██████████| 70/70 [01:07<00:00,  1.03it/s]
  0%|          | 0/70 [00:00<?, ?it/s]

0.5721765025095507


  1%|▏         | 1/70 [00:00<01:01,  1.12it/s]

0.5470661954446272


  3%|▎         | 2/70 [00:01<01:03,  1.07it/s]

0.5202251239256426


  4%|▍         | 3/70 [00:02<01:01,  1.09it/s]

0.49822205846959894


  6%|▌         | 4/70 [00:03<01:00,  1.10it/s]

0.4847752939571034


  7%|▋         | 5/70 [00:04<00:58,  1.11it/s]

0.47925490411845123


  9%|▊         | 6/70 [00:05<00:57,  1.11it/s]

0.4713866114616394


 10%|█         | 7/70 [00:06<00:56,  1.11it/s]

0.46560306440700183


 11%|█▏        | 8/70 [00:07<00:55,  1.12it/s]

0.46526656367562036


 13%|█▎        | 9/70 [00:08<00:54,  1.12it/s]

0.46139034087007696


 14%|█▍        | 10/70 [00:09<00:53,  1.13it/s]

0.4610845717516812


 16%|█▌        | 11/70 [00:09<00:52,  1.13it/s]

0.4565394737503745


 17%|█▋        | 12/70 [00:10<00:51,  1.13it/s]

0.4561064432967793


 19%|█▊        | 13/70 [00:11<00:50,  1.14it/s]

0.4518809752030806


 20%|██        | 14/70 [00:12<00:49,  1.14it/s]

0.45184834978797217


 21%|██▏       | 15/70 [00:13<00:47,  1.15it/s]

0.45047379352829675


 23%|██▎       | 16/70 [00:14<00:47,  1.14it/s]

0.4500622803514654


 24%|██▍       | 17/70 [00:15<00:47,  1.13it/s]

0.44527109644629737


 26%|██▌       | 18/70 [00:16<00:46,  1.12it/s]

0.448048624125394


 27%|██▋       | 19/70 [00:16<00:45,  1.13it/s]

0.4465174133127386


 29%|██▊       | 20/70 [00:17<00:44,  1.12it/s]

0.4437610317360271


 30%|███       | 21/70 [00:18<00:43,  1.11it/s]

0.443713364276019


 31%|███▏      | 22/70 [00:19<00:43,  1.11it/s]

0.4412738247351213


 33%|███▎      | 23/70 [00:20<00:42,  1.11it/s]

0.4396645264192061


 34%|███▍      | 24/70 [00:21<00:41,  1.11it/s]

0.4396377070383592


 36%|███▌      | 25/70 [00:22<00:40,  1.12it/s]

0.4394603371620178


 37%|███▋      | 26/70 [00:23<00:39,  1.12it/s]

0.4390508830547333


 39%|███▊      | 27/70 [00:24<00:40,  1.06it/s]

0.4377767254005779


 40%|████      | 28/70 [00:25<00:38,  1.08it/s]

0.4369401227344166


 41%|████▏     | 29/70 [00:26<00:37,  1.09it/s]

0.4379611855203455


 43%|████▎     | 30/70 [00:26<00:36,  1.11it/s]

0.43643712455576117


 44%|████▍     | 31/70 [00:27<00:34,  1.11it/s]

0.43282700939611957


 46%|████▌     | 32/70 [00:28<00:33,  1.12it/s]

0.43414284695278516


 47%|████▋     | 33/70 [00:29<00:32,  1.13it/s]

0.43204220858487213


 49%|████▊     | 34/70 [00:30<00:32,  1.12it/s]

0.43033877286044037


 50%|█████     | 35/70 [00:31<00:31,  1.13it/s]

0.4303931350057775


 51%|█████▏    | 36/70 [00:32<00:30,  1.13it/s]

0.43093873154033313


 53%|█████▎    | 37/70 [00:33<00:29,  1.14it/s]

0.4287322922186418


 54%|█████▍    | 38/70 [00:33<00:28,  1.14it/s]

0.430727327411825


 56%|█████▌    | 39/70 [00:34<00:27,  1.13it/s]

0.4289875707843087


 57%|█████▋    | 40/70 [00:35<00:26,  1.13it/s]

0.4285855997692455


 59%|█████▊    | 41/70 [00:36<00:25,  1.14it/s]

0.42609749057076196


 60%|██████    | 42/70 [00:37<00:24,  1.13it/s]

0.42725886539979413


 61%|██████▏   | 43/70 [00:38<00:23,  1.13it/s]

0.4273653599348935


 63%|██████▎   | 44/70 [00:39<00:23,  1.12it/s]

0.42622183940627356


 64%|██████▍   | 45/70 [00:40<00:22,  1.11it/s]

0.4262649796225808


 66%|██████▌   | 46/70 [00:41<00:21,  1.12it/s]

0.4259917302565141


 67%|██████▋   | 47/70 [00:42<00:20,  1.12it/s]

0.42583098194815894


 69%|██████▊   | 48/70 [00:42<00:19,  1.12it/s]

0.4252874634482644


 70%|███████   | 49/70 [00:43<00:18,  1.13it/s]

0.42408566312356427


 71%|███████▏  | 50/70 [00:44<00:17,  1.13it/s]

0.42264879833568225


 73%|███████▎  | 51/70 [00:45<00:16,  1.13it/s]

0.42145548354495654


 74%|███████▍  | 52/70 [00:46<00:16,  1.08it/s]

0.422636709430001


 76%|███████▌  | 53/70 [00:47<00:15,  1.09it/s]

0.41944586959752167


 77%|███████▋  | 54/70 [00:48<00:14,  1.10it/s]

0.4181920317086307


 79%|███████▊  | 55/70 [00:49<00:13,  1.11it/s]

0.41972836581143463


 80%|████████  | 56/70 [00:50<00:12,  1.11it/s]

0.41913180730559607


 81%|████████▏ | 57/70 [00:51<00:11,  1.11it/s]

0.4185489741238681


 83%|████████▎ | 58/70 [00:51<00:10,  1.11it/s]

0.4192814393477006


 84%|████████▍ | 59/70 [00:52<00:09,  1.11it/s]

0.41767676039175555


 86%|████████▌ | 60/70 [00:53<00:08,  1.12it/s]

0.4164554368365895


 87%|████████▋ | 61/70 [00:54<00:08,  1.12it/s]

0.4173214327205311


 89%|████████▊ | 62/70 [00:55<00:07,  1.13it/s]

0.4168898409063166


 90%|█████████ | 63/70 [00:56<00:06,  1.12it/s]

0.4155697470361536


 91%|█████████▏| 64/70 [00:57<00:05,  1.13it/s]

0.4147597442973744


 93%|█████████▎| 65/70 [00:58<00:04,  1.14it/s]

0.4154845963824879


 94%|█████████▍| 66/70 [00:58<00:03,  1.15it/s]

0.41598619385199115


 96%|█████████▌| 67/70 [00:59<00:02,  1.14it/s]

0.4143033677881414


 97%|█████████▋| 68/70 [01:00<00:01,  1.13it/s]

0.4137286327101968


 99%|█████████▊| 69/70 [01:01<00:00,  1.13it/s]

0.4143595966425809


100%|██████████| 70/70 [01:02<00:00,  1.12it/s]

0.4128087271343578





In [27]:
model_score = {}

In [28]:
model_score['test'] = evaluate_results(
    test_img_features, 
    caption_model,
    test_descriptions,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
)

Generating captions...
tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [29]:
model_score['sydney'] = evaluate_results(
    sydney_img_features, 
    caption_model,
    sydney_descriptions,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
)

Generating captions...
tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [30]:
model_score

{'test': {'Bleu_1': 0.5699351500761696,
  'Bleu_2': 0.4360947622577588,
  'Bleu_3': 0.3531366052321552,
  'Bleu_4': 0.29596348044732085,
  'METEOR': 0.2582235083124929,
  'ROUGE_L': 0.4774790794067372,
  'CIDEr': 1.5831119258416706,
  'SPICE': 0.33208785339507374,
  'USC_similarity': 0.559002346442351},
 'sydney': {'Bleu_1': 0.4636139942541719,
  'Bleu_2': 0.22640906404408656,
  'Bleu_3': 0.11403962941980865,
  'Bleu_4': 0.06935513243457564,
  'METEOR': 0.14958503233612616,
  'ROUGE_L': 0.29683147300005247,
  'CIDEr': 0.1694212809754293,
  'SPICE': 0.11322756227632065,
  'USC_similarity': 0.46786322871900693}}

In [31]:
tag = '9.3.2'
with open(f'{root_captioning}/fz_notebooks/final_results_n{tag}.json', 'w') as fp:
    json.dump(model_score, fp)