## Image Captioning with Pytorch

The following contents are modified from MDS DSCI 575 lecture 8 demo

In [1]:
import os, sys, json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.model_selection import KFold

sys.path.append('../../scr/evaluation')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.usc_sim.usc_sim import usc_sim
import subprocess


START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True


In [2]:
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"
        
if AWS:
    root_captioning = "../../data"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [5]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [6]:
# get img path and caption list
# # only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
valid_paths, valid_descriptions, max_length_valid = get_img_info('valid')
test_paths, test_descriptions, max_length_test = get_img_info('test')
sydney_paths, sydney_descriptions, max_length_sydney = get_img_info('sydney')


In [7]:
train_paths.extend(valid_paths.copy())
train_paths = np.array(train_paths)

train_descriptions.extend(valid_descriptions.copy())
train_descriptions = np.array(train_descriptions)

captions = train_descriptions.copy()
max_length_all = max(max_length_train, max_length_valid)
max_length = max_length_all + 2
      
lex = set()
for sen in train_descriptions:
    [lex.update(d.split()) for d in sen]
    
# add a start and stop token at the beginning/end
for v in train_descriptions:
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'
        
print(f'There are {len(train_paths)} images for training') 
print(f'There are {len(lex)} unique words (vocab)')
print(f'The maximum length of captions with start and stop token is {max_length}.')
print(f'The maximum length of captions with start and stop token in test is {max_length_test}.')
print(f'The maximum length of captions with start and stop token in the sydney dataset is {max_length_sydney}.')


There are 10416 images for training
There are 2912 unique words (vocab)
The maximum length of captions with start and stop token is 36.
The maximum length of captions with start and stop token in test is 30.
The maximum length of captions with start and stop token in the sydney dataset is 20.


In [8]:
train_paths[-1]

'../../s3/valid/rsicd_park_33.jpg'

In [9]:
train_descriptions[-1]

array(['startseq a vast artificial lake was built in the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq'],
      dtype='<U184')

### Loading Wikipedia2vec Embeddings

In [10]:
# read the embedding matrix 
with open(f'{root_captioning}/enwiki_20180420_2338_words_500d.json', 'r', encoding='utf-8') as file:
    embeddings_index = json.load(file)

In [11]:
def get_vocab(descriptions, word_count_threshold=10):

    captions = []
    for val in descriptions:
        for cap in val:
            captions.append(cap)
    print(f'There are {len(captions)} captions')
    
    word_counts = {}
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
    return vocab

def get_word_dict(vocab):
    
    idxtoword = {}
    wordtoidx = {}

    ix = 1
    for w in vocab:
        wordtoidx[w] = ix
        idxtoword[ix] = w
        ix += 1

    return idxtoword, wordtoidx

def get_vocab_size(idxtoword):
    
    print(f'The vocabulary size is {len(idxtoword) + 1}.')
    return len(idxtoword) + 1


def get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx):

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count =0

    for word, i in wordtoidx.items():

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            count += 1
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
            
    print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')            
    print(f'The size of embedding_matrix is {embedding_matrix.shape}')
    return embedding_matrix

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
class CNNModel(nn.Module):

    def __init__(self, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        # inception v3 expects (299, 299) sized images
        self.model = models.inception_v3(pretrained=pretrained, aux_logits=False)
        # remove the classification layer
        self.model =\
        nn.Sequential(
            *(list(self.model.children())[: 3]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[3: 5]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[5: -1])
        )

        self.input_size = 299

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
          # set the model to evaluation model
          self.model.eval()

        # N x 3 x 299 x 299
        features = self.model(img_input)
        # N x 2048 x 8 x 8

        return features

In [14]:
class AttentionModel(nn.Module):

    def __init__(self, feature_size, hidden_size=256):
        """
        Initializes a AttentionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(AttentionModel, self).__init__()

        self.W_a = nn.Linear(hidden_size, hidden_size)
        self.v_a = nn.Parameter(torch.rand(hidden_size))

    def forward(self, img_features, h):
        """
        forward of the AttentionModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """

        # N = batch_size
        batch_size = img_features.size(0)

        # 1 x N x hidden_size
        h_a = h.repeat(img_features.size(1), 1, 1).permute(1, 0, 2)
        # N x 64 x hidden_size

        # attention scoring function v_a(tanh(W_a[s;h]))
        # tanh(W_a[s;h])
        energy =\
        torch.tanh(
            self.W_a(
                h_a + img_features
            )
        ).permute(0, 2, 1)
        # N x hidden_size x 64

        # hidden_size
        v = self.v_a.repeat(batch_size, 1).unsqueeze(1)
        # N x 1 x hidden_size

        # v_a(tanh(W_a[s;h]))
        # torch.bmm takes 3D tensors
        attention = torch.bmm(v, energy)
        # N x 1 x 64

        attention_weights = F.softmax(attention, dim=2)
        # N x 1 x 64

        return attention_weights

In [15]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        feature_size,
        vocab_size,
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        feature_size: int
            the number of features in the image matrix
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()

        self.feature_size = feature_size
        self.hidden_size = hidden_size

        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()

        self.out_dense = nn.Linear(hidden_size, hidden_size)
        self.h_dense = nn.Linear(feature_size, hidden_size)
        self.c_dense = nn.Linear(feature_size, hidden_size)  
        self.img_dense = nn.Linear(feature_size, hidden_size)
        
        self.embedding =\
        nn.Embedding(vocab_size,embedding_dim, padding_idx=0)

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
                'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train

        self.attention =\
        AttentionModel(feature_size, hidden_size)
        
        self.lstm =\
        nn.LSTM(hidden_size, hidden_size, batch_first=True)
      

    def forward(self, img_features, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        img_features: torch.Tensor 
            the image feature matrix
            (N x feature_size(2048) x 8 x 8)
        captions: torch.Tensor 
            the padded caption matrix
            (N x seq_len)

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # N = batch_size
        batch_size = captions.size(0)
        seq_len = captions.size(1)

        # N x feature_size(2048) x 8 x 8
        img_features =\
        img_features.view(
            batch_size, self.feature_size, -1
        ).permute(0, 2, 1)
        # N x 64 x feature_size(2048)

        # N x 64 x feature_size(2048)
        h = self.h_dense(img_features.mean(dim=1)).unsqueeze(0)
        c = self.c_dense(img_features.mean(dim=1)).unsqueeze(0)
        # 1 x N x hidden_size
       
        # N x feature_size(2048) x 8 x 8
        img_features =\
        self.relu(
            self.img_dense(
                self.dropout(
                    img_features
                )
            )
        )  
        # N x 64 x hidden_size

        # N x seq_len
        embed =\
        self.dropout(
            self.embedding(
                captions
            )
        )
        # N x seq_len x embedding_dim
        
        outputs =\
        torch.zeros(
            batch_size,
            seq_len, 
            self.hidden_size
        ).to(device)

        all_attention_weights =\
        torch.zeros(
            batch_size,
            seq_len, 
            img_features.shape[1]
        ).to(device)
        
        for i in range(seq_len):

            attention_weights = self.attention(img_features, h)
            # N x 1 x 64

            # weighted sum of img_features
            weighted = torch.bmm(attention_weights, img_features)
            # N x 1 x hidden_size

            output, (h, c) =\
            self.lstm(
                embed[:, i, :].unsqueeze(1) + weighted,
                (h, c)
            )
            # outputs: N x 1 x hidden_size
            # h: 1 x N x hidden_size
            # c: 1 x N x hidden_size

            output =\
            self.out_dense(
                output.squeeze(1) + weighted.squeeze(1) + embed[:, i, :]
            )
            # N x hidden_size
 
            outputs[:, i, :] = output.squeeze()
            all_attention_weights[:, i, :] = attention_weights.squeeze()

        return outputs, all_attention_weights



In [16]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        self.feature_size = 2048

        self.decoder = RNNModel(
            self.feature_size,
            vocab_size, 
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )

        self.relu = nn.ReLU()
        self.dense = nn.Linear(hidden_size, vocab_size) 

    # def forward(self, captions):
    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor 
            the image feature matrix
            (N x feature_size(2048) x 8 x 8)
        captions: torch.Tensor 
            the padded caption matrix
            (N x seq_len)

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        decoder_out, all_attention_weights = self.decoder(img_features, captions)

        # add up decoder outputs and image features
        outputs =\
        self.dense(
            self.relu(
                decoder_out
            )
        )

        return outputs, all_attention_weights

### Train the Neural Network

In [17]:
def train(model, iterator, optimizer, criterion, clip, vocab_size):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs, all_attention_weights = model(
            img_features.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        ) + ((1. - all_attention_weights.sum(dim=1)) ** 2).mean()
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [18]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
    ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        # return len(self.descriptions)
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img = self.imgs[idx // 5]
        # convert each word into a list of sequences.
        seq = [self.wordtoidx[word] for word 
               in self.descriptions[idx // 5][idx % 5].split(' ')
               if word in self.wordtoidx]
        # pad the sequence with 0 on the right side
        in_seq = np.pad(
            seq, 
            (0, max_length - len(seq)),
            mode='constant',
            constant_values=(0, 0)
            )

        return img, in_seq


In [19]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [20]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape)
    # Call model to extract the smaller feature set for the image.
    x = model(img.to(device), False) 
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.squeeze(x)
    return x

In [21]:
def extract_img_features(img_paths, model):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    start = time()
    img_features = []

    for image_path in img_paths:
        img_features.append(
            encode_image(model, image_path).cpu().data.numpy()
        )

    print(f"\nGenerating set took: {hms_string(time()-start)}")

    return img_features

In [22]:
def get_train_test(
    encoder,
    train_paths,
    test_paths,
    sydney_paths
):

    train_img_features = extract_img_features(
        train_paths,
        encoder
    )

    test_img_features = extract_img_features(
        test_paths,
        encoder
    )
    
    sydney_img_features = extract_img_features(
        sydney_paths,
        encoder
    )
    
    return train_img_features, test_img_features, sydney_img_features

def get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
):
    train_dataset = SampleDataset(
        train_descriptions,
        train_img_features,
        wordtoidx,
        max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size
    )
    
    return train_loader

def train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    hidden_size=256,
):

    caption_model = CaptionModel(
        vocab_size, 
        embedding_dim, 
        hidden_size=hidden_size,
        embedding_matrix=embedding_matrix, 
        embedding_train=True
    )

    init_weights(
        caption_model,
        embedding_pretrained=True
    )

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(
        caption_model.parameters(), 
        lr=0.01
    )

    clip = 1
    start = time()

    for i in tqdm(range(EPOCHS * 6)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)

    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 6)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)
    return caption_model

In [30]:
def generateCaption(
    model, 
    img_features,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    in_text = START

    for i in range(max_length):

        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = np.pad(sequence, (0, max_length - len(sequence)),
                          mode='constant', constant_values=(0, 0))
        model.eval()
        yhat, _ = model(
            torch.FloatTensor(img_features)\
            .view(-1, model.feature_size).to(device),
            torch.LongTensor(sequence).view(-1, max_length).to(device)
        )

        yhat = yhat.view(-1, vocab_size).argmax(1)
        word = idxtoword[yhat.cpu().data.numpy()[i]]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1 : -1]
    final = ' '.join(final)
    return final

### Evaluation

In [24]:

def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    """
    # download stanford nlp library
    subprocess.call(['../../scr/evaluation/get_stanford_models.sh'])
    
    # format the inputs
    gts = {}
    res = {}

    for imgId in range(len(ref_data)):
        caption_list_sel = []
        for i in range(5):
            lst = {}
            lst['caption'] = ref_data[imgId][i]
            lst['image_id'] = imgId
            lst['id'] = i
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        res[imgId] = [{'caption': results[imgId]}]
        
    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts  = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    
    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),  
        ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...'%(scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
        else:
            score_dict[method] = score
            
    return score_dict


In [25]:
def evaluate_results(
    test_img_features, 
    model,
    ref,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    # generate results
    print('Generating captions...')
    results = {}
    for n in range(len(test_img_features)):
        img_features = test_img_features[n]
        generated = generateCaption(
            model, 
            img_features,
            max_length,
            vocab_size,
            wordtoidx,
            idxtoword
        )
        results[n] = generated
        
    model_score = eval_model(ref, results)

    return model_score

### Cross validation

In [26]:
cnn_type = 'inception_v3'
encoder = CNNModel(pretrained=True)
encoder.to(device)

CNNModel(
  (model): Sequential(
    (0): BasicConv2d(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicConv2d(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): BasicConv2d(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): BasicConv2d(
      (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (5): BasicConv2d(
      (conv): Conv2d(80, 192, kernel_size=(3, 3), stride=(1, 1

In [27]:
print(f'{len(train_paths)} images for training and {len(test_paths)} images for testing.')

vocab = get_vocab(train_descriptions, word_count_threshold=10)
idxtoword, wordtoidx = get_word_dict(vocab)
vocab_size = get_vocab_size(idxtoword)
embedding_dim = 500
embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

print(f'Preparing dataloader...')
train_img_features, test_img_features, sydney_img_features = get_train_test(encoder, train_paths, test_paths, sydney_paths)

train_loader = get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=1000
)

print(f'Training...')
caption_model = train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    hidden_size=500
)


10416 images for training and 2605 images for testing.
There are 52080 captions
preprocessed words 2917 ==> 991
The vocabulary size is 992.
887 out of 992 words are found in the pre-trained matrix.
The size of embedding_matrix is (992, 500)
Preparing dataloader...

Generating set took: 0:04:15.45

Generating set took: 0:01:03.95


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:15.27
Training...


  2%|▏         | 1/60 [00:08<08:40,  8.82s/it]

11.531352866779674


  3%|▎         | 2/60 [00:17<08:31,  8.82s/it]

4.528029788624156


  5%|▌         | 3/60 [00:26<08:22,  8.81s/it]

3.572889197956432


  7%|▋         | 4/60 [00:35<08:13,  8.82s/it]

2.9836453741247


  8%|▊         | 5/60 [00:44<08:04,  8.81s/it]

2.675373770973899


 10%|█         | 6/60 [00:52<07:56,  8.82s/it]

2.4664069522510874


 12%|█▏        | 7/60 [01:01<07:47,  8.82s/it]

2.3015224066647617


 13%|█▎        | 8/60 [01:10<07:39,  8.83s/it]

2.168319507078691


 15%|█▌        | 9/60 [01:19<07:30,  8.83s/it]

2.052783662622625


 17%|█▋        | 10/60 [01:28<07:21,  8.84s/it]

1.973806077783758


 18%|█▊        | 11/60 [01:37<07:13,  8.85s/it]

1.9179563522338867


 20%|██        | 12/60 [01:45<07:04,  8.85s/it]

1.8412234024568037


 22%|██▏       | 13/60 [01:54<06:55,  8.85s/it]

1.8069313222711736


 23%|██▎       | 14/60 [02:03<06:46,  8.84s/it]

1.7543622580441562


 25%|██▌       | 15/60 [02:12<06:38,  8.85s/it]

1.6826051798733799


 27%|██▋       | 16/60 [02:21<06:29,  8.84s/it]

1.5980253978209062


 28%|██▊       | 17/60 [02:30<06:20,  8.84s/it]

1.540902083570307


 30%|███       | 18/60 [02:39<06:11,  8.85s/it]

1.4899693727493286


 32%|███▏      | 19/60 [02:47<06:02,  8.85s/it]

1.4433630271391436


 33%|███▎      | 20/60 [02:56<05:54,  8.85s/it]

1.417888717217879


 35%|███▌      | 21/60 [03:05<05:45,  8.86s/it]

1.4006921378048984


 37%|███▋      | 22/60 [03:14<05:37,  8.87s/it]

1.4047475511377507


 38%|███▊      | 23/60 [03:23<05:29,  8.91s/it]

1.3930240869522095


 40%|████      | 24/60 [03:32<05:20,  8.89s/it]

1.3627967617728494


 42%|████▏     | 25/60 [03:41<05:11,  8.89s/it]

1.3258016217838635


 43%|████▎     | 26/60 [03:50<05:02,  8.89s/it]

1.2869950532913208


 45%|████▌     | 27/60 [03:59<04:53,  8.89s/it]

1.2468342130834407


 47%|████▋     | 28/60 [04:07<04:44,  8.88s/it]

1.2250028360973706


 48%|████▊     | 29/60 [04:16<04:35,  8.87s/it]

1.2054418217052112


 50%|█████     | 30/60 [04:25<04:26,  8.87s/it]

1.19136643409729


 52%|█████▏    | 31/60 [04:34<04:17,  8.87s/it]

1.17823725938797


 53%|█████▎    | 32/60 [04:43<04:08,  8.88s/it]

1.1536333127455278


 55%|█████▌    | 33/60 [04:52<03:59,  8.87s/it]

1.1240471547300166


 57%|█████▋    | 34/60 [05:01<03:50,  8.88s/it]

1.075504113327373


 58%|█████▊    | 35/60 [05:10<03:42,  8.89s/it]

1.0418031324039807


 60%|██████    | 36/60 [05:18<03:33,  8.88s/it]

1.0257358876141636


 62%|██████▏   | 37/60 [05:27<03:24,  8.88s/it]

1.0180470618334683


 63%|██████▎   | 38/60 [05:36<03:15,  8.88s/it]

1.0085822885686702


 65%|██████▌   | 39/60 [05:45<03:06,  8.88s/it]

0.9988288066603921


 67%|██████▋   | 40/60 [05:54<02:57,  8.88s/it]

0.9927017905495383


 68%|██████▊   | 41/60 [06:03<02:48,  8.88s/it]

0.9848800464109941


 70%|███████   | 42/60 [06:12<02:39,  8.88s/it]

0.9808894991874695


 72%|███████▏  | 43/60 [06:21<02:30,  8.88s/it]

0.9830321127718146


 73%|███████▎  | 44/60 [06:29<02:21,  8.87s/it]

1.0029569755900989


 75%|███████▌  | 45/60 [06:38<02:13,  8.87s/it]

1.0026226206259294


 77%|███████▋  | 46/60 [06:47<02:04,  8.87s/it]

0.9846102107654918


 78%|███████▊  | 47/60 [06:56<01:55,  8.91s/it]

0.9483251788399436


 80%|████████  | 48/60 [07:05<01:46,  8.90s/it]

0.9140574552796104


 82%|████████▏ | 49/60 [07:14<01:37,  8.89s/it]

0.8869555538350885


 83%|████████▎ | 50/60 [07:23<01:28,  8.88s/it]

0.8736127289858732


 85%|████████▌ | 51/60 [07:32<01:19,  8.88s/it]

0.8662859472361478


 87%|████████▋ | 52/60 [07:41<01:11,  8.90s/it]

0.8699491999366067


 88%|████████▊ | 53/60 [07:50<01:02,  8.93s/it]

0.8738567720759999


 90%|█████████ | 54/60 [07:59<00:53,  8.93s/it]

0.8899632258848711


 92%|█████████▏| 55/60 [08:07<00:44,  8.93s/it]

0.899897808378393


 93%|█████████▎| 56/60 [08:16<00:35,  8.94s/it]

0.9058294025334445


 95%|█████████▌| 57/60 [08:25<00:26,  8.95s/it]

0.9054134867408059


 97%|█████████▋| 58/60 [08:34<00:17,  8.96s/it]

0.8752878904342651


 98%|█████████▊| 59/60 [08:43<00:08,  8.96s/it]

0.8500724597410723


100%|██████████| 60/60 [08:52<00:00,  8.88s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

0.8323357592929493


  2%|▏         | 1/60 [00:08<08:46,  8.92s/it]

0.8014971180395647


  3%|▎         | 2/60 [00:17<08:37,  8.92s/it]

0.7739911838011309


  5%|▌         | 3/60 [00:26<08:29,  8.93s/it]

0.7489076365124095


  7%|▋         | 4/60 [00:35<08:21,  8.95s/it]

0.7345530878413807


  8%|▊         | 5/60 [00:44<08:11,  8.94s/it]

0.7252400571649725


 10%|█         | 6/60 [00:53<08:04,  8.96s/it]

0.7173572670329701


 12%|█▏        | 7/60 [01:02<07:54,  8.95s/it]

0.7152429060502485


 13%|█▎        | 8/60 [01:11<07:44,  8.93s/it]

0.7096012017943643


 15%|█▌        | 9/60 [01:20<07:34,  8.91s/it]

0.7075597643852234


 17%|█▋        | 10/60 [01:29<07:24,  8.89s/it]

0.705385609106584


 18%|█▊        | 11/60 [01:38<07:17,  8.92s/it]

0.702137676152316


 20%|██        | 12/60 [01:47<07:07,  8.91s/it]

0.699524765664881


 22%|██▏       | 13/60 [01:56<06:59,  8.92s/it]

0.6981958042491566


 23%|██▎       | 14/60 [02:05<06:50,  8.93s/it]

0.6955294446511702


 25%|██▌       | 15/60 [02:13<06:41,  8.93s/it]

0.6961800726977262


 27%|██▋       | 16/60 [02:22<06:33,  8.95s/it]

0.6949228102510626


 28%|██▊       | 17/60 [02:31<06:24,  8.93s/it]

0.6908125606450167


 30%|███       | 18/60 [02:40<06:14,  8.92s/it]

0.6903375414284793


 32%|███▏      | 19/60 [02:49<06:05,  8.91s/it]

0.6918817054141652


 33%|███▎      | 20/60 [02:58<05:56,  8.90s/it]

0.6878514967181466


 35%|███▌      | 21/60 [03:07<05:47,  8.90s/it]

0.6869351809675043


 37%|███▋      | 22/60 [03:16<05:38,  8.90s/it]

0.6858197504823859


 38%|███▊      | 23/60 [03:25<05:28,  8.89s/it]

0.6841324838725004


 40%|████      | 24/60 [03:34<05:20,  8.89s/it]

0.6828777112744071


 42%|████▏     | 25/60 [03:42<05:11,  8.90s/it]

0.6826195473020727


 43%|████▎     | 26/60 [03:51<05:02,  8.90s/it]

0.6804685890674591


 45%|████▌     | 27/60 [04:00<04:53,  8.90s/it]

0.6787302629514174


 47%|████▋     | 28/60 [04:09<04:44,  8.90s/it]

0.6787790222601457


 48%|████▊     | 29/60 [04:18<04:36,  8.90s/it]

0.6776874444701455


 50%|█████     | 30/60 [04:27<04:27,  8.90s/it]

0.6781256361441179


 52%|█████▏    | 31/60 [04:36<04:18,  8.90s/it]

0.6776003647934307


 53%|█████▎    | 32/60 [04:45<04:08,  8.89s/it]

0.6745534430850636


 55%|█████▌    | 33/60 [04:54<04:00,  8.89s/it]

0.6754027821800925


 57%|█████▋    | 34/60 [05:03<03:51,  8.90s/it]

0.6758045852184296


 58%|█████▊    | 35/60 [05:11<03:42,  8.90s/it]

0.6744903894987974


 60%|██████    | 36/60 [05:21<03:34,  8.95s/it]

0.6720633886077187


 62%|██████▏   | 37/60 [05:29<03:25,  8.93s/it]

0.6719460378993641


 63%|██████▎   | 38/60 [05:38<03:16,  8.92s/it]

0.6704863797534596


 65%|██████▌   | 39/60 [05:47<03:06,  8.90s/it]

0.669807027686726


 67%|██████▋   | 40/60 [05:56<02:57,  8.90s/it]

0.6697277209975503


 68%|██████▊   | 41/60 [06:05<02:49,  8.90s/it]

0.6707493391903964


 70%|███████   | 42/60 [06:14<02:39,  8.89s/it]

0.6680589589205655


 72%|███████▏  | 43/60 [06:23<02:31,  8.89s/it]

0.6678056093779478


 73%|███████▎  | 44/60 [06:32<02:22,  8.89s/it]

0.6675813631577925


 75%|███████▌  | 45/60 [06:40<02:13,  8.88s/it]

0.6677047501910817


 77%|███████▋  | 46/60 [06:49<02:04,  8.88s/it]

0.6651784669269215


 78%|███████▊  | 47/60 [06:58<01:55,  8.89s/it]

0.6624934510751204


 80%|████████  | 48/60 [07:07<01:46,  8.88s/it]

0.6654316295276989


 82%|████████▏ | 49/60 [07:16<01:37,  8.89s/it]

0.6647650003433228


 83%|████████▎ | 50/60 [07:25<01:28,  8.89s/it]

0.6630112474614923


 85%|████████▌ | 51/60 [07:34<01:20,  8.91s/it]

0.6613883999260989


 87%|████████▋ | 52/60 [07:43<01:11,  8.93s/it]

0.6616353717717257


 88%|████████▊ | 53/60 [07:52<01:02,  8.94s/it]

0.6608300100673329


 90%|█████████ | 54/60 [08:01<00:53,  8.95s/it]

0.6609610481695696


 92%|█████████▏| 55/60 [08:10<00:44,  8.95s/it]

0.6587798974730752


 93%|█████████▎| 56/60 [08:19<00:35,  8.96s/it]

0.6600929281928323


 95%|█████████▌| 57/60 [08:28<00:26,  8.96s/it]

0.6584397662769664


 97%|█████████▋| 58/60 [08:37<00:17,  8.95s/it]

0.6579312898895957


 98%|█████████▊| 59/60 [08:46<00:08,  8.94s/it]

0.6578274856914174


100%|██████████| 60/60 [08:55<00:00,  8.92s/it]

0.6584045020016757





In [28]:
model_score = {}

In [31]:
model_score['test'] = evaluate_results(
    test_img_features, 
    caption_model,
    test_descriptions,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
)

Generating captions...
tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [32]:
model_score['sydney'] = evaluate_results(
    sydney_img_features, 
    caption_model,
    sydney_descriptions,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
)

Generating captions...
tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [33]:
model_score

{'test': {'Bleu_1': 0.5467581500036104,
  'Bleu_2': 0.4087663844689084,
  'Bleu_3': 0.32512121739139066,
  'Bleu_4': 0.26886051357006757,
  'METEOR': 0.23903028481314356,
  'ROUGE_L': 0.4564729589373808,
  'CIDEr': 1.3849905737930095,
  'SPICE': 0.29957667835204815,
  'USC_similarity': 0.5348150259856984},
 'sydney': {'Bleu_1': 0.44429736674300774,
  'Bleu_2': 0.20981761787255915,
  'Bleu_3': 0.10361200079840802,
  'Bleu_4': 0.05825901985648426,
  'METEOR': 0.14017621280548817,
  'ROUGE_L': 0.27955123258922676,
  'CIDEr': 0.1812447308749461,
  'SPICE': 0.12269731703994054,
  'USC_similarity': 0.4592353720102478}}

In [34]:
tag = '11.1.2'
with open(f'{root_captioning}/fz_notebooks/final_results_n{tag}.json', 'w') as fp:
    json.dump(model_score, fp)