## Image Captioning with Pytorch

The following contents are modified from MDS DSCI 575 lecture 8 demo

In [1]:
import os, sys, json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.model_selection import KFold

sys.path.append('../../scr/evaluation')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.usc_sim.usc_sim import usc_sim
import subprocess


START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True


In [2]:
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"
        
if AWS:
    root_captioning = "../../data"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [5]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [6]:
# get img path and caption list
# # only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
test_paths, test_descriptions, max_length_test = get_img_info('valid')
max_length = max(max_length_train, max_length_test)



In [7]:
all_paths = train_paths.copy()
all_paths.extend(test_paths.copy())
all_paths = np.array(all_paths)

all_descriptions = train_descriptions.copy()
all_descriptions.extend(test_descriptions.copy())
all_descriptions = np.array(all_descriptions)

captions = all_descriptions.copy()
max_length_all = max(max_length_train, max_length_test)
max_length = max_length_all + 2
      
lex = set()
for sen in all_descriptions:
    [lex.update(d.split()) for d in sen]
    
# add a start and stop token at the beginning/end
for v in all_descriptions:
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'
        
print(f'There are {len(all_paths)} images') 
print(f'There are {len(lex)} unique words (vocab)')
print(f'The maximum length of captions with start and stop token is {max_length}.')


There are 10416 images
There are 2912 unique words (vocab)
The maximum length of captions with start and stop token is 36.


In [8]:
all_paths[-1]

'../../s3/valid/rsicd_park_33.jpg'

In [9]:
all_descriptions[-1]

array(['startseq a vast artificial lake was built in the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq'],
      dtype='<U184')

### Loading Wikipedia2vec Embeddings

In [10]:
# read the embedding matrix 
with open(f'{root_captioning}/enwiki_20180420_2338_words_500d.json', 'r', encoding='utf-8') as file:
    embeddings_index = json.load(file)

In [11]:
def get_vocab(descriptions, word_count_threshold=10):

    captions = []
    for val in descriptions:
        for cap in val:
            captions.append(cap)
    print(f'There are {len(captions)} captions')
    
    word_counts = {}
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
    return vocab

def get_word_dict(vocab):
    
    idxtoword = {}
    wordtoidx = {}

    ix = 1
    for w in vocab:
        wordtoidx[w] = ix
        idxtoword[ix] = w
        ix += 1

    return idxtoword, wordtoidx

def get_vocab_size(idxtoword):
    
    print(f'The vocabulary size is {len(idxtoword) + 1}.')
    return len(idxtoword) + 1


def get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx):

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count = 0

    for word, i in wordtoidx.items():

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            count += 1
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
            
    print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')            
    print(f'The size of embedding_matrix is {embedding_matrix.shape}')
    return embedding_matrix

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
class CNNModel(nn.Module):

    def __init__(self, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        # inception v3 expects (299, 299) sized images
        self.model = models.inception_v3(pretrained=pretrained, aux_logits=False)
        # remove the classification layer
        self.model =\
        nn.Sequential(
            *(list(self.model.children())[: 3]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[3: 5]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[5: -1])
        )

        self.input_size = 299

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
          # set the model to evaluation model
          self.model.eval()

        # N x 3 x 299 x 299
        features = self.model(img_input)
        # N x 2048 x 8 x 8

        return features

In [14]:
class AttentionModel(nn.Module):

    def __init__(self, embedding_dim, hidden_size, atten_size=512):
        """
        Initializes a AttentionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(AttentionModel, self).__init__()

        self.dense1 = nn.Linear(hidden_size, atten_size)
        self.dense2 = nn.Linear(embedding_dim, atten_size)
        self.dense3 = nn.Linear(atten_size, 1)
        self.relu = nn.ReLU()

    def forward(self, img_features, h):
        """
        forward of the AttentionModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        # N x hidden_size
        h_a = self.dense1(h)
        # N x atten_size

        # N x 64 x embedding_dim
        img_a = self.dense2(img_features)  
        # N x 64 x atten_size

        attention =\
        self.dense3(
            self.relu(
                h_a.unsqueeze(1) + img_a
            )
        ).squeeze(2)
        # N x 64

        attention_weights = F.softmax(attention, dim=1)
        # N x 64

        return attention_weights

In [15]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        feature_size,
        vocab_size,
        embedding_dim, 
        hidden_size=256,
        atten_size=512,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        feature_size: int
            the number of features in the image matrix
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()

        self.feature_size = feature_size
        self.hidden_size = hidden_size

        self.embedding =\
        nn.Embedding(
            vocab_size,
            embedding_dim, 
            padding_idx=0
        )

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
                'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train



        self.attention =\
        AttentionModel(
            embedding_dim,
            hidden_size,
            atten_size
        )


        self.lstm =\
        nn.LSTMCell(
#             embedding_dim + embedding_dim, 
            embedding_dim,
            hidden_size, 
            bias=True
        )
        
        self.dense1 = nn.Linear(feature_size, embedding_dim)
        self.dense2 = nn.Linear(hidden_size, embedding_dim)
        self.dense3 = nn.Linear(feature_size, hidden_size)
        self.dense4 = nn.Linear(feature_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()

    def init_hidden(self, batch_size):
        x = torch.autograd.Variable(next(self.parameters()).data.new(batch_size, self.hidden_size))
        return x

    def forward(self, img_features, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        img_features: torch.Tensor 
            the image feature matrix
            (N x feature_size(2048) x 8 x 8)
        captions: torch.Tensor 
            the padded caption matrix
            (N x seq_len)

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # N = batch_size
        batch_size = captions.size(0)
        seq_len = captions.size(1)

        # N x feature_size(2048) x 8 x 8
        img_features = img_features.view(
            batch_size, self.feature_size, -1
        ).permute(0, 2, 1)
        # N x 64 x feature_size(2048)

        # h = self.init_hidden(batch_size).to(device)
        # c = self.init_hidden(batch_size).to(device)
        # # N x hidden_size

        # N x 64 x feature_size(2048)
        h = self.dense3(img_features.mean(dim=1))
        c = self.dense4(img_features.mean(dim=1))
        # N x hidden_size

        # N x seq_len
        embed = self.dropout(self.embedding(captions))
        # N x seq_len x embedding_dim     

        # N x 64 x feature_size(2048)
        img_features = self.dropout(self.relu(self.dense1(img_features)))
        # N x 64 x embedding_dim

        outputs =\
        torch.zeros(
            batch_size,
            seq_len, 
            self.hidden_size
        ).to(device)

        all_attention_weights =\
        torch.zeros(
            batch_size,
            seq_len, 
            img_features.shape[1]
        ).to(device)
        
        for i in range(seq_len):
            
            attention_weights = self.attention(img_features, h)
            # N x 64

            # weighted sum of img_features
            weighted = (img_features * attention_weights.unsqueeze(2)).sum(dim=1)
            # N x embedding_dim

            # gating scalar
            # N x hidden_size
            gate = self.sigmoid(self.dense2(h))
            # N x embedding_dim

            weighted = gate * weighted

            h, c =\
            self.lstm(
                embed[:, i, :] + weighted, 
               (h, c)
            )
            # h: N x hidden_size
            # c: N x hidden_size
 
            outputs[:, i, :] = h
            all_attention_weights[:, i, :] = attention_weights

        return outputs, all_attention_weights



In [16]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        atten_size=512,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        self.feature_size = 2048

        self.decoder = RNNModel(
            self.feature_size,
            vocab_size, 
            embedding_dim,
            hidden_size,
            atten_size,
            embedding_matrix,
            embedding_train
        )


        self.dropout = nn.Dropout(p=0.5)
        self.dense1 = nn.Linear(hidden_size, vocab_size) 
        # self.relu = nn.ReLU()
        # self.dense2 = nn.Linear(512, vocab_size) 

    # def forward(self, captions):
    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor 
            the image feature matrix
            (N x feature_size(2048) x 8 x 8)
        captions: torch.Tensor 
            the padded caption matrix
            (N x seq_len)

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """
    
        decoder_out, all_attention_weights = self.decoder(img_features, captions)

        # add up decoder outputs and image features
        outputs = self.dense1(decoder_out)

        return outputs, all_attention_weights

### Train the Neural Network

In [17]:
def train(model, iterator, optimizer, criterion, clip, vocab_size):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs, all_attention_weights = model(
            img_features.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        ) + ((1. - all_attention_weights.sum(dim=1)) ** 2).mean()
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [18]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
    ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        # return len(self.descriptions)
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img = self.imgs[idx // 5]
        # convert each word into a list of sequences.
        seq = [self.wordtoidx[word] for word 
               in self.descriptions[idx // 5][idx % 5].split(' ')
               if word in self.wordtoidx]
        # pad the sequence with 0 on the right side
        in_seq = np.pad(
            seq, 
            (0, max_length - len(seq)),
            mode='constant',
            constant_values=(0, 0)
            )

        return img, in_seq


In [19]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [20]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape)
    # Call model to extract the smaller feature set for the image.
    x = model(img.to(device), False) 
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.squeeze(x)
    return x

In [21]:
def extract_img_features(img_paths, model):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    start = time()
    img_features = []

    for image_path in tqdm(img_paths):
        img_features.append(
            encode_image(model, image_path).cpu().data.numpy()
        )

    print(f"\nGenerating set took: {hms_string(time()-start)}")

    return img_features

In [22]:
def get_train_test(
    encoder,
    train_paths,
    test_paths
):

    train_img_features = extract_img_features(
        train_paths,
        encoder
    )

    test_img_features = extract_img_features(
        test_paths,
        encoder
    )
    return train_img_features, test_img_features

def get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
):
    train_dataset = SampleDataset(
        train_descriptions,
        train_img_features,
        wordtoidx,
        max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size
    )
    
    return train_loader

def train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    hidden_size=256,
):

    caption_model = CaptionModel(
        vocab_size, 
        embedding_dim, 
        hidden_size=500,
        atten_size=256,
        embedding_matrix=embedding_matrix, 
        embedding_train=True
    )

    init_weights(
        caption_model,
        embedding_pretrained=True
    )

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(
        caption_model.parameters(), 
        lr=0.01
    )

    clip = 1
    start = time()

    for i in tqdm(range(EPOCHS * 3)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)

    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 5)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)
    return caption_model

In [23]:
def generateCaption(
    model, 
    img_features,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    in_text = START

    for i in range(max_length):

        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = np.pad(sequence, (0, max_length - len(sequence)),
                          mode='constant', constant_values=(0, 0))
        model.eval()
        yhat, _ = model(
            torch.FloatTensor(img_features)\
            .view(-1, model.feature_size).to(device),
            torch.LongTensor(sequence).view(-1, max_length).to(device)
        )

        yhat = yhat.view(-1, vocab_size).argmax(1)
        word = idxtoword[yhat.cpu().data.numpy()[i]]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1 : -1]
    final = ' '.join(final)
    return final

### Evaluation

In [24]:
def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    """
    # download stanford nlp library
    subprocess.call(['../../scr/evaluation/get_stanford_models.sh'])
    
    # format the inputs
    gts = {}
    res = {}

    for imgId in range(len(ref_data)):
        caption_list_sel = []
        for i in range(5):
            lst = {}
            lst['caption'] = ref_data[imgId][i]
            lst['image_id'] = imgId
            lst['id'] = i
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        res[imgId] = [{'caption': results[imgId]}]
        
    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts  = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    
    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),  
        ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...'%(scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
        else:
            score_dict[method] = score
            
    return score_dict


In [25]:
def evaluate_results(
    test_img_features, 
    model,
    ref,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    # generate results
    print('Generating captions...')
    results = {}
    for n in range(len(test_img_features)):
        img_features = test_img_features[n]
        generated = generateCaption(
            model, 
            img_features,
            max_length,
            vocab_size,
            wordtoidx,
            idxtoword
        )
        results[n] = generated
        
    model_score = eval_model(ref, results)

    return model_score

### Cross validation

In [26]:
cnn_type = 'inception_v3'
encoder = CNNModel(pretrained=True)
encoder.to(device)

CNNModel(
  (model): Sequential(
    (0): BasicConv2d(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicConv2d(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): BasicConv2d(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): BasicConv2d(
      (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (5): BasicConv2d(
      (conv): Conv2d(80, 192, kernel_size=(3, 3), stride=(1, 1

In [27]:
def cross_validation(train_index, test_index, count):
    print('=' * 60)
    print(f'Split {count}:')
    print(f'Splitting data...')
    
    train_paths, test_paths = all_paths[train_index], all_paths[test_index]
    train_descriptions, test_descriptions = all_descriptions[train_index], all_descriptions[test_index]
    print(f'{len(train_paths)} images for training and {len(test_paths)} images for testing.')
    
    vocab = get_vocab(train_descriptions, word_count_threshold=10)
    idxtoword, wordtoidx = get_word_dict(vocab)
    vocab_size = get_vocab_size(idxtoword)
    embedding_dim = 500
    embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

    print(f'Preparing dataloader...')
    train_img_features, test_img_features = get_train_test(encoder, train_paths, test_paths)

    train_loader = get_train_dataloader(
        train_descriptions, 
        train_img_features,
        wordtoidx,
        max_length,
        batch_size=1000
    )

    print(f'Training...')
    caption_model = train_model(
        train_loader,
        vocab_size,
        embedding_dim, 
        embedding_matrix,
        hidden_size=500
    )

    
    ref = captions[test_index]
    model_score = evaluate_results(
        test_img_features, 
        caption_model,
        ref,
        max_length,
        vocab_size,
        wordtoidx,
        idxtoword
    )
    
    return caption_model, model_score

In [28]:
cv = KFold(n_splits=5, random_state=123, shuffle=True)
cv = [(train_index, test_index) for train_index, test_index in cv.split(all_paths)]  

In [29]:
caption_model1, model_score1 = cross_validation(cv[0][0], cv[0][1], 1)    

Split 1:
Splitting data...
8332 images for training and 2084 images for testing.
There are 41660 captions


  0%|          | 0/8332 [00:00<?, ?it/s]

preprocessed words 2659 ==> 884
The vocabulary size is 885.
793 out of 885 words are found in the pre-trained matrix.
The size of embedding_matrix is (885, 500)
Preparing dataloader...


100%|██████████| 8332/8332 [03:38<00:00, 38.12it/s]
  0%|          | 4/2084 [00:00<00:52, 39.81it/s]


Generating set took: 0:03:38.55


100%|██████████| 2084/2084 [00:55<00:00, 37.54it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:00:55.51
Training...


  3%|▎         | 1/30 [00:05<02:29,  5.16s/it]

7.740195433298747


  7%|▋         | 2/30 [00:10<02:23,  5.11s/it]

4.56277863184611


 10%|█         | 3/30 [00:15<02:17,  5.09s/it]

3.950769239001804


 13%|█▎        | 4/30 [00:20<02:11,  5.07s/it]

3.0140631198883057


 17%|█▋        | 5/30 [00:25<02:06,  5.05s/it]

2.4975806872049966


 20%|██        | 6/30 [00:30<02:01,  5.04s/it]

2.1976337830225625


 23%|██▎       | 7/30 [00:35<01:55,  5.04s/it]

1.9837774965498183


 27%|██▋       | 8/30 [00:40<01:50,  5.03s/it]

1.8042560948265924


 30%|███       | 9/30 [00:45<01:45,  5.02s/it]

1.6653809150060017


 33%|███▎      | 10/30 [00:50<01:40,  5.02s/it]

1.5606728659735785


 37%|███▋      | 11/30 [00:55<01:35,  5.03s/it]

1.4731748236550226


 40%|████      | 12/30 [01:00<01:30,  5.03s/it]

1.4004019896189372


 43%|████▎     | 13/30 [01:05<01:25,  5.03s/it]

1.3314256535636053


 47%|████▋     | 14/30 [01:10<01:20,  5.03s/it]

1.2692345447010465


 50%|█████     | 15/30 [01:15<01:15,  5.02s/it]

1.2132419678899977


 53%|█████▎    | 16/30 [01:20<01:10,  5.02s/it]

1.1614938312106662


 57%|█████▋    | 17/30 [01:25<01:05,  5.02s/it]

1.1194183627764385


 60%|██████    | 18/30 [01:30<01:00,  5.03s/it]

1.073613550927904


 63%|██████▎   | 19/30 [01:35<00:55,  5.03s/it]

1.0272055466969807


 67%|██████▋   | 20/30 [01:40<00:50,  5.03s/it]

0.9943109220928616


 70%|███████   | 21/30 [01:45<00:45,  5.03s/it]

0.9713623523712158


 73%|███████▎  | 22/30 [01:50<00:40,  5.03s/it]

0.9472174644470215


 77%|███████▋  | 23/30 [01:55<00:35,  5.03s/it]

0.9249449504746331


 80%|████████  | 24/30 [02:00<00:30,  5.02s/it]

0.9080767499075996


 83%|████████▎ | 25/30 [02:05<00:25,  5.02s/it]

0.8871663941277398


 87%|████████▋ | 26/30 [02:10<00:20,  5.02s/it]

0.8685512211587694


 90%|█████████ | 27/30 [02:15<00:15,  5.03s/it]

0.8606519235504998


 93%|█████████▎| 28/30 [02:20<00:10,  5.02s/it]

0.8499305182033114


 97%|█████████▋| 29/30 [02:25<00:05,  5.02s/it]

0.8470937808354696


100%|██████████| 30/30 [02:30<00:00,  5.03s/it]
  0%|          | 0/50 [00:00<?, ?it/s]

0.8153672218322754


  2%|▏         | 1/50 [00:05<04:05,  5.02s/it]

0.7792844573656718


  4%|▍         | 2/50 [00:10<04:01,  5.02s/it]

0.7619901100794474


  6%|▌         | 3/50 [00:15<03:56,  5.02s/it]

0.7458620468775431


  8%|▊         | 4/50 [00:20<03:51,  5.02s/it]

0.735110859076182


 10%|█         | 5/50 [00:25<03:45,  5.02s/it]

0.7279557320806715


 12%|█▏        | 6/50 [00:30<03:40,  5.01s/it]

0.7241767512427436


 14%|█▍        | 7/50 [00:35<03:35,  5.02s/it]

0.7196984026167128


 16%|█▌        | 8/50 [00:40<03:32,  5.06s/it]

0.7160657379362318


 18%|█▊        | 9/50 [00:45<03:26,  5.04s/it]

0.7135839329825507


 20%|██        | 10/50 [00:50<03:21,  5.04s/it]

0.7124939560890198


 22%|██▏       | 11/50 [00:55<03:16,  5.03s/it]

0.7107047438621521


 24%|██▍       | 12/50 [01:00<03:11,  5.03s/it]

0.7069906062550015


 26%|██▌       | 13/50 [01:05<03:06,  5.04s/it]

0.7073462870385911


 28%|██▊       | 14/50 [01:10<03:01,  5.04s/it]

0.705064038435618


 30%|███       | 15/50 [01:15<02:56,  5.03s/it]

0.7048170765240988


 32%|███▏      | 16/50 [01:20<02:51,  5.03s/it]

0.7029977507061429


 34%|███▍      | 17/50 [01:25<02:45,  5.03s/it]

0.7005718946456909


 36%|███▌      | 18/50 [01:30<02:40,  5.03s/it]

0.7005954186121622


 38%|███▊      | 19/50 [01:35<02:35,  5.03s/it]

0.6998857723342048


 40%|████      | 20/50 [01:40<02:30,  5.03s/it]

0.697802311844296


 42%|████▏     | 21/50 [01:45<02:25,  5.02s/it]

0.6972567174169753


 44%|████▍     | 22/50 [01:50<02:20,  5.02s/it]

0.6959434747695923


 46%|████▌     | 23/50 [01:55<02:15,  5.03s/it]

0.6957001884778341


 48%|████▊     | 24/50 [02:00<02:10,  5.03s/it]

0.6944442192713419


 50%|█████     | 25/50 [02:05<02:05,  5.02s/it]

0.6936878032154508


 52%|█████▏    | 26/50 [02:10<02:00,  5.03s/it]

0.6931205656793382


 54%|█████▍    | 27/50 [02:15<01:55,  5.02s/it]

0.6919044256210327


 56%|█████▌    | 28/50 [02:20<01:50,  5.02s/it]

0.6900553570853339


 58%|█████▊    | 29/50 [02:25<01:45,  5.03s/it]

0.690370606051551


 60%|██████    | 30/50 [02:30<01:40,  5.03s/it]

0.689636402659946


 62%|██████▏   | 31/50 [02:35<01:36,  5.06s/it]

0.6884917219479879


 64%|██████▍   | 32/50 [02:41<01:31,  5.06s/it]

0.6875952018631829


 66%|██████▌   | 33/50 [02:46<01:25,  5.05s/it]

0.6869271265135871


 68%|██████▊   | 34/50 [02:51<01:20,  5.04s/it]

0.6864052613576254


 70%|███████   | 35/50 [02:56<01:15,  5.04s/it]

0.6865807705455356


 72%|███████▏  | 36/50 [03:01<01:10,  5.07s/it]

0.6861946251657274


 74%|███████▍  | 37/50 [03:06<01:05,  5.06s/it]

0.6844546066390144


 76%|███████▌  | 38/50 [03:11<01:00,  5.05s/it]

0.6828328106138442


 78%|███████▊  | 39/50 [03:16<00:55,  5.04s/it]

0.6821036140124003


 80%|████████  | 40/50 [03:21<00:50,  5.04s/it]

0.6825654771592882


 82%|████████▏ | 41/50 [03:26<00:45,  5.03s/it]

0.6816604402330186


 84%|████████▍ | 42/50 [03:31<00:40,  5.03s/it]

0.6812281277444627


 86%|████████▌ | 43/50 [03:36<00:35,  5.02s/it]

0.6807054148779975


 88%|████████▊ | 44/50 [03:41<00:30,  5.04s/it]

0.6804267830318875


 90%|█████████ | 45/50 [03:46<00:25,  5.03s/it]

0.679461936155955


 92%|█████████▏| 46/50 [03:51<00:20,  5.03s/it]

0.6791939602957832


 94%|█████████▍| 47/50 [03:56<00:15,  5.04s/it]

0.6779555545912849


 96%|█████████▌| 48/50 [04:01<00:10,  5.03s/it]

0.6786816186375089


 98%|█████████▊| 49/50 [04:06<00:05,  5.03s/it]

0.6769397854804993


100%|██████████| 50/50 [04:11<00:00,  5.03s/it]

0.6756658222940233
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [30]:
model_score1

{'Bleu_1': 0.5420602587634623,
 'Bleu_2': 0.3944384687544192,
 'Bleu_3': 0.3076348082784494,
 'Bleu_4': 0.2505738856582193,
 'METEOR': 0.2305141226280994,
 'ROUGE_L': 0.44552871905314234,
 'CIDEr': 1.2912744435619947,
 'SPICE': 0.29100613954932025,
 'USC_similarity': 0.516662359149365}

In [31]:
caption_model2, model_score2 = cross_validation(cv[1][0], cv[1][1], 2)    

Split 2:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 5/8333 [00:00<03:23, 40.90it/s]

preprocessed words 2688 ==> 916
The vocabulary size is 917.
819 out of 917 words are found in the pre-trained matrix.
The size of embedding_matrix is (917, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [03:26<00:00, 40.43it/s]
  0%|          | 5/2083 [00:00<00:50, 40.89it/s]


Generating set took: 0:03:26.10


100%|██████████| 2083/2083 [00:50<00:00, 40.91it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:00:50.92
Training...


  3%|▎         | 1/30 [00:05<02:30,  5.19s/it]

6.091214127010769


  7%|▋         | 2/30 [00:10<02:24,  5.16s/it]

4.459252940283881


 10%|█         | 3/30 [00:15<02:18,  5.14s/it]

3.343069871266683


 13%|█▎        | 4/30 [00:20<02:12,  5.11s/it]

2.6376596556769476


 17%|█▋        | 5/30 [00:25<02:07,  5.08s/it]

2.288652631971571


 20%|██        | 6/30 [00:30<02:01,  5.07s/it]

2.0574027034971447


 23%|██▎       | 7/30 [00:35<01:56,  5.06s/it]

1.8680434491899278


 27%|██▋       | 8/30 [00:40<01:51,  5.05s/it]

1.7162432670593262


 30%|███       | 9/30 [00:45<01:45,  5.04s/it]

1.59210823641883


 33%|███▎      | 10/30 [00:50<01:40,  5.05s/it]

1.4974021116892497


 37%|███▋      | 11/30 [00:55<01:35,  5.05s/it]

1.4147374629974365


 40%|████      | 12/30 [01:00<01:30,  5.05s/it]

1.3397164874606662


 43%|████▎     | 13/30 [01:05<01:25,  5.06s/it]

1.296516133679284


 47%|████▋     | 14/30 [01:10<01:20,  5.05s/it]

1.241160790125529


 50%|█████     | 15/30 [01:15<01:15,  5.05s/it]

1.1892808543311224


 53%|█████▎    | 16/30 [01:20<01:10,  5.04s/it]

1.1443513962957594


 57%|█████▋    | 17/30 [01:25<01:05,  5.04s/it]

1.1105898751152887


 60%|██████    | 18/30 [01:31<01:00,  5.04s/it]

1.0624175071716309


 63%|██████▎   | 19/30 [01:36<00:55,  5.05s/it]

1.0214061405923631


 67%|██████▋   | 20/30 [01:41<00:51,  5.11s/it]

1.0006959703233507


 70%|███████   | 21/30 [01:46<00:45,  5.10s/it]

0.9930688606368171


 73%|███████▎  | 22/30 [01:51<00:40,  5.08s/it]

0.9568179382218255


 77%|███████▋  | 23/30 [01:56<00:35,  5.07s/it]

0.9289791915151808


 80%|████████  | 24/30 [02:01<00:30,  5.06s/it]

0.9007644123501248


 83%|████████▎ | 25/30 [02:06<00:25,  5.06s/it]

0.8776506847805448


 87%|████████▋ | 26/30 [02:11<00:20,  5.06s/it]

0.854590098063151


 90%|█████████ | 27/30 [02:16<00:15,  5.06s/it]

0.8444079160690308


 93%|█████████▎| 28/30 [02:21<00:10,  5.05s/it]

0.8314600321981642


 97%|█████████▋| 29/30 [02:26<00:05,  5.06s/it]

0.8261241383022733


100%|██████████| 30/30 [02:31<00:00,  5.06s/it]
  0%|          | 0/50 [00:00<?, ?it/s]

0.8147941960228814


  2%|▏         | 1/50 [00:05<04:09,  5.09s/it]

0.8192484577496847


  4%|▍         | 2/50 [00:10<04:03,  5.08s/it]

0.7903589473830329


  6%|▌         | 3/50 [00:15<03:58,  5.08s/it]

0.76690188381407


  8%|▊         | 4/50 [00:20<03:53,  5.07s/it]

0.7525756557782491


 10%|█         | 5/50 [00:25<03:47,  5.07s/it]

0.7418774565060934


 12%|█▏        | 6/50 [00:30<03:42,  5.06s/it]

0.7377314700020684


 14%|█▍        | 7/50 [00:35<03:37,  5.06s/it]

0.7326780226495531


 16%|█▌        | 8/50 [00:40<03:32,  5.06s/it]

0.7292399009068807


 18%|█▊        | 9/50 [00:45<03:27,  5.05s/it]

0.7260304358270433


 20%|██        | 10/50 [00:50<03:22,  5.05s/it]

0.7236204213566251


 22%|██▏       | 11/50 [00:55<03:17,  5.05s/it]

0.721792995929718


 24%|██▍       | 12/50 [01:00<03:11,  5.05s/it]

0.7212084333101908


 26%|██▌       | 13/50 [01:05<03:06,  5.05s/it]

0.7189912133746676


 28%|██▊       | 14/50 [01:10<03:02,  5.06s/it]

0.7185898688104417


 30%|███       | 15/50 [01:15<02:56,  5.06s/it]

0.716882069905599


 32%|███▏      | 16/50 [01:20<02:51,  5.05s/it]

0.715247200595008


 34%|███▍      | 17/50 [01:25<02:46,  5.05s/it]

0.7139183216624789


 36%|███▌      | 18/50 [01:30<02:41,  5.05s/it]

0.7146949768066406


 38%|███▊      | 19/50 [01:36<02:36,  5.06s/it]

0.7119431959258186


 40%|████      | 20/50 [01:41<02:31,  5.07s/it]

0.7110427882936265


 42%|████▏     | 21/50 [01:46<02:27,  5.08s/it]

0.7108652657932706


 44%|████▍     | 22/50 [01:51<02:22,  5.08s/it]

0.709256629149119


 46%|████▌     | 23/50 [01:56<02:17,  5.08s/it]

0.7081542213757833


 48%|████▊     | 24/50 [02:01<02:12,  5.09s/it]

0.7077624996503195


 50%|█████     | 25/50 [02:06<02:07,  5.09s/it]

0.7072734898991055


 52%|█████▏    | 26/50 [02:11<02:02,  5.09s/it]

0.7070156203375922


 54%|█████▍    | 27/50 [02:16<01:57,  5.09s/it]

0.7060436341497633


 56%|█████▌    | 28/50 [02:21<01:51,  5.08s/it]

0.7042227586110433


 58%|█████▊    | 29/50 [02:26<01:46,  5.07s/it]

0.7046962115499709


 60%|██████    | 30/50 [02:32<01:41,  5.08s/it]

0.7028512292438083


 62%|██████▏   | 31/50 [02:37<01:36,  5.09s/it]

0.7030041548940871


 64%|██████▍   | 32/50 [02:42<01:31,  5.09s/it]

0.7022828658421835


 66%|██████▌   | 33/50 [02:47<01:26,  5.08s/it]

0.7005776696734958


 68%|██████▊   | 34/50 [02:52<01:21,  5.08s/it]

0.7006654474470351


 70%|███████   | 35/50 [02:57<01:16,  5.08s/it]

0.7010076642036438


 72%|███████▏  | 36/50 [03:02<01:11,  5.08s/it]

0.7008151147100661


 74%|███████▍  | 37/50 [03:07<01:05,  5.07s/it]

0.6992158757315742


 76%|███████▌  | 38/50 [03:12<01:01,  5.10s/it]

0.6985186272197299


 78%|███████▊  | 39/50 [03:17<00:55,  5.08s/it]

0.6977766619788276


 80%|████████  | 40/50 [03:22<00:50,  5.08s/it]

0.6975728604528639


 82%|████████▏ | 41/50 [03:27<00:45,  5.09s/it]

0.6970753073692322


 84%|████████▍ | 42/50 [03:33<00:40,  5.09s/it]

0.6960403852992587


 86%|████████▌ | 43/50 [03:38<00:35,  5.11s/it]

0.6954604453510709


 88%|████████▊ | 44/50 [03:43<00:30,  5.11s/it]

0.6934036943647597


 90%|█████████ | 45/50 [03:48<00:25,  5.10s/it]

0.6939700841903687


 92%|█████████▏| 46/50 [03:53<00:20,  5.10s/it]

0.6940890351931254


 94%|█████████▍| 47/50 [03:58<00:15,  5.11s/it]

0.6939594878090752


 96%|█████████▌| 48/50 [04:03<00:10,  5.11s/it]

0.6932145092222426


 98%|█████████▊| 49/50 [04:08<00:05,  5.11s/it]

0.6923619641198052


100%|██████████| 50/50 [04:13<00:00,  5.08s/it]

0.6917287906010946
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [32]:
model_score2

{'Bleu_1': 0.5375623017670803,
 'Bleu_2': 0.3963209404044298,
 'Bleu_3': 0.3137641581215164,
 'Bleu_4': 0.25951132713831176,
 'METEOR': 0.23242995331745267,
 'ROUGE_L': 0.4448378267442185,
 'CIDEr': 1.3824513308578386,
 'SPICE': 0.2936052517116219,
 'USC_similarity': 0.5183357046342103}

In [33]:
caption_model3, model_score3 = cross_validation(cv[2][0], cv[2][1], 3)    

Split 3:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 4/8333 [00:00<03:29, 39.74it/s]

preprocessed words 2714 ==> 890
The vocabulary size is 891.
800 out of 891 words are found in the pre-trained matrix.
The size of embedding_matrix is (891, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [03:34<00:00, 38.91it/s]
  0%|          | 5/2083 [00:00<00:49, 41.85it/s]


Generating set took: 0:03:34.18


100%|██████████| 2083/2083 [00:50<00:00, 40.93it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:00:50.89
Training...


  3%|▎         | 1/30 [00:05<02:27,  5.10s/it]

6.036721600426568


  7%|▋         | 2/30 [00:10<02:22,  5.10s/it]

4.386879603068034


 10%|█         | 3/30 [00:15<02:17,  5.09s/it]

3.3394628365834556


 13%|█▎        | 4/30 [00:20<02:12,  5.09s/it]

2.674030754301283


 17%|█▋        | 5/30 [00:25<02:07,  5.09s/it]

2.3253944185045032


 20%|██        | 6/30 [00:30<02:02,  5.08s/it]

2.079367068078783


 23%|██▎       | 7/30 [00:35<01:56,  5.08s/it]

1.8835263384713068


 27%|██▋       | 8/30 [00:40<01:51,  5.09s/it]

1.7291336721844144


 30%|███       | 9/30 [00:45<01:46,  5.09s/it]

1.6153672403759427


 33%|███▎      | 10/30 [00:50<01:41,  5.09s/it]

1.5121021138297186


 37%|███▋      | 11/30 [00:55<01:36,  5.08s/it]

1.4272979233000014


 40%|████      | 12/30 [01:01<01:31,  5.08s/it]

1.3595674435297649


 43%|████▎     | 13/30 [01:06<01:26,  5.07s/it]

1.3143311606513128


 47%|████▋     | 14/30 [01:11<01:21,  5.08s/it]

1.2530048025978937


 50%|█████     | 15/30 [01:16<01:16,  5.08s/it]

1.1955829991234674


 53%|█████▎    | 16/30 [01:21<01:11,  5.08s/it]

1.137938380241394


 57%|█████▋    | 17/30 [01:26<01:06,  5.09s/it]

1.0924296114179823


 60%|██████    | 18/30 [01:31<01:00,  5.08s/it]

1.060634930928548


 63%|██████▎   | 19/30 [01:36<00:55,  5.08s/it]

1.0312486953205533


 67%|██████▋   | 20/30 [01:41<00:51,  5.16s/it]

1.0055073499679565


 70%|███████   | 21/30 [01:47<00:46,  5.14s/it]

0.9789519111315409


 73%|███████▎  | 22/30 [01:52<00:40,  5.12s/it]

0.9509139723247952


 77%|███████▋  | 23/30 [01:57<00:35,  5.11s/it]

0.920818911658393


 80%|████████  | 24/30 [02:02<00:30,  5.11s/it]

0.8975097669495476


 83%|████████▎ | 25/30 [02:07<00:25,  5.10s/it]

0.87204067574607


 87%|████████▋ | 26/30 [02:12<00:20,  5.09s/it]

0.8499553402264913


 90%|█████████ | 27/30 [02:17<00:15,  5.08s/it]

0.8282274537616305


 93%|█████████▎| 28/30 [02:22<00:10,  5.09s/it]

0.8242297040091621


 97%|█████████▋| 29/30 [02:27<00:05,  5.09s/it]

0.825322601530287


100%|██████████| 30/30 [02:32<00:00,  5.09s/it]
  0%|          | 0/50 [00:00<?, ?it/s]

0.8158903585539924


  2%|▏         | 1/50 [00:05<04:08,  5.07s/it]

0.7804175151718987


  4%|▍         | 2/50 [00:10<04:03,  5.08s/it]

0.7693461312188042


  6%|▌         | 3/50 [00:15<03:58,  5.08s/it]

0.7581641475359598


  8%|▊         | 4/50 [00:20<03:53,  5.07s/it]

0.7486639155281914


 10%|█         | 5/50 [00:25<03:48,  5.08s/it]

0.7421289682388306


 12%|█▏        | 6/50 [00:30<03:43,  5.07s/it]

0.7376696003807915


 14%|█▍        | 7/50 [00:35<03:38,  5.07s/it]

0.7357618543836806


 16%|█▌        | 8/50 [00:40<03:33,  5.08s/it]

0.7304403252071805


 18%|█▊        | 9/50 [00:45<03:28,  5.08s/it]

0.7303104334407382


 20%|██        | 10/50 [00:50<03:23,  5.10s/it]

0.7279243734147813


 22%|██▏       | 11/50 [00:55<03:18,  5.09s/it]

0.7266147401597765


 24%|██▍       | 12/50 [01:01<03:13,  5.09s/it]

0.7252599994341532


 26%|██▌       | 13/50 [01:06<03:08,  5.10s/it]

0.7227727505895827


 28%|██▊       | 14/50 [01:11<03:03,  5.10s/it]

0.7212232417530484


 30%|███       | 15/50 [01:16<02:57,  5.08s/it]

0.7219040658738878


 32%|███▏      | 16/50 [01:21<02:52,  5.08s/it]

0.719433605670929


 34%|███▍      | 17/50 [01:26<02:47,  5.09s/it]

0.7196785940064324


 36%|███▌      | 18/50 [01:31<02:42,  5.08s/it]

0.7187272177802192


 38%|███▊      | 19/50 [01:36<02:37,  5.07s/it]

0.7170231938362122


 40%|████      | 20/50 [01:41<02:32,  5.07s/it]

0.7166860169834561


 42%|████▏     | 21/50 [01:46<02:26,  5.07s/it]

0.7153201566802131


 44%|████▍     | 22/50 [01:51<02:21,  5.07s/it]

0.7146689295768738


 46%|████▌     | 23/50 [01:56<02:17,  5.08s/it]

0.7144653797149658


 48%|████▊     | 24/50 [02:01<02:12,  5.08s/it]

0.7136024170451694


 50%|█████     | 25/50 [02:07<02:06,  5.07s/it]

0.7124623656272888


 52%|█████▏    | 26/50 [02:12<02:02,  5.08s/it]

0.7112024691369798


 54%|█████▍    | 27/50 [02:17<01:56,  5.08s/it]

0.7113759517669678


 56%|█████▌    | 28/50 [02:22<01:51,  5.08s/it]

0.7110510600937737


 58%|█████▊    | 29/50 [02:27<01:46,  5.09s/it]

0.7092278136147393


 60%|██████    | 30/50 [02:32<01:41,  5.09s/it]

0.7092705633905199


 62%|██████▏   | 31/50 [02:37<01:36,  5.11s/it]

0.7086585428979661


 64%|██████▍   | 32/50 [02:42<01:31,  5.11s/it]

0.708018958568573


 66%|██████▌   | 33/50 [02:47<01:26,  5.10s/it]

0.708020395702786


 68%|██████▊   | 34/50 [02:52<01:21,  5.10s/it]

0.7069007092052035


 70%|███████   | 35/50 [02:58<01:16,  5.10s/it]

0.7062425679630704


 72%|███████▏  | 36/50 [03:03<01:11,  5.11s/it]

0.7055293983883328


 74%|███████▍  | 37/50 [03:08<01:06,  5.11s/it]

0.7050903373294406


 76%|███████▌  | 38/50 [03:13<01:01,  5.11s/it]

0.7051776978704665


 78%|███████▊  | 39/50 [03:18<00:56,  5.10s/it]

0.7031147744920518


 80%|████████  | 40/50 [03:23<00:50,  5.10s/it]

0.7032250033484565


 82%|████████▏ | 41/50 [03:28<00:45,  5.09s/it]

0.7026833693186442


 84%|████████▍ | 42/50 [03:33<00:40,  5.08s/it]

0.7018557058440315


 86%|████████▌ | 43/50 [03:38<00:35,  5.08s/it]

0.7021646499633789


 88%|████████▊ | 44/50 [03:43<00:30,  5.08s/it]

0.7004023856586881


 90%|█████████ | 45/50 [03:48<00:25,  5.08s/it]

0.7015586230489943


 92%|█████████▏| 46/50 [03:54<00:20,  5.09s/it]

0.6993337074915568


 94%|█████████▍| 47/50 [03:59<00:15,  5.15s/it]

0.6995649072859023


 96%|█████████▌| 48/50 [04:04<00:10,  5.12s/it]

0.6990793016221788


 98%|█████████▊| 49/50 [04:09<00:05,  5.10s/it]

0.6987208392884996


100%|██████████| 50/50 [04:14<00:00,  5.09s/it]

0.6988086965348985
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [34]:
model_score3

{'Bleu_1': 0.5400832668969554,
 'Bleu_2': 0.40191724141601487,
 'Bleu_3': 0.318971744032124,
 'Bleu_4': 0.26263663111889174,
 'METEOR': 0.24196269742096266,
 'ROUGE_L': 0.4570226739274497,
 'CIDEr': 1.4295462283110911,
 'SPICE': 0.3058347075034665,
 'USC_similarity': 0.5318594089282236}

In [35]:
caption_model4, model_score4 = cross_validation(cv[3][0], cv[3][1], 4)    

Split 4:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 5/8333 [00:00<03:26, 40.29it/s]

preprocessed words 2680 ==> 894
The vocabulary size is 895.
806 out of 895 words are found in the pre-trained matrix.
The size of embedding_matrix is (895, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [03:20<00:00, 41.47it/s]
  0%|          | 5/2083 [00:00<00:49, 41.93it/s]


Generating set took: 0:03:20.92


100%|██████████| 2083/2083 [00:49<00:00, 41.70it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:00:49.95
Training...


  3%|▎         | 1/30 [00:05<02:26,  5.04s/it]

6.931716018252903


  7%|▋         | 2/30 [00:10<02:21,  5.05s/it]

3.994262589348687


 10%|█         | 3/30 [00:15<02:16,  5.05s/it]

2.8860939343770347


 13%|█▎        | 4/30 [00:20<02:11,  5.06s/it]

2.420558902952406


 17%|█▋        | 5/30 [00:25<02:06,  5.06s/it]

2.1264171070522733


 20%|██        | 6/30 [00:30<02:01,  5.08s/it]

1.905994971593221


 23%|██▎       | 7/30 [00:35<01:57,  5.09s/it]

1.7428165409300063


 27%|██▋       | 8/30 [00:40<01:52,  5.09s/it]

1.6276541948318481


 30%|███       | 9/30 [00:45<01:46,  5.09s/it]

1.5124722454282973


 33%|███▎      | 10/30 [00:50<01:41,  5.09s/it]

1.4189695914586384


 37%|███▋      | 11/30 [00:55<01:36,  5.09s/it]

1.3381907675001357


 40%|████      | 12/30 [01:00<01:31,  5.08s/it]

1.2806763119167752


 43%|████▎     | 13/30 [01:06<01:26,  5.08s/it]

1.237110608153873


 47%|████▋     | 14/30 [01:11<01:21,  5.08s/it]

1.1755017042160034


 50%|█████     | 15/30 [01:16<01:16,  5.08s/it]

1.1233673095703125


 53%|█████▎    | 16/30 [01:21<01:11,  5.07s/it]

1.079861217074924


 57%|█████▋    | 17/30 [01:26<01:05,  5.08s/it]

1.0331070025761921


 60%|██████    | 18/30 [01:31<01:01,  5.16s/it]

0.9984408418337504


 63%|██████▎   | 19/30 [01:36<00:56,  5.14s/it]

0.9768891268306308


 67%|██████▋   | 20/30 [01:41<00:51,  5.12s/it]

0.9570923381381564


 70%|███████   | 21/30 [01:46<00:45,  5.11s/it]

0.9315115676985847


 73%|███████▎  | 22/30 [01:52<00:40,  5.10s/it]

0.892050994767083


 77%|███████▋  | 23/30 [01:57<00:35,  5.09s/it]

0.8604616589016385


 80%|████████  | 24/30 [02:02<00:30,  5.09s/it]

0.8314677874247233


 83%|████████▎ | 25/30 [02:07<00:25,  5.08s/it]

0.8134066727426317


 87%|████████▋ | 26/30 [02:12<00:20,  5.08s/it]

0.8025345537397597


 90%|█████████ | 27/30 [02:17<00:15,  5.08s/it]

0.7791172001096938


 93%|█████████▎| 28/30 [02:22<00:10,  5.09s/it]

0.7654377023379008


 97%|█████████▋| 29/30 [02:27<00:05,  5.09s/it]

0.750419921345181


100%|██████████| 30/30 [02:32<00:00,  5.09s/it]
  0%|          | 0/50 [00:00<?, ?it/s]

0.7384402751922607


  2%|▏         | 1/50 [00:05<04:08,  5.06s/it]

0.710514505704244


  4%|▍         | 2/50 [00:10<04:03,  5.08s/it]

0.7016627391179403


  6%|▌         | 3/50 [00:15<03:58,  5.08s/it]

0.6914570265346103


  8%|▊         | 4/50 [00:20<03:54,  5.09s/it]

0.6855711473359002


 10%|█         | 5/50 [00:25<03:49,  5.09s/it]

0.6811454627248976


 12%|█▏        | 6/50 [00:30<03:43,  5.08s/it]

0.6770553323957655


 14%|█▍        | 7/50 [00:35<03:38,  5.08s/it]

0.6741738650533888


 16%|█▌        | 8/50 [00:40<03:33,  5.07s/it]

0.6725365122159322


 18%|█▊        | 9/50 [00:45<03:28,  5.08s/it]

0.6706080105569627


 20%|██        | 10/50 [00:50<03:23,  5.08s/it]

0.6677229801813761


 22%|██▏       | 11/50 [00:55<03:18,  5.08s/it]

0.667349186208513


 24%|██▍       | 12/50 [01:00<03:12,  5.07s/it]

0.6668997274504768


 26%|██▌       | 13/50 [01:06<03:07,  5.08s/it]

0.6643171707789103


 28%|██▊       | 14/50 [01:11<03:02,  5.08s/it]

0.6637296279271444


 30%|███       | 15/50 [01:16<02:57,  5.08s/it]

0.6623561316066318


 32%|███▏      | 16/50 [01:21<02:52,  5.09s/it]

0.6613907085524665


 34%|███▍      | 17/50 [01:26<02:48,  5.09s/it]

0.6607281764348348


 36%|███▌      | 18/50 [01:31<02:43,  5.09s/it]

0.6594557166099548


 38%|███▊      | 19/50 [01:36<02:37,  5.10s/it]

0.6586964726448059


 40%|████      | 20/50 [01:41<02:32,  5.10s/it]

0.6581335730022855


 42%|████▏     | 21/50 [01:46<02:27,  5.10s/it]

0.6574283242225647


 44%|████▍     | 22/50 [01:51<02:22,  5.10s/it]

0.656683623790741


 46%|████▌     | 23/50 [01:57<02:17,  5.10s/it]

0.6550869743029276


 48%|████▊     | 24/50 [02:02<02:12,  5.10s/it]

0.6549529896842109


 50%|█████     | 25/50 [02:07<02:07,  5.10s/it]

0.6549978587362502


 52%|█████▏    | 26/50 [02:12<02:02,  5.10s/it]

0.6532548930909898


 54%|█████▍    | 27/50 [02:17<01:57,  5.09s/it]

0.6527515848477682


 56%|█████▌    | 28/50 [02:22<01:52,  5.09s/it]

0.652550545003679


 58%|█████▊    | 29/50 [02:27<01:46,  5.09s/it]

0.6521298156844245


 60%|██████    | 30/50 [02:32<01:41,  5.09s/it]

0.6510079701741537


 62%|██████▏   | 31/50 [02:37<01:36,  5.08s/it]

0.6512129969067044


 64%|██████▍   | 32/50 [02:42<01:31,  5.09s/it]

0.6505772074063619


 66%|██████▌   | 33/50 [02:47<01:26,  5.09s/it]

0.6499259140756395


 68%|██████▊   | 34/50 [02:53<01:21,  5.10s/it]

0.6486457917425368


 70%|███████   | 35/50 [02:58<01:16,  5.10s/it]

0.6481686962975396


 72%|███████▏  | 36/50 [03:03<01:11,  5.10s/it]

0.6482349766625298


 74%|███████▍  | 37/50 [03:08<01:06,  5.10s/it]

0.6479015813933479


 76%|███████▌  | 38/50 [03:13<01:01,  5.09s/it]

0.6464860306845771


 78%|███████▊  | 39/50 [03:18<00:55,  5.09s/it]

0.6455076932907104


 80%|████████  | 40/50 [03:23<00:50,  5.10s/it]

0.6464807258711921


 82%|████████▏ | 41/50 [03:28<00:45,  5.10s/it]

0.6450430419709947


 84%|████████▍ | 42/50 [03:33<00:40,  5.09s/it]

0.6445123884412978


 86%|████████▌ | 43/50 [03:38<00:35,  5.08s/it]

0.6448164184888204


 88%|████████▊ | 44/50 [03:43<00:30,  5.07s/it]

0.6430481804741753


 90%|█████████ | 45/50 [03:49<00:25,  5.08s/it]

0.6433997882737054


 92%|█████████▏| 46/50 [03:54<00:20,  5.15s/it]

0.6436279813448588


 94%|█████████▍| 47/50 [03:59<00:15,  5.13s/it]

0.6426680088043213


 96%|█████████▌| 48/50 [04:04<00:10,  5.11s/it]

0.6415058374404907


 98%|█████████▊| 49/50 [04:09<00:05,  5.11s/it]

0.6416170729531182


100%|██████████| 50/50 [04:14<00:00,  5.09s/it]

0.6413949728012085
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [36]:
model_score4

{'Bleu_1': 0.5385183575928508,
 'Bleu_2': 0.401664355839296,
 'Bleu_3': 0.31965552329358043,
 'Bleu_4': 0.2636170076299316,
 'METEOR': 0.24291759910549474,
 'ROUGE_L': 0.453435001145289,
 'CIDEr': 1.3581838537245277,
 'SPICE': 0.3019722670054876,
 'USC_similarity': 0.5283644756782876}

In [37]:
caption_model5, model_score5 = cross_validation(cv[4][0], cv[4][1], 5)    

Split 5:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 4/8333 [00:00<03:39, 37.96it/s]

preprocessed words 2657 ==> 905
The vocabulary size is 906.
815 out of 906 words are found in the pre-trained matrix.
The size of embedding_matrix is (906, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [03:37<00:00, 38.24it/s]
  0%|          | 4/2083 [00:00<00:54, 38.14it/s]


Generating set took: 0:03:37.94


100%|██████████| 2083/2083 [00:55<00:00, 37.64it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:00:55.34
Training...


  3%|▎         | 1/30 [00:05<02:28,  5.11s/it]

6.339605437384711


  7%|▋         | 2/30 [00:10<02:23,  5.11s/it]

4.249551428688897


 10%|█         | 3/30 [00:15<02:17,  5.10s/it]

3.2269477314419217


 13%|█▎        | 4/30 [00:20<02:12,  5.10s/it]

2.6042593585120306


 17%|█▋        | 5/30 [00:25<02:07,  5.10s/it]

2.2792451911502414


 20%|██        | 6/30 [00:30<02:02,  5.10s/it]

2.042784425947401


 23%|██▎       | 7/30 [00:35<01:57,  5.10s/it]

1.8613771729999118


 27%|██▋       | 8/30 [00:40<01:52,  5.10s/it]

1.721433851453993


 30%|███       | 9/30 [00:45<01:47,  5.10s/it]

1.600182228618198


 33%|███▎      | 10/30 [00:50<01:41,  5.10s/it]

1.508430626657274


 37%|███▋      | 11/30 [00:56<01:36,  5.10s/it]

1.4327043559816148


 40%|████      | 12/30 [01:01<01:31,  5.11s/it]

1.354536188973321


 43%|████▎     | 13/30 [01:06<01:26,  5.11s/it]

1.2796898285547893


 47%|████▋     | 14/30 [01:11<01:21,  5.12s/it]

1.2129360834757488


 50%|█████     | 15/30 [01:16<01:16,  5.11s/it]

1.151537471347385


 53%|█████▎    | 16/30 [01:21<01:11,  5.11s/it]

1.105354905128479


 57%|█████▋    | 17/30 [01:26<01:06,  5.11s/it]

1.0664195948176913


 60%|██████    | 18/30 [01:31<01:01,  5.13s/it]

1.0317992965380351


 63%|██████▎   | 19/30 [01:37<00:56,  5.12s/it]

0.9967821968926324


 67%|██████▋   | 20/30 [01:42<00:51,  5.11s/it]

0.962202231089274


 70%|███████   | 21/30 [01:47<00:45,  5.11s/it]

0.9288284844822354


 73%|███████▎  | 22/30 [01:52<00:40,  5.11s/it]

0.9204189512464735


 77%|███████▋  | 23/30 [01:57<00:35,  5.11s/it]

0.9239352676603529


 80%|████████  | 24/30 [02:02<00:30,  5.11s/it]

0.9014716810650296


 83%|████████▎ | 25/30 [02:07<00:25,  5.14s/it]

0.8708501590622796


 87%|████████▋ | 26/30 [02:12<00:20,  5.14s/it]

0.8333869510226779


 90%|█████████ | 27/30 [02:18<00:15,  5.13s/it]

0.8048171798388163


 93%|█████████▎| 28/30 [02:23<00:10,  5.12s/it]

0.780169771777259


 97%|█████████▋| 29/30 [02:28<00:05,  5.12s/it]

0.7688845462269254


100%|██████████| 30/30 [02:33<00:00,  5.11s/it]
  0%|          | 0/50 [00:00<?, ?it/s]

0.7604452504052056


  2%|▏         | 1/50 [00:05<04:10,  5.11s/it]

0.7265607251061333


  4%|▍         | 2/50 [00:10<04:05,  5.11s/it]

0.7118180063035753


  6%|▌         | 3/50 [00:15<04:00,  5.12s/it]

0.6999521719084846


  8%|▊         | 4/50 [00:20<03:55,  5.12s/it]

0.6924256417486403


 10%|█         | 5/50 [00:25<03:49,  5.11s/it]

0.6886884636349149


 12%|█▏        | 6/50 [00:30<03:44,  5.11s/it]

0.6858535872565376


 14%|█▍        | 7/50 [00:35<03:39,  5.10s/it]

0.6831332908736335


 16%|█▌        | 8/50 [00:40<03:34,  5.10s/it]

0.6812933617168002


 18%|█▊        | 9/50 [00:45<03:28,  5.10s/it]

0.6811629666222466


 20%|██        | 10/50 [00:51<03:24,  5.11s/it]

0.678140229649014


 22%|██▏       | 11/50 [00:56<03:19,  5.11s/it]

0.6779323021570841


 24%|██▍       | 12/50 [01:01<03:13,  5.10s/it]

0.6772496435377333


 26%|██▌       | 13/50 [01:06<03:08,  5.11s/it]

0.6750216219160292


 28%|██▊       | 14/50 [01:11<03:03,  5.10s/it]

0.6760129928588867


 30%|███       | 15/50 [01:16<02:58,  5.10s/it]

0.6742182903819613


 32%|███▏      | 16/50 [01:21<02:53,  5.11s/it]

0.6728075610266792


 34%|███▍      | 17/50 [01:26<02:48,  5.10s/it]

0.6724045938915677


 36%|███▌      | 18/50 [01:31<02:43,  5.11s/it]

0.6707708835601807


 38%|███▊      | 19/50 [01:37<02:38,  5.11s/it]

0.670495867729187


 40%|████      | 20/50 [01:42<02:33,  5.11s/it]

0.6692971587181091


 42%|████▏     | 21/50 [01:47<02:27,  5.10s/it]

0.6697915527555678


 44%|████▍     | 22/50 [01:52<02:23,  5.13s/it]

0.6686988340483772


 46%|████▌     | 23/50 [01:57<02:18,  5.12s/it]

0.6672963831159804


 48%|████▊     | 24/50 [02:02<02:13,  5.12s/it]

0.6678453087806702


 50%|█████     | 25/50 [02:07<02:07,  5.11s/it]

0.6670043733384874


 52%|█████▏    | 26/50 [02:12<02:02,  5.12s/it]

0.6664998133977255


 54%|█████▍    | 27/50 [02:17<01:57,  5.11s/it]

0.6653041574690077


 56%|█████▌    | 28/50 [02:23<01:52,  5.11s/it]

0.664207829369439


 58%|█████▊    | 29/50 [02:28<01:47,  5.10s/it]

0.6644598311848111


 60%|██████    | 30/50 [02:33<01:42,  5.11s/it]

0.6637970407803854


 62%|██████▏   | 31/50 [02:38<01:37,  5.11s/it]

0.6634748776753744


 64%|██████▍   | 32/50 [02:43<01:31,  5.11s/it]

0.6622439622879028


 66%|██████▌   | 33/50 [02:48<01:26,  5.11s/it]

0.6621544029977586


 68%|██████▊   | 34/50 [02:53<01:21,  5.11s/it]

0.6612168881628249


 70%|███████   | 35/50 [02:58<01:16,  5.10s/it]

0.6610119673940871


 72%|███████▏  | 36/50 [03:03<01:11,  5.10s/it]

0.6612032585673862


 74%|███████▍  | 37/50 [03:09<01:06,  5.11s/it]

0.6605443822013007


 76%|███████▌  | 38/50 [03:14<01:01,  5.11s/it]

0.6595233082771301


 78%|███████▊  | 39/50 [03:19<00:56,  5.11s/it]

0.6597828798823886


 80%|████████  | 40/50 [03:24<00:51,  5.11s/it]

0.6591713362269931


 82%|████████▏ | 41/50 [03:29<00:46,  5.11s/it]

0.6582615640428331


 84%|████████▍ | 42/50 [03:34<00:40,  5.11s/it]

0.6581878463427225


 86%|████████▌ | 43/50 [03:39<00:35,  5.11s/it]

0.6576003630956014


 88%|████████▊ | 44/50 [03:44<00:30,  5.11s/it]

0.6575357516606649


 90%|█████████ | 45/50 [03:49<00:25,  5.11s/it]

0.6565239826838175


 92%|█████████▏| 46/50 [03:55<00:20,  5.11s/it]

0.6564567751354642


 94%|█████████▍| 47/50 [04:00<00:15,  5.10s/it]

0.655600110689799


 96%|█████████▌| 48/50 [04:05<00:10,  5.11s/it]

0.6549380554093255


 98%|█████████▊| 49/50 [04:10<00:05,  5.11s/it]

0.6546445157792833


100%|██████████| 50/50 [04:15<00:00,  5.11s/it]

0.6542874972025553
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [38]:
model_score5

{'Bleu_1': 0.5391378843899116,
 'Bleu_2': 0.40305377539010917,
 'Bleu_3': 0.3223556718709861,
 'Bleu_4': 0.2683234256507227,
 'METEOR': 0.23510841121944365,
 'ROUGE_L': 0.44604607276402924,
 'CIDEr': 1.3759996137393877,
 'SPICE': 0.2893594562940821,
 'USC_similarity': 0.5212505211899853}

In [39]:
model_scores = defaultdict(list)
for scores in [model_score1, model_score2, model_score3, model_score4, model_score5]:
    for key, value in scores.items():
        model_scores[key].append(value)

In [40]:
model_scores

defaultdict(list,
            {'Bleu_1': [0.5420602587634623,
              0.5375623017670803,
              0.5400832668969554,
              0.5385183575928508,
              0.5391378843899116],
             'Bleu_2': [0.3944384687544192,
              0.3963209404044298,
              0.40191724141601487,
              0.401664355839296,
              0.40305377539010917],
             'Bleu_3': [0.3076348082784494,
              0.3137641581215164,
              0.318971744032124,
              0.31965552329358043,
              0.3223556718709861],
             'Bleu_4': [0.2505738856582193,
              0.25951132713831176,
              0.26263663111889174,
              0.2636170076299316,
              0.2683234256507227],
             'METEOR': [0.2305141226280994,
              0.23242995331745267,
              0.24196269742096266,
              0.24291759910549474,
              0.23510841121944365],
             'ROUGE_L': [0.44552871905314234,
              0.44483782

In [41]:
tag = '15.1'
with open(f'{root_captioning}/fz_notebooks/cv_n{tag}.json', 'w') as fp:
    json.dump(model_scores, fp)