## Image Captioning with Pytorch

The following contents are modified from MDS DSCI 575 lecture 8 demo

In [1]:
import os, sys, json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.model_selection import KFold

sys.path.append('../../scr/evaluation')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.usc_sim.usc_sim import usc_sim
import subprocess


START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True


In [2]:
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"
        
if AWS:
    root_captioning = "../../data"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [5]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [6]:
# get img path and caption list
# # only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
test_paths, test_descriptions, max_length_test = get_img_info('valid')
max_length = max(max_length_train, max_length_test)



In [7]:
all_paths = train_paths.copy()
all_paths.extend(test_paths.copy())
all_paths = np.array(all_paths)

all_descriptions = train_descriptions.copy()
all_descriptions.extend(test_descriptions.copy())
all_descriptions = np.array(all_descriptions)

captions = all_descriptions.copy()
max_length_all = max(max_length_train, max_length_test)
max_length = max_length_all + 2
      
lex = set()
for sen in all_descriptions:
    [lex.update(d.split()) for d in sen]
    
# add a start and stop token at the beginning/end
for v in all_descriptions:
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'
        
print(f'There are {len(all_paths)} images') 
print(f'There are {len(lex)} unique words (vocab)')
print(f'The maximum length of captions with start and stop token is {max_length}.')


There are 10416 images
There are 2912 unique words (vocab)
The maximum length of captions with start and stop token is 36.


In [8]:
all_paths[-1]

'../../s3/valid/rsicd_park_33.jpg'

In [9]:
all_descriptions[-1]

array(['startseq a vast artificial lake was built in the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq'],
      dtype='<U184')

### Loading Wikipedia2vec Embeddings

In [10]:
# read the embedding matrix 
with open(f'{root_captioning}/enwiki_20180420_2338_words_500d.json', 'r', encoding='utf-8') as file:
    embeddings_index = json.load(file)

In [11]:
def get_vocab(descriptions, word_count_threshold=10):

    captions = []
    for val in descriptions:
        for cap in val:
            captions.append(cap)
    print(f'There are {len(captions)} captions')
    
    word_counts = {}
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
    return vocab

def get_word_dict(vocab):
    
    idxtoword = {}
    wordtoidx = {}

    ix = 1
    for w in vocab:
        wordtoidx[w] = ix
        idxtoword[ix] = w
        ix += 1

    return idxtoword, wordtoidx

def get_vocab_size(idxtoword):
    
    print(f'The vocabulary size is {len(idxtoword) + 1}.')
    return len(idxtoword) + 1


def get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx):

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count = 0

    for word, i in wordtoidx.items():

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            count += 1
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
            
    print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')            
    print(f'The size of embedding_matrix is {embedding_matrix.shape}')
    return embedding_matrix

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
class CNNModel(nn.Module):

    def __init__(self, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        # inception v3 expects (299, 299) sized images
        self.model = models.inception_v3(pretrained=pretrained, aux_logits=False)
        # remove the classification layer
        self.model =\
        nn.Sequential(
            *(list(self.model.children())[: 3]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[3: 5]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[5: -1])
        )

        self.input_size = 299

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
          # set the model to evaluation model
          self.model.eval()

        # N x 3 x 299 x 299
        features = self.model(img_input)
        # N x 2048 x 8 x 8

        return features

In [14]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
              'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train

        self.dropout = nn.Dropout(p=0.5)

        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
 

    def forward(self, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # embed the captions
        embedding = self.dropout(self.embedding(captions))

        outputs, (h, c) = self.lstm(embedding)

        return outputs, (h, c)



In [15]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        feature_size: int
            the number of features in the image matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        if cnn_type == 'vgg16':
            self.feature_size = 4096
        elif cnn_type == 'inception_v3':
            self.feature_size = 2048
        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")  

        self.decoder = RNNModel(
            vocab_size, 
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )
        
        self.dropout = nn.Dropout(p=0.5)
        self.dense1 = nn.Linear(self.feature_size, hidden_size) 
        self.relu1 = nn.ReLU()
          
        self.dense2 = nn.Linear(hidden_size, hidden_size) 
        self.relu2 = nn.ReLU()
        self.dense3 = nn.Linear(hidden_size, vocab_size) 

    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor
            the image feature matrix
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        img_features =\
        self.relu1(
            self.dense1(
                self.dropout(
                    img_features
                )
            )
        )

        decoder_out, _ = self.decoder(captions)

        # add up decoder outputs and image features
        outputs =\
        self.dense3(
            self.relu2(
                self.dense2(
                    decoder_out.add(
                        (img_features.view(img_features.size(0), 1, -1))\
                        .repeat(1, decoder_out.size(1), 1)
                    )
                )
            )
        )

        return outputs

### Train the Neural Network

In [16]:
def train(model, iterator, optimizer, criterion, clip, vocab_size):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs = model(
            img_features.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        )
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [17]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
    ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        # return len(self.descriptions)
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img = self.imgs[idx // 5]
        # convert each word into a list of sequences.
        seq = [self.wordtoidx[word] for word 
               in self.descriptions[idx // 5][idx % 5].split(' ')
               if word in self.wordtoidx]
        # pad the sequence with 0 on the right side
        in_seq = np.pad(
            seq, 
            (0, max_length - len(seq)),
            mode='constant',
            constant_values=(0, 0)
            )

        return img, in_seq


In [18]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [19]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape)
    # Call model to extract the smaller feature set for the image.
    x = model(img.to(device), False) 
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.squeeze(x)
    return x

In [20]:
def extract_img_features(img_paths, model):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    start = time()
    img_features = []

    for image_path in tqdm(img_paths):

        img_features.append(
            F.adaptive_avg_pool2d(
                (encode_image(model, image_path).cpu()), 
                (1, 1)
            ).squeeze().data.numpy()
        )

    print(f"\nGenerating set took: {hms_string(time()-start)}")

    return img_features

In [21]:
def get_train_test(
    encoder,
    train_paths,
    test_paths,
    cnn_type='inception_v3',
):

    train_img_features = extract_img_features(
        train_paths,
        encoder
    )

    test_img_features = extract_img_features(
        test_paths,
        encoder
    )
    return train_img_features, test_img_features

def get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
):
    train_dataset = SampleDataset(
        train_descriptions,
        train_img_features,
        wordtoidx,
        max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size
    )
    
    return train_loader

def train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    cnn_type='inception_v3',
    hidden_size=256,
):

    caption_model = CaptionModel(
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=hidden_size,
        embedding_matrix=embedding_matrix, 
        embedding_train=True
    )

    init_weights(
        caption_model,
        embedding_pretrained=True
    )

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(
        caption_model.parameters(), 
        lr=0.01
    )

    clip = 1
    start = time()

    for i in tqdm(range(EPOCHS * 7)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)

    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 7)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)
    return caption_model

In [22]:
def generateCaption(
    model, 
    img_features,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    in_text = START

    for i in range(max_length):

        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = np.pad(sequence, (0, max_length - len(sequence)),
                          mode='constant', constant_values=(0, 0))
        model.eval()
        yhat = model(
            torch.FloatTensor(img_features)\
            .view(-1, model.feature_size).to(device),
            torch.LongTensor(sequence).view(-1, max_length).to(device)
        )

        yhat = yhat.view(-1, vocab_size).argmax(1)
        word = idxtoword[yhat.cpu().data.numpy()[i]]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1 : -1]
    final = ' '.join(final)
    return final

### Evaluation

In [23]:
def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    """
    # download stanford nlp library
    subprocess.call(['../../scr/evaluation/get_stanford_models.sh'])
    
    # format the inputs
    gts = {}
    res = {}

    for imgId in range(len(ref_data)):
        caption_list_sel = []
        for i in range(5):
            lst = {}
            lst['caption'] = ref_data[imgId][i]
            lst['image_id'] = imgId
            lst['id'] = i
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        res[imgId] = [{'caption': results[imgId]}]
        
    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts  = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    
    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),  
        ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...'%(scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
        else:
            score_dict[method] = score
            
    return score_dict


In [24]:
def evaluate_results(
    test_img_features, 
    model,
    ref,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    # generate results
    print('Generating captions...')
    results = {}
    for n in range(len(test_img_features)):
        img_features = test_img_features[n]
        generated = generateCaption(
            model, 
            img_features,
            max_length,
            vocab_size,
            wordtoidx,
            idxtoword
        )
        results[n] = generated
        
    model_score = eval_model(ref, results)

    return model_score

### Cross validation

In [25]:
cnn_type = 'inception_v3'
encoder = CNNModel(pretrained=True)
encoder.to(device)

CNNModel(
  (model): Sequential(
    (0): BasicConv2d(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicConv2d(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): BasicConv2d(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): BasicConv2d(
      (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (5): BasicConv2d(
      (conv): Conv2d(80, 192, kernel_size=(3, 3), stride=(1, 1

In [26]:
def cross_validation(train_index, test_index, count):
    print('=' * 60)
    print(f'Split {count}:')
    print(f'Splitting data...')
    
    train_paths, test_paths = all_paths[train_index], all_paths[test_index]
    train_descriptions, test_descriptions = all_descriptions[train_index], all_descriptions[test_index]
    print(f'{len(train_paths)} images for training and {len(test_paths)} images for testing.')
    
    vocab = get_vocab(train_descriptions, word_count_threshold=10)
    idxtoword, wordtoidx = get_word_dict(vocab)
    vocab_size = get_vocab_size(idxtoword)
    embedding_dim = 500
    embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

    print(f'Preparing dataloader...')
    train_img_features, test_img_features = get_train_test(encoder, train_paths, test_paths)

    train_loader = get_train_dataloader(
        train_descriptions, 
        train_img_features,
        wordtoidx,
        max_length,
        batch_size=1000
    )

    print(f'Training...')
    caption_model = train_model(
        train_loader,
        vocab_size,
        embedding_dim, 
        embedding_matrix
    )

    
    ref = captions[test_index]
    model_score = evaluate_results(
        test_img_features, 
        caption_model,
        ref,
        max_length,
        vocab_size,
        wordtoidx,
        idxtoword
    )
    
    return caption_model, model_score

In [27]:
cv = KFold(n_splits=5, random_state=123, shuffle=True)
cv = [(train_index, test_index) for train_index, test_index in cv.split(all_paths)]  

In [28]:
caption_model1, model_score1 = cross_validation(cv[0][0], cv[0][1], 1)    

Split 1:
Splitting data...
8332 images for training and 2084 images for testing.
There are 41660 captions


  0%|          | 0/8332 [00:00<?, ?it/s]

preprocessed words 2659 ==> 884
The vocabulary size is 885.
793 out of 885 words are found in the pre-trained matrix.
The size of embedding_matrix is (885, 500)
Preparing dataloader...


100%|██████████| 8332/8332 [03:27<00:00, 40.14it/s]
  0%|          | 5/2084 [00:00<00:49, 41.87it/s]


Generating set took: 0:03:27.57


100%|██████████| 2084/2084 [00:51<00:00, 40.14it/s]
  0%|          | 0/70 [00:00<?, ?it/s]


Generating set took: 0:00:51.91
Training...


  1%|▏         | 1/70 [00:00<00:52,  1.32it/s]

6.681885401407878


  3%|▎         | 2/70 [00:01<00:51,  1.32it/s]

4.699528058369954


  4%|▍         | 3/70 [00:02<00:50,  1.33it/s]

4.108570920096503


  6%|▌         | 4/70 [00:02<00:49,  1.34it/s]

3.2977338367038302


  7%|▋         | 5/70 [00:03<00:48,  1.34it/s]

2.761914014816284


  9%|▊         | 6/70 [00:04<00:47,  1.36it/s]

2.4345292250315347


 10%|█         | 7/70 [00:05<00:46,  1.37it/s]

2.2043461534712048


 11%|█▏        | 8/70 [00:05<00:45,  1.37it/s]

2.0163062810897827


 13%|█▎        | 9/70 [00:06<00:44,  1.36it/s]

1.851974990632799


 14%|█▍        | 10/70 [00:07<00:43,  1.37it/s]

1.7113094859653049


 16%|█▌        | 11/70 [00:08<00:43,  1.35it/s]

1.5794241825739543


 17%|█▋        | 12/70 [00:08<00:42,  1.37it/s]

1.4863482846154108


 19%|█▊        | 13/70 [00:09<00:41,  1.37it/s]

1.4190907610787287


 20%|██        | 14/70 [00:10<00:41,  1.36it/s]

1.3056254982948303


 21%|██▏       | 15/70 [00:11<00:40,  1.34it/s]

1.2248575025134616


 23%|██▎       | 16/70 [00:11<00:41,  1.31it/s]

1.161313858297136


 24%|██▍       | 17/70 [00:12<00:40,  1.32it/s]

1.1048289868566725


 26%|██▌       | 18/70 [00:13<00:39,  1.32it/s]

1.0436455474959478


 27%|██▋       | 19/70 [00:14<00:41,  1.24it/s]

0.985226591428121


 29%|██▊       | 20/70 [00:15<00:39,  1.27it/s]

0.9442082179917229


 30%|███       | 21/70 [00:15<00:38,  1.29it/s]

0.912776894039578


 31%|███▏      | 22/70 [00:16<00:36,  1.30it/s]

0.8723116914431254


 33%|███▎      | 23/70 [00:17<00:35,  1.32it/s]

0.8380623923407661


 34%|███▍      | 24/70 [00:18<00:34,  1.33it/s]

0.8072042200300429


 36%|███▌      | 25/70 [00:18<00:33,  1.33it/s]

0.7779082225428687


 37%|███▋      | 26/70 [00:19<00:32,  1.34it/s]

0.7676344712575277


 39%|███▊      | 27/70 [00:20<00:31,  1.35it/s]

0.7696735494666629


 40%|████      | 28/70 [00:20<00:31,  1.35it/s]

0.7547256648540497


 41%|████▏     | 29/70 [00:21<00:30,  1.35it/s]

0.7336900962723626


 43%|████▎     | 30/70 [00:22<00:29,  1.37it/s]

0.7167066600587633


 44%|████▍     | 31/70 [00:23<00:28,  1.37it/s]

0.7021142542362213


 46%|████▌     | 32/70 [00:23<00:28,  1.35it/s]

0.6766431861453586


 47%|████▋     | 33/70 [00:24<00:27,  1.35it/s]

0.6686417427327898


 49%|████▊     | 34/70 [00:25<00:26,  1.34it/s]

0.6743496755758921


 50%|█████     | 35/70 [00:26<00:26,  1.33it/s]

0.67995994620853


 51%|█████▏    | 36/70 [00:26<00:25,  1.34it/s]

0.6673269503646426


 53%|█████▎    | 37/70 [00:27<00:24,  1.34it/s]

0.6813072893354628


 54%|█████▍    | 38/70 [00:28<00:23,  1.35it/s]

0.6474773850705888


 56%|█████▌    | 39/70 [00:29<00:22,  1.36it/s]

0.6166222327285342


 57%|█████▋    | 40/70 [00:29<00:21,  1.37it/s]

0.5931505461533865


 59%|█████▊    | 41/70 [00:30<00:21,  1.36it/s]

0.575894961754481


 60%|██████    | 42/70 [00:31<00:22,  1.26it/s]

0.5659505989816453


 61%|██████▏   | 43/70 [00:32<00:20,  1.29it/s]

0.5673260589440664


 63%|██████▎   | 44/70 [00:32<00:19,  1.31it/s]

0.5906681153509352


 64%|██████▍   | 45/70 [00:33<00:18,  1.33it/s]

0.5953251156542037


 66%|██████▌   | 46/70 [00:34<00:17,  1.35it/s]

0.5691631734371185


 67%|██████▋   | 47/70 [00:35<00:17,  1.34it/s]

0.5469681587484148


 69%|██████▊   | 48/70 [00:35<00:16,  1.34it/s]

0.5402135848999023


 70%|███████   | 49/70 [00:36<00:15,  1.34it/s]

0.5494650271203783


 71%|███████▏  | 50/70 [00:37<00:14,  1.34it/s]

0.5797813004917569


 73%|███████▎  | 51/70 [00:38<00:14,  1.35it/s]

0.5735781755712297


 74%|███████▍  | 52/70 [00:38<00:13,  1.35it/s]

0.5792549815442827


 76%|███████▌  | 53/70 [00:39<00:12,  1.35it/s]

0.563860340250863


 77%|███████▋  | 54/70 [00:40<00:11,  1.34it/s]

0.5671568380461799


 79%|███████▊  | 55/70 [00:41<00:11,  1.34it/s]

0.5754741761419508


 80%|████████  | 56/70 [00:41<00:10,  1.35it/s]

0.5774292018678453


 81%|████████▏ | 57/70 [00:42<00:09,  1.33it/s]

0.5518585244814554


 83%|████████▎ | 58/70 [00:43<00:08,  1.34it/s]

0.5503031777011024


 84%|████████▍ | 59/70 [00:44<00:08,  1.35it/s]

0.5336507757504781


 86%|████████▌ | 60/70 [00:44<00:07,  1.35it/s]

0.5318849086761475


 87%|████████▋ | 61/70 [00:45<00:06,  1.33it/s]

0.5161057843102349


 89%|████████▊ | 62/70 [00:46<00:05,  1.34it/s]

0.5025442408190833


 90%|█████████ | 63/70 [00:47<00:05,  1.34it/s]

0.4822605583402846


 91%|█████████▏| 64/70 [00:47<00:04,  1.33it/s]

0.4681151674853431


 93%|█████████▎| 65/70 [00:48<00:03,  1.34it/s]

0.46454125973913407


 94%|█████████▍| 66/70 [00:49<00:02,  1.35it/s]

0.4476219382550981


 96%|█████████▌| 67/70 [00:50<00:02,  1.34it/s]

0.4408857590622372


 97%|█████████▋| 68/70 [00:50<00:01,  1.33it/s]

0.43977422184414333


 99%|█████████▊| 69/70 [00:51<00:00,  1.32it/s]

0.4320753150516086


100%|██████████| 70/70 [00:52<00:00,  1.34it/s]
  0%|          | 0/70 [00:00<?, ?it/s]

0.4294012619389428


  1%|▏         | 1/70 [00:00<00:49,  1.40it/s]

0.4223472128311793


  3%|▎         | 2/70 [00:01<00:49,  1.38it/s]

0.4122272382179896


  4%|▍         | 3/70 [00:02<00:49,  1.36it/s]

0.4016886121696896


  6%|▌         | 4/70 [00:02<00:48,  1.35it/s]

0.39405447741349536


  7%|▋         | 5/70 [00:03<00:48,  1.34it/s]

0.3931756052705977


  9%|▊         | 6/70 [00:04<00:47,  1.34it/s]

0.3873211079173618


 10%|█         | 7/70 [00:05<00:46,  1.35it/s]

0.38777179188198513


 11%|█▏        | 8/70 [00:05<00:45,  1.36it/s]

0.38376223378711277


 13%|█▎        | 9/70 [00:06<00:45,  1.35it/s]

0.38335944215456647


 14%|█▍        | 10/70 [00:07<00:45,  1.33it/s]

0.3837795853614807


 16%|█▌        | 11/70 [00:08<00:44,  1.34it/s]

0.3827821695142322


 17%|█▋        | 12/70 [00:08<00:42,  1.35it/s]

0.38090214133262634


 19%|█▊        | 13/70 [00:09<00:42,  1.34it/s]

0.3792015512784322


 20%|██        | 14/70 [00:10<00:42,  1.32it/s]

0.3780930952893363


 21%|██▏       | 15/70 [00:11<00:41,  1.33it/s]

0.37759559022055733


 23%|██▎       | 16/70 [00:11<00:40,  1.34it/s]

0.3775731490717994


 24%|██▍       | 17/70 [00:12<00:39,  1.34it/s]

0.3780502925316493


 26%|██▌       | 18/70 [00:13<00:38,  1.36it/s]

0.37746930453512406


 27%|██▋       | 19/70 [00:14<00:37,  1.35it/s]

0.37614210446675617


 29%|██▊       | 20/70 [00:14<00:36,  1.36it/s]

0.37611308693885803


 30%|███       | 21/70 [00:15<00:35,  1.38it/s]

0.37471843428081936


 31%|███▏      | 22/70 [00:16<00:35,  1.36it/s]

0.3742524849043952


 33%|███▎      | 23/70 [00:17<00:34,  1.36it/s]

0.3749608016676373


 34%|███▍      | 24/70 [00:17<00:33,  1.35it/s]

0.3739414032962587


 36%|███▌      | 25/70 [00:18<00:33,  1.35it/s]

0.37229124704996747


 37%|███▋      | 26/70 [00:19<00:32,  1.34it/s]

0.3715609245830112


 39%|███▊      | 27/70 [00:20<00:32,  1.34it/s]

0.3703953872124354


 40%|████      | 28/70 [00:21<00:33,  1.24it/s]

0.3700324098269145


 41%|████▏     | 29/70 [00:21<00:32,  1.27it/s]

0.3714907070000966


 43%|████▎     | 30/70 [00:22<00:31,  1.28it/s]

0.3710649659236272


 44%|████▍     | 31/70 [00:23<00:30,  1.29it/s]

0.36879381868574357


 46%|████▌     | 32/70 [00:24<00:29,  1.30it/s]

0.36975741386413574


 47%|████▋     | 33/70 [00:24<00:28,  1.30it/s]

0.3700711346334881


 49%|████▊     | 34/70 [00:25<00:27,  1.31it/s]

0.3682246208190918


 50%|█████     | 35/70 [00:26<00:26,  1.32it/s]

0.36854667133755153


 51%|█████▏    | 36/70 [00:27<00:25,  1.32it/s]

0.369112104177475


 53%|█████▎    | 37/70 [00:27<00:25,  1.31it/s]

0.3691317488749822


 54%|█████▍    | 38/70 [00:28<00:24,  1.32it/s]

0.36795468793974984


 56%|█████▌    | 39/70 [00:29<00:23,  1.33it/s]

0.3692638956838184


 57%|█████▋    | 40/70 [00:30<00:22,  1.34it/s]

0.368716933661037


 59%|█████▊    | 41/70 [00:30<00:21,  1.34it/s]

0.36720195412635803


 60%|██████    | 42/70 [00:31<00:20,  1.34it/s]

0.36756497621536255


 61%|██████▏   | 43/70 [00:32<00:20,  1.33it/s]

0.3663494735956192


 63%|██████▎   | 44/70 [00:33<00:19,  1.34it/s]

0.3658570597569148


 64%|██████▍   | 45/70 [00:33<00:18,  1.34it/s]

0.3650111373927858


 66%|██████▌   | 46/70 [00:34<00:17,  1.34it/s]

0.36471258600552875


 67%|██████▋   | 47/70 [00:35<00:17,  1.35it/s]

0.36325541966491276


 69%|██████▊   | 48/70 [00:35<00:16,  1.35it/s]

0.36525993214713204


 70%|███████   | 49/70 [00:36<00:15,  1.36it/s]

0.365585606959131


 71%|███████▏  | 50/70 [00:37<00:14,  1.37it/s]

0.36415258877807194


 73%|███████▎  | 51/70 [00:38<00:13,  1.36it/s]

0.36592987345324623


 74%|███████▍  | 52/70 [00:38<00:13,  1.35it/s]

0.3627019789483812


 76%|███████▌  | 53/70 [00:39<00:12,  1.35it/s]

0.36541056964132523


 77%|███████▋  | 54/70 [00:40<00:11,  1.35it/s]

0.365200259619289


 79%|███████▊  | 55/70 [00:41<00:11,  1.35it/s]

0.36378654175334507


 80%|████████  | 56/70 [00:42<00:11,  1.25it/s]

0.36314278344313305


 81%|████████▏ | 57/70 [00:42<00:10,  1.28it/s]

0.36350626581245


 83%|████████▎ | 58/70 [00:43<00:09,  1.30it/s]

0.36258042520946926


 84%|████████▍ | 59/70 [00:44<00:08,  1.32it/s]

0.3640493866470125


 86%|████████▌ | 60/70 [00:45<00:07,  1.34it/s]

0.36168045467800564


 87%|████████▋ | 61/70 [00:45<00:06,  1.34it/s]

0.36091550522380406


 89%|████████▊ | 62/70 [00:46<00:05,  1.35it/s]

0.36145685613155365


 90%|█████████ | 63/70 [00:47<00:05,  1.35it/s]

0.36263638238112134


 91%|█████████▏| 64/70 [00:47<00:04,  1.35it/s]

0.36122914486461216


 93%|█████████▎| 65/70 [00:48<00:03,  1.35it/s]

0.36133811871210736


 94%|█████████▍| 66/70 [00:49<00:02,  1.35it/s]

0.3614266167084376


 96%|█████████▌| 67/70 [00:50<00:02,  1.35it/s]

0.3616013129552205


 97%|█████████▋| 68/70 [00:50<00:01,  1.37it/s]

0.35966357754336464


 99%|█████████▊| 69/70 [00:51<00:00,  1.37it/s]

0.3614441355069478


100%|██████████| 70/70 [00:52<00:00,  1.34it/s]

0.3590025206406911
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [29]:
model_score1

{'Bleu_1': 0.5636411749139736,
 'Bleu_2': 0.4289017507235894,
 'Bleu_3': 0.34622082479232946,
 'Bleu_4': 0.2899296757283083,
 'METEOR': 0.2539452072748442,
 'ROUGE_L': 0.4779333316068181,
 'CIDEr': 1.5019565209259307,
 'SPICE': 0.3205979405421817,
 'USC_similarity': 0.5450922469689821}

In [30]:
caption_model2, model_score2 = cross_validation(cv[1][0], cv[1][1], 2)    

Split 2:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 4/8333 [00:00<03:51, 36.04it/s]

preprocessed words 2688 ==> 916
The vocabulary size is 917.
819 out of 917 words are found in the pre-trained matrix.
The size of embedding_matrix is (917, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [03:26<00:00, 40.31it/s]
  0%|          | 5/2083 [00:00<00:49, 41.65it/s]


Generating set took: 0:03:26.71


100%|██████████| 2083/2083 [00:50<00:00, 41.32it/s]
  0%|          | 0/70 [00:00<?, ?it/s]


Generating set took: 0:00:50.41
Training...


  1%|▏         | 1/70 [00:00<00:50,  1.36it/s]

7.172003269195557


  3%|▎         | 2/70 [00:01<00:49,  1.37it/s]

5.158834563361274


  4%|▍         | 3/70 [00:02<00:48,  1.37it/s]

4.456582546234131


  6%|▌         | 4/70 [00:02<00:48,  1.37it/s]

4.06417081091139


  7%|▋         | 5/70 [00:03<00:46,  1.39it/s]

3.564630667368571


  9%|▊         | 6/70 [00:04<00:46,  1.38it/s]

3.1074612935384116


 10%|█         | 7/70 [00:05<00:45,  1.38it/s]

2.753003570768568


 11%|█▏        | 8/70 [00:05<00:44,  1.38it/s]

2.481764872868856


 13%|█▎        | 9/70 [00:06<00:44,  1.38it/s]

2.272186173333062


 14%|█▍        | 10/70 [00:07<00:43,  1.37it/s]

2.1038045618269177


 16%|█▌        | 11/70 [00:07<00:42,  1.37it/s]

1.9704118569691975


 17%|█▋        | 12/70 [00:08<00:42,  1.37it/s]

1.8442140685187445


 19%|█▊        | 13/70 [00:09<00:41,  1.38it/s]

1.74812732802497


 20%|██        | 14/70 [00:10<00:40,  1.37it/s]

1.6512512233522203


 21%|██▏       | 15/70 [00:10<00:40,  1.37it/s]

1.5598306920793321


 23%|██▎       | 16/70 [00:11<00:39,  1.38it/s]

1.4849018653233845


 24%|██▍       | 17/70 [00:12<00:38,  1.37it/s]

1.425509148173862


 26%|██▌       | 18/70 [00:13<00:37,  1.39it/s]

1.3690889941321478


 27%|██▋       | 19/70 [00:13<00:36,  1.41it/s]

1.3287771609094408


 29%|██▊       | 20/70 [00:14<00:35,  1.39it/s]

1.2755889627668593


 30%|███       | 21/70 [00:15<00:35,  1.39it/s]

1.2374625868267484


 31%|███▏      | 22/70 [00:15<00:34,  1.38it/s]

1.1941516862975226


 33%|███▎      | 23/70 [00:16<00:33,  1.38it/s]

1.1563981506559584


 34%|███▍      | 24/70 [00:17<00:33,  1.38it/s]

1.1093392107221816


 36%|███▌      | 25/70 [00:18<00:32,  1.38it/s]

1.0798974765671625


 37%|███▋      | 26/70 [00:18<00:31,  1.38it/s]

1.0491969452963934


 39%|███▊      | 27/70 [00:19<00:31,  1.38it/s]

1.0149992836846247


 40%|████      | 28/70 [00:20<00:30,  1.36it/s]

0.9737599690755209


 41%|████▏     | 29/70 [00:21<00:30,  1.34it/s]

0.9393781158659193


 43%|████▎     | 30/70 [00:21<00:29,  1.36it/s]

0.9123065736558702


 44%|████▍     | 31/70 [00:22<00:28,  1.37it/s]

0.8909165329403348


 46%|████▌     | 32/70 [00:23<00:27,  1.37it/s]

0.8665881156921387


 47%|████▋     | 33/70 [00:23<00:26,  1.37it/s]

0.8552943269411722


 49%|████▊     | 34/70 [00:24<00:26,  1.38it/s]

0.8422715862592062


 50%|█████     | 35/70 [00:25<00:25,  1.37it/s]

0.8545760909716288


 51%|█████▏    | 36/70 [00:26<00:24,  1.37it/s]

0.8599015408092074


 53%|█████▎    | 37/70 [00:26<00:24,  1.37it/s]

0.8549924029244317


 54%|█████▍    | 38/70 [00:27<00:23,  1.37it/s]

0.8602842887242635


 56%|█████▌    | 39/70 [00:28<00:23,  1.35it/s]

0.8279902537663778


 57%|█████▋    | 40/70 [00:29<00:22,  1.36it/s]

0.7947394384278191


 59%|█████▊    | 41/70 [00:29<00:21,  1.37it/s]

0.7715084320969052


 60%|██████    | 42/70 [00:30<00:20,  1.36it/s]

0.7537093460559845


 61%|██████▏   | 43/70 [00:31<00:19,  1.37it/s]

0.7497714890374078


 63%|██████▎   | 44/70 [00:31<00:18,  1.39it/s]

0.7531726890140109


 64%|██████▍   | 45/70 [00:32<00:19,  1.31it/s]

0.7645450168185763


 66%|██████▌   | 46/70 [00:33<00:19,  1.25it/s]

0.7563341293070052


 67%|██████▋   | 47/70 [00:34<00:19,  1.15it/s]

0.7618637912803226


 69%|██████▊   | 48/70 [00:35<00:18,  1.20it/s]

0.7450981040795644


 70%|███████   | 49/70 [00:36<00:17,  1.23it/s]

0.7087164885467954


 71%|███████▏  | 50/70 [00:37<00:15,  1.25it/s]

0.6785495413674248


 73%|███████▎  | 51/70 [00:37<00:15,  1.25it/s]

0.6575395034419166


 74%|███████▍  | 52/70 [00:38<00:14,  1.28it/s]

0.6505562894874148


 76%|███████▌  | 53/70 [00:39<00:13,  1.29it/s]

0.6354274650414785


 77%|███████▋  | 54/70 [00:40<00:12,  1.31it/s]

0.6346684859858619


 79%|███████▊  | 55/70 [00:40<00:11,  1.31it/s]

0.6384362479050955


 80%|████████  | 56/70 [00:41<00:10,  1.32it/s]

0.6438700490527682


 81%|████████▏ | 57/70 [00:42<00:09,  1.32it/s]

0.6490415963861678


 83%|████████▎ | 58/70 [00:43<00:09,  1.32it/s]

0.6245715320110321


 84%|████████▍ | 59/70 [00:43<00:08,  1.33it/s]

0.6109345753987631


 86%|████████▌ | 60/70 [00:44<00:07,  1.33it/s]

0.5938973757955763


 87%|████████▋ | 61/70 [00:45<00:06,  1.34it/s]

0.5721804036034478


 89%|████████▊ | 62/70 [00:46<00:05,  1.34it/s]

0.5629781319035424


 90%|█████████ | 63/70 [00:46<00:05,  1.36it/s]

0.5603835417164696


 91%|█████████▏| 64/70 [00:47<00:04,  1.38it/s]

0.5598744054635366


 93%|█████████▎| 65/70 [00:48<00:03,  1.34it/s]

0.5670536723401811


 94%|█████████▍| 66/70 [00:49<00:02,  1.36it/s]

0.5648517111937205


 96%|█████████▌| 67/70 [00:49<00:02,  1.37it/s]

0.568293442328771


 97%|█████████▋| 68/70 [00:50<00:01,  1.36it/s]

0.5652727517816756


 99%|█████████▊| 69/70 [00:51<00:00,  1.34it/s]

0.5623669955465529


100%|██████████| 70/70 [00:51<00:00,  1.35it/s]
  0%|          | 0/70 [00:00<?, ?it/s]

0.5672340757317014


  1%|▏         | 1/70 [00:00<00:50,  1.35it/s]

0.5545951757166121


  3%|▎         | 2/70 [00:01<00:50,  1.35it/s]

0.5340148144298129


  4%|▍         | 3/70 [00:02<00:49,  1.37it/s]

0.5150681204266019


  6%|▌         | 4/70 [00:02<00:48,  1.37it/s]

0.4993170566029019


  7%|▋         | 5/70 [00:03<00:47,  1.38it/s]

0.49233546521928573


  9%|▊         | 6/70 [00:04<00:46,  1.37it/s]

0.48581074674924213


 10%|█         | 7/70 [00:05<00:46,  1.37it/s]

0.48226382666163975


 11%|█▏        | 8/70 [00:05<00:45,  1.36it/s]

0.47852812541855705


 13%|█▎        | 9/70 [00:06<00:44,  1.36it/s]

0.47793351941638523


 14%|█▍        | 10/70 [00:07<00:43,  1.37it/s]

0.47620166341463727


 16%|█▌        | 11/70 [00:08<00:43,  1.36it/s]

0.47229518824153477


 17%|█▋        | 12/70 [00:08<00:42,  1.36it/s]

0.4700343343946669


 19%|█▊        | 13/70 [00:09<00:42,  1.35it/s]

0.46839019159475964


 20%|██        | 14/70 [00:10<00:41,  1.34it/s]

0.4696217013729943


 21%|██▏       | 15/70 [00:11<00:40,  1.35it/s]

0.4675629536310832


 23%|██▎       | 16/70 [00:11<00:39,  1.35it/s]

0.46653716762860614


 24%|██▍       | 17/70 [00:12<00:38,  1.37it/s]

0.4671168194876777


 26%|██▌       | 18/70 [00:13<00:37,  1.38it/s]

0.4631207585334778


 27%|██▋       | 19/70 [00:14<00:41,  1.24it/s]

0.46168218221929336


 29%|██▊       | 20/70 [00:14<00:39,  1.27it/s]

0.46183472871780396


 30%|███       | 21/70 [00:15<00:37,  1.30it/s]

0.4617553816901313


 31%|███▏      | 22/70 [00:16<00:36,  1.33it/s]

0.45903437170717454


 33%|███▎      | 23/70 [00:17<00:35,  1.34it/s]

0.45960597693920135


 34%|███▍      | 24/70 [00:17<00:34,  1.34it/s]

0.4582243263721466


 36%|███▌      | 25/70 [00:18<00:33,  1.34it/s]

0.4570271256897185


 37%|███▋      | 26/70 [00:19<00:33,  1.32it/s]

0.4596860359112422


 39%|███▊      | 27/70 [00:20<00:32,  1.33it/s]

0.4545634984970093


 40%|████      | 28/70 [00:20<00:31,  1.32it/s]

0.45638298491636914


 41%|████▏     | 29/70 [00:21<00:31,  1.29it/s]

0.4564903544055091


 43%|████▎     | 30/70 [00:22<00:31,  1.26it/s]

0.4520838078525331


 44%|████▍     | 31/70 [00:23<00:31,  1.26it/s]

0.4521852847602632


 46%|████▌     | 32/70 [00:24<00:29,  1.28it/s]

0.45202350947591996


 47%|████▋     | 33/70 [00:24<00:28,  1.30it/s]

0.45244716935687596


 49%|████▊     | 34/70 [00:25<00:27,  1.31it/s]

0.45187509556611377


 50%|█████     | 35/70 [00:26<00:26,  1.34it/s]

0.4513319234053294


 51%|█████▏    | 36/70 [00:27<00:25,  1.34it/s]

0.45043232209152645


 53%|█████▎    | 37/70 [00:27<00:24,  1.34it/s]

0.44749748044543797


 54%|█████▍    | 38/70 [00:28<00:23,  1.36it/s]

0.448675908976131


 56%|█████▌    | 39/70 [00:29<00:22,  1.37it/s]

0.4496123790740967


 57%|█████▋    | 40/70 [00:29<00:21,  1.36it/s]

0.44579500953356427


 59%|█████▊    | 41/70 [00:30<00:21,  1.35it/s]

0.44912661280896926


 60%|██████    | 42/70 [00:31<00:20,  1.37it/s]

0.4453573442167706


 61%|██████▏   | 43/70 [00:32<00:20,  1.34it/s]

0.44653263356950545


 63%|██████▎   | 44/70 [00:32<00:19,  1.33it/s]

0.44504111508528393


 64%|██████▍   | 45/70 [00:33<00:18,  1.34it/s]

0.44502588775422836


 66%|██████▌   | 46/70 [00:34<00:18,  1.27it/s]

0.44317330916722614


 67%|██████▋   | 47/70 [00:35<00:17,  1.29it/s]

0.4457339462306764


 69%|██████▊   | 48/70 [00:36<00:16,  1.30it/s]

0.4435749418205685


 70%|███████   | 49/70 [00:36<00:15,  1.33it/s]

0.4432525585095088


 71%|███████▏  | 50/70 [00:37<00:14,  1.35it/s]

0.44243913392225903


 73%|███████▎  | 51/70 [00:38<00:14,  1.35it/s]

0.4433464937739902


 74%|███████▍  | 52/70 [00:38<00:13,  1.35it/s]

0.44164568848080105


 76%|███████▌  | 53/70 [00:39<00:12,  1.35it/s]

0.44078387651178574


 77%|███████▋  | 54/70 [00:40<00:13,  1.23it/s]

0.4398960851960712


 79%|███████▊  | 55/70 [00:41<00:11,  1.26it/s]

0.44056272837850785


 80%|████████  | 56/70 [00:42<00:10,  1.30it/s]

0.43976741698053146


 81%|████████▏ | 57/70 [00:42<00:09,  1.32it/s]

0.4380217691262563


 83%|████████▎ | 58/70 [00:43<00:08,  1.34it/s]

0.4382381952471203


 84%|████████▍ | 59/70 [00:44<00:08,  1.35it/s]

0.43828198810418445


 86%|████████▌ | 60/70 [00:45<00:07,  1.35it/s]

0.4395776391029358


 87%|████████▋ | 61/70 [00:45<00:06,  1.37it/s]

0.4389852003918754


 89%|████████▊ | 62/70 [00:46<00:05,  1.36it/s]

0.4382336868180169


 90%|█████████ | 63/70 [00:47<00:05,  1.36it/s]

0.4367447942495346


 91%|█████████▏| 64/70 [00:48<00:04,  1.36it/s]

0.43771740131907994


 93%|█████████▎| 65/70 [00:48<00:03,  1.33it/s]

0.43438829316033256


 94%|█████████▍| 66/70 [00:49<00:02,  1.34it/s]

0.43493883146180046


 96%|█████████▌| 67/70 [00:50<00:02,  1.33it/s]

0.4362320750951767


 97%|█████████▋| 68/70 [00:51<00:01,  1.34it/s]

0.4357975141869651


 99%|█████████▊| 69/70 [00:51<00:00,  1.32it/s]

0.4349168539047241


100%|██████████| 70/70 [00:52<00:00,  1.33it/s]

0.4351106269492043
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [31]:
model_score2

{'Bleu_1': 0.5619034694892766,
 'Bleu_2': 0.4273455743118685,
 'Bleu_3': 0.34568028068686524,
 'Bleu_4': 0.2897783365239755,
 'METEOR': 0.24975352608753387,
 'ROUGE_L': 0.4656078656435284,
 'CIDEr': 1.518974647973671,
 'SPICE': 0.31914643475096466,
 'USC_similarity': 0.5525516242692019}

In [32]:
caption_model3, model_score3 = cross_validation(cv[2][0], cv[2][1], 3)    

Split 3:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 4/8333 [00:00<03:52, 35.84it/s]

preprocessed words 2714 ==> 890
The vocabulary size is 891.
800 out of 891 words are found in the pre-trained matrix.
The size of embedding_matrix is (891, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [03:28<00:00, 39.97it/s]
  0%|          | 4/2083 [00:00<00:53, 39.01it/s]


Generating set took: 0:03:28.50


100%|██████████| 2083/2083 [00:50<00:00, 41.12it/s]
  0%|          | 0/70 [00:00<?, ?it/s]


Generating set took: 0:00:50.65
Training...


  1%|▏         | 1/70 [00:00<00:54,  1.27it/s]

6.975188573201497


  3%|▎         | 2/70 [00:01<00:55,  1.23it/s]

5.030569500393337


  4%|▍         | 3/70 [00:02<00:55,  1.22it/s]

4.651855521731907


  6%|▌         | 4/70 [00:03<00:54,  1.21it/s]

4.28029923968845


  7%|▋         | 5/70 [00:04<00:53,  1.21it/s]

4.016112883885701


  9%|▊         | 6/70 [00:04<00:52,  1.22it/s]

3.754942602581448


 10%|█         | 7/70 [00:05<00:51,  1.22it/s]

3.4694521162245007


 11%|█▏        | 8/70 [00:06<00:51,  1.21it/s]

3.13319550620185


 13%|█▎        | 9/70 [00:07<00:50,  1.21it/s]

2.8066713015238443


 14%|█▍        | 10/70 [00:08<00:48,  1.24it/s]

2.546067714691162


 16%|█▌        | 11/70 [00:08<00:45,  1.28it/s]

2.3256487581464977


 17%|█▋        | 12/70 [00:09<00:44,  1.30it/s]

2.148143026563856


 19%|█▊        | 13/70 [00:10<00:43,  1.32it/s]

2.0038982894685535


 20%|██        | 14/70 [00:11<00:41,  1.34it/s]

1.8933968544006348


 21%|██▏       | 15/70 [00:11<00:40,  1.35it/s]

1.7769571940104167


 23%|██▎       | 16/70 [00:12<00:39,  1.37it/s]

1.669872482617696


 24%|██▍       | 17/70 [00:13<00:38,  1.37it/s]

1.5808513826794095


 26%|██▌       | 18/70 [00:14<00:37,  1.38it/s]

1.513142122162713


 27%|██▋       | 19/70 [00:14<00:36,  1.39it/s]

1.4649714099036322


 29%|██▊       | 20/70 [00:15<00:36,  1.39it/s]

1.410304652320014


 30%|███       | 21/70 [00:16<00:35,  1.38it/s]

1.347033182779948


 31%|███▏      | 22/70 [00:16<00:34,  1.38it/s]

1.2930637730492487


 33%|███▎      | 23/70 [00:17<00:34,  1.38it/s]

1.240667912695143


 34%|███▍      | 24/70 [00:18<00:33,  1.36it/s]

1.2019943992296855


 36%|███▌      | 25/70 [00:19<00:33,  1.36it/s]

1.1543224652608235


 37%|███▋      | 26/70 [00:19<00:32,  1.36it/s]

1.1212096412976582


 39%|███▊      | 27/70 [00:20<00:31,  1.36it/s]

1.084716022014618


 40%|████      | 28/70 [00:21<00:30,  1.36it/s]

1.0642272101508246


 41%|████▏     | 29/70 [00:22<00:29,  1.37it/s]

1.0494991077317133


 43%|████▎     | 30/70 [00:22<00:28,  1.38it/s]

1.0212988191180759


 44%|████▍     | 31/70 [00:23<00:28,  1.38it/s]

1.0027952525350783


 46%|████▌     | 32/70 [00:24<00:27,  1.37it/s]

0.9685573048061795


 47%|████▋     | 33/70 [00:24<00:27,  1.37it/s]

0.9421453211042616


 49%|████▊     | 34/70 [00:25<00:26,  1.36it/s]

0.9273452162742615


 50%|█████     | 35/70 [00:26<00:26,  1.30it/s]

0.9145781199137369


 51%|█████▏    | 36/70 [00:27<00:27,  1.25it/s]

0.9019578165478177


 53%|█████▎    | 37/70 [00:28<00:26,  1.24it/s]

0.8768392470147874


 54%|█████▍    | 38/70 [00:29<00:26,  1.22it/s]

0.848003089427948


 56%|█████▌    | 39/70 [00:29<00:25,  1.19it/s]

0.829284217622545


 57%|█████▋    | 40/70 [00:30<00:25,  1.16it/s]

0.8160483572218153


 59%|█████▊    | 41/70 [00:31<00:25,  1.15it/s]

0.8171826667255826


 60%|██████    | 42/70 [00:32<00:24,  1.14it/s]

0.8119888106981913


 61%|██████▏   | 43/70 [00:33<00:23,  1.14it/s]

0.798962394396464


 63%|██████▎   | 44/70 [00:34<00:22,  1.14it/s]

0.780378497309155


 64%|██████▍   | 45/70 [00:35<00:21,  1.16it/s]

0.7792359987894694


 66%|██████▌   | 46/70 [00:36<00:20,  1.15it/s]

0.7675360706117418


 67%|██████▋   | 47/70 [00:36<00:19,  1.16it/s]

0.7638860444227854


 69%|██████▊   | 48/70 [00:37<00:18,  1.16it/s]

0.7404762970076667


 70%|███████   | 49/70 [00:38<00:18,  1.16it/s]

0.7255050440629324


 71%|███████▏  | 50/70 [00:39<00:17,  1.17it/s]

0.7065456012884775


 73%|███████▎  | 51/70 [00:40<00:16,  1.17it/s]

0.6960187455018362


 74%|███████▍  | 52/70 [00:41<00:15,  1.17it/s]

0.6880538165569305


 76%|███████▌  | 53/70 [00:42<00:14,  1.18it/s]

0.689601805475023


 77%|███████▋  | 54/70 [00:42<00:13,  1.19it/s]

0.6994892458120981


 79%|███████▊  | 55/70 [00:43<00:12,  1.20it/s]

0.6811805102560256


 80%|████████  | 56/70 [00:44<00:11,  1.20it/s]

0.6551354030768076


 81%|████████▏ | 57/70 [00:45<00:10,  1.19it/s]

0.6475242310100131


 83%|████████▎ | 58/70 [00:46<00:10,  1.19it/s]

0.6415523555543687


 84%|████████▍ | 59/70 [00:47<00:09,  1.19it/s]

0.6318620608912574


 86%|████████▌ | 60/70 [00:47<00:08,  1.18it/s]

0.621227029297087


 87%|████████▋ | 61/70 [00:48<00:07,  1.17it/s]

0.6253009339173635


 89%|████████▊ | 62/70 [00:49<00:06,  1.17it/s]

0.6177807847658793


 90%|█████████ | 63/70 [00:50<00:05,  1.17it/s]

0.6100322902202606


 91%|█████████▏| 64/70 [00:51<00:04,  1.21it/s]

0.5954813758532206


 93%|█████████▎| 65/70 [00:52<00:04,  1.24it/s]

0.5753992001215616


 94%|█████████▍| 66/70 [00:52<00:03,  1.26it/s]

0.5692541731728448


 96%|█████████▌| 67/70 [00:53<00:02,  1.28it/s]

0.5569381150934432


 97%|█████████▋| 68/70 [00:54<00:01,  1.30it/s]

0.5564520789517297


 99%|█████████▊| 69/70 [00:55<00:00,  1.32it/s]

0.5750736097494761


100%|██████████| 70/70 [00:55<00:00,  1.25it/s]
  0%|          | 0/70 [00:00<?, ?it/s]

0.589686801036199


  1%|▏         | 1/70 [00:00<00:50,  1.37it/s]

0.5884547697173225


  3%|▎         | 2/70 [00:01<00:50,  1.35it/s]

0.5605697532494863


  4%|▍         | 3/70 [00:02<00:49,  1.35it/s]

0.5396430657969581


  6%|▌         | 4/70 [00:02<00:48,  1.35it/s]

0.5229384866025713


  7%|▋         | 5/70 [00:03<00:48,  1.35it/s]

0.5129800935586294


  9%|▊         | 6/70 [00:04<00:47,  1.34it/s]

0.5070742302470737


 10%|█         | 7/70 [00:05<00:46,  1.36it/s]

0.5014357268810272


 11%|█▏        | 8/70 [00:05<00:45,  1.36it/s]

0.4990273912747701


 13%|█▎        | 9/70 [00:06<00:44,  1.37it/s]

0.4958456688457065


 14%|█▍        | 10/70 [00:07<00:43,  1.37it/s]

0.4928583734565311


 16%|█▌        | 11/70 [00:08<00:42,  1.38it/s]

0.48931248320473564


 17%|█▋        | 12/70 [00:09<00:47,  1.21it/s]

0.48986926012569004


 19%|█▊        | 13/70 [00:09<00:45,  1.25it/s]

0.48577806022432113


 20%|██        | 14/70 [00:10<00:44,  1.27it/s]

0.48567002680566573


 21%|██▏       | 15/70 [00:11<00:42,  1.30it/s]

0.4842343595292833


 23%|██▎       | 16/70 [00:12<00:42,  1.26it/s]

0.4818864729669359


 24%|██▍       | 17/70 [00:13<00:42,  1.24it/s]

0.4799417091740502


 26%|██▌       | 18/70 [00:13<00:42,  1.23it/s]

0.4822462730937534


 27%|██▋       | 19/70 [00:14<00:41,  1.22it/s]

0.47742752234141034


 29%|██▊       | 20/70 [00:15<00:41,  1.20it/s]

0.4796133836110433


 30%|███       | 21/70 [00:16<00:40,  1.20it/s]

0.47631030281384784


 31%|███▏      | 22/70 [00:17<00:39,  1.21it/s]

0.4725409944852193


 33%|███▎      | 23/70 [00:18<00:39,  1.20it/s]

0.4733426570892334


 34%|███▍      | 24/70 [00:18<00:38,  1.20it/s]

0.4748525619506836


 36%|███▌      | 25/70 [00:19<00:37,  1.21it/s]

0.47091439366340637


 37%|███▋      | 26/70 [00:20<00:36,  1.20it/s]

0.47073232796457076


 39%|███▊      | 27/70 [00:21<00:35,  1.20it/s]

0.47142523858282304


 40%|████      | 28/70 [00:22<00:35,  1.20it/s]

0.4712348547246721


 41%|████▏     | 29/70 [00:23<00:35,  1.17it/s]

0.4692630138662126


 43%|████▎     | 30/70 [00:23<00:32,  1.21it/s]

0.4678678462902705


 44%|████▍     | 31/70 [00:24<00:30,  1.26it/s]

0.4681423306465149


 46%|████▌     | 32/70 [00:25<00:29,  1.29it/s]

0.46854952971140545


 47%|████▋     | 33/70 [00:26<00:28,  1.30it/s]

0.46525097224447465


 49%|████▊     | 34/70 [00:26<00:27,  1.33it/s]

0.4674375421471066


 50%|█████     | 35/70 [00:27<00:26,  1.32it/s]

0.46638838781250846


 51%|█████▏    | 36/70 [00:28<00:25,  1.34it/s]

0.4648946291870541


 53%|█████▎    | 37/70 [00:29<00:24,  1.33it/s]

0.46456192930539447


 54%|█████▍    | 38/70 [00:29<00:25,  1.28it/s]

0.4633864528603024


 56%|█████▌    | 39/70 [00:30<00:25,  1.24it/s]

0.46362601386176217


 57%|█████▋    | 40/70 [00:31<00:24,  1.23it/s]

0.46504943900638157


 59%|█████▊    | 41/70 [00:32<00:23,  1.22it/s]

0.46239223413997227


 60%|██████    | 42/70 [00:33<00:23,  1.21it/s]

0.4612979640563329


 61%|██████▏   | 43/70 [00:34<00:21,  1.23it/s]

0.4630233546098073


 63%|██████▎   | 44/70 [00:34<00:20,  1.26it/s]

0.4620046185122596


 64%|██████▍   | 45/70 [00:35<00:19,  1.29it/s]

0.4594910426272286


 66%|██████▌   | 46/70 [00:36<00:18,  1.29it/s]

0.45965754157967037


 67%|██████▋   | 47/70 [00:37<00:17,  1.32it/s]

0.45882368750042385


 69%|██████▊   | 48/70 [00:37<00:16,  1.32it/s]

0.45797232952382827


 70%|███████   | 49/70 [00:38<00:15,  1.32it/s]

0.45647580093807644


 71%|███████▏  | 50/70 [00:39<00:15,  1.31it/s]

0.45679278009467655


 73%|███████▎  | 51/70 [00:40<00:14,  1.32it/s]

0.45628520515229964


 74%|███████▍  | 52/70 [00:40<00:13,  1.31it/s]

0.4541491170724233


 76%|███████▌  | 53/70 [00:41<00:13,  1.30it/s]

0.45487138960096574


 77%|███████▋  | 54/70 [00:42<00:12,  1.31it/s]

0.45586491127808887


 79%|███████▊  | 55/70 [00:43<00:11,  1.32it/s]

0.45369704564412433


 80%|████████  | 56/70 [00:43<00:10,  1.32it/s]

0.4549167818493313


 81%|████████▏ | 57/70 [00:44<00:09,  1.33it/s]

0.4533376412259208


 83%|████████▎ | 58/70 [00:45<00:09,  1.33it/s]

0.45354745123121476


 84%|████████▍ | 59/70 [00:46<00:08,  1.35it/s]

0.4543169488509496


 86%|████████▌ | 60/70 [00:46<00:07,  1.34it/s]

0.45240504211849636


 87%|████████▋ | 61/70 [00:47<00:06,  1.34it/s]

0.4496520360310872


 89%|████████▊ | 62/70 [00:48<00:05,  1.34it/s]

0.4493604269292619


 90%|█████████ | 63/70 [00:49<00:05,  1.35it/s]

0.44895512859026593


 91%|█████████▏| 64/70 [00:49<00:04,  1.34it/s]

0.4510026044315762


 93%|█████████▎| 65/70 [00:50<00:03,  1.33it/s]

0.450659175713857


 94%|█████████▍| 66/70 [00:51<00:03,  1.33it/s]

0.4504403273264567


 96%|█████████▌| 67/70 [00:52<00:02,  1.34it/s]

0.4504547119140625


 97%|█████████▋| 68/70 [00:52<00:01,  1.34it/s]

0.4474717560741637


 99%|█████████▊| 69/70 [00:53<00:00,  1.20it/s]

0.4500509119696087


100%|██████████| 70/70 [00:54<00:00,  1.28it/s]

0.44794706172413296
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [33]:
model_score3

{'Bleu_1': 0.5703853046594727,
 'Bleu_2': 0.4375965298378617,
 'Bleu_3': 0.3551839275213377,
 'Bleu_4': 0.29884445857883435,
 'METEOR': 0.2600124270983407,
 'ROUGE_L': 0.4841837633138598,
 'CIDEr': 1.664026925119711,
 'SPICE': 0.3330616530723322,
 'USC_similarity': 0.561963636121045}

In [34]:
caption_model4, model_score4 = cross_validation(cv[3][0], cv[3][1], 4)    

Split 4:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 4/8333 [00:00<03:46, 36.80it/s]

preprocessed words 2680 ==> 894
The vocabulary size is 895.
806 out of 895 words are found in the pre-trained matrix.
The size of embedding_matrix is (895, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [03:21<00:00, 41.27it/s]
  0%|          | 5/2083 [00:00<00:49, 41.97it/s]


Generating set took: 0:03:21.91


100%|██████████| 2083/2083 [00:50<00:00, 41.25it/s]
  0%|          | 0/70 [00:00<?, ?it/s]


Generating set took: 0:00:50.50
Training...


  1%|▏         | 1/70 [00:00<00:56,  1.21it/s]

6.477281199561225


  3%|▎         | 2/70 [00:01<00:57,  1.19it/s]

4.66958835389879


  4%|▍         | 3/70 [00:02<00:56,  1.19it/s]

4.461337725321452


  6%|▌         | 4/70 [00:03<00:54,  1.20it/s]

4.074729998906453


  7%|▋         | 5/70 [00:04<00:54,  1.20it/s]

3.65806606080797


  9%|▊         | 6/70 [00:05<00:53,  1.19it/s]

3.2780986891852484


 10%|█         | 7/70 [00:05<00:52,  1.20it/s]

2.926118082470364


 11%|█▏        | 8/70 [00:06<00:51,  1.19it/s]

2.6242665184868708


 13%|█▎        | 9/70 [00:07<00:51,  1.19it/s]

2.351670265197754


 14%|█▍        | 10/70 [00:08<00:50,  1.18it/s]

2.1352657874425254


 16%|█▌        | 11/70 [00:09<00:50,  1.17it/s]

1.96304370297326


 17%|█▋        | 12/70 [00:10<00:49,  1.16it/s]

1.822984192106459


 19%|█▊        | 13/70 [00:11<00:48,  1.17it/s]

1.6997302373250325


 20%|██        | 14/70 [00:11<00:47,  1.18it/s]

1.6098021268844604


 21%|██▏       | 15/70 [00:12<00:46,  1.18it/s]

1.5299390819337633


 23%|██▎       | 16/70 [00:13<00:45,  1.18it/s]

1.4347154961691961


 24%|██▍       | 17/70 [00:14<00:44,  1.18it/s]

1.3543651898701985


 26%|██▌       | 18/70 [00:15<00:44,  1.17it/s]

1.2798384295569525


 27%|██▋       | 19/70 [00:16<00:43,  1.18it/s]

1.231212920612759


 29%|██▊       | 20/70 [00:16<00:42,  1.18it/s]

1.1714321109983656


 30%|███       | 21/70 [00:17<00:41,  1.18it/s]

1.1218740608957078


 31%|███▏      | 22/70 [00:18<00:40,  1.20it/s]

1.0837717056274414


 33%|███▎      | 23/70 [00:19<00:38,  1.21it/s]

1.0514464577039082


 34%|███▍      | 24/70 [00:20<00:38,  1.20it/s]

1.0298961798350017


 36%|███▌      | 25/70 [00:21<00:37,  1.20it/s]

0.9944569932089912


 37%|███▋      | 26/70 [00:21<00:36,  1.20it/s]

0.9614509012964036


 39%|███▊      | 27/70 [00:22<00:35,  1.20it/s]

0.9295752975675795


 40%|████      | 28/70 [00:23<00:34,  1.20it/s]

0.8978407714102004


 41%|████▏     | 29/70 [00:24<00:33,  1.21it/s]

0.8758599294556512


 43%|████▎     | 30/70 [00:25<00:33,  1.21it/s]

0.8531602091259427


 44%|████▍     | 31/70 [00:26<00:32,  1.21it/s]

0.8360315892431471


 46%|████▌     | 32/70 [00:26<00:31,  1.22it/s]

0.8125905394554138


 47%|████▋     | 33/70 [00:27<00:29,  1.24it/s]

0.7866352597872416


 49%|████▊     | 34/70 [00:28<00:28,  1.28it/s]

0.7763969831996493


 50%|█████     | 35/70 [00:29<00:27,  1.29it/s]

0.7607469591829512


 51%|█████▏    | 36/70 [00:29<00:25,  1.31it/s]

0.7572694023450216


 53%|█████▎    | 37/70 [00:30<00:24,  1.33it/s]

0.7533601654900445


 54%|█████▍    | 38/70 [00:31<00:25,  1.28it/s]

0.728565494219462


 56%|█████▌    | 39/70 [00:32<00:24,  1.24it/s]

0.7198717296123505


 57%|█████▋    | 40/70 [00:33<00:24,  1.23it/s]

0.7068492968877157


 59%|█████▊    | 41/70 [00:34<00:24,  1.19it/s]

0.7254827287462022


 60%|██████    | 42/70 [00:34<00:23,  1.17it/s]

0.6893267962667677


 61%|██████▏   | 43/70 [00:35<00:23,  1.17it/s]

0.6686415639188554


 63%|██████▎   | 44/70 [00:36<00:24,  1.04it/s]

0.6564028196864657


 64%|██████▍   | 45/70 [00:37<00:22,  1.12it/s]

0.6506538622909122


 66%|██████▌   | 46/70 [00:38<00:20,  1.19it/s]

0.6540800796614753


 67%|██████▋   | 47/70 [00:39<00:18,  1.24it/s]

0.6360844572385153


 69%|██████▊   | 48/70 [00:39<00:17,  1.27it/s]

0.6257755127218034


 70%|███████   | 49/70 [00:40<00:16,  1.30it/s]

0.6303426457775964


 71%|███████▏  | 50/70 [00:41<00:15,  1.33it/s]

0.6257703834109836


 73%|███████▎  | 51/70 [00:42<00:14,  1.33it/s]

0.6172867947154574


 74%|███████▍  | 52/70 [00:42<00:13,  1.35it/s]

0.6173058350880941


 76%|███████▌  | 53/70 [00:43<00:12,  1.36it/s]

0.6136625011761984


 77%|███████▋  | 54/70 [00:44<00:11,  1.37it/s]

0.596932484043969


 79%|███████▊  | 55/70 [00:44<00:10,  1.37it/s]

0.5792014929983351


 80%|████████  | 56/70 [00:45<00:10,  1.36it/s]

0.5859838128089905


 81%|████████▏ | 57/70 [00:46<00:09,  1.36it/s]

0.5912266506089104


 83%|████████▎ | 58/70 [00:47<00:08,  1.35it/s]

0.571879463063346


 84%|████████▍ | 59/70 [00:47<00:08,  1.35it/s]

0.5612954894701639


 86%|████████▌ | 60/70 [00:48<00:07,  1.35it/s]

0.5522996650801765


 87%|████████▋ | 61/70 [00:49<00:06,  1.36it/s]

0.5368847813871171


 89%|████████▊ | 62/70 [00:50<00:05,  1.36it/s]

0.5170839428901672


 90%|█████████ | 63/70 [00:50<00:05,  1.33it/s]

0.5192706750498878


 91%|█████████▏| 64/70 [00:51<00:04,  1.34it/s]

0.5133733981185489


 93%|█████████▎| 65/70 [00:52<00:03,  1.32it/s]

0.5099682244989607


 94%|█████████▍| 66/70 [00:53<00:03,  1.27it/s]

0.5105262266265022


 96%|█████████▌| 67/70 [00:54<00:02,  1.23it/s]

0.4967234234015147


 97%|█████████▋| 68/70 [00:55<00:01,  1.22it/s]

0.48367228110631305


 99%|█████████▊| 69/70 [00:55<00:00,  1.22it/s]

0.46515635318226284


100%|██████████| 70/70 [00:56<00:00,  1.24it/s]
  0%|          | 0/70 [00:00<?, ?it/s]

0.4625831875536177


  1%|▏         | 1/70 [00:00<00:59,  1.17it/s]

0.4516700539324019


  3%|▎         | 2/70 [00:01<00:57,  1.17it/s]

0.44124309718608856


  4%|▍         | 3/70 [00:02<00:56,  1.18it/s]

0.4321264839834637


  6%|▌         | 4/70 [00:03<00:55,  1.18it/s]

0.4234932280249066


  7%|▋         | 5/70 [00:04<00:55,  1.17it/s]

0.41954772505495286


  9%|▊         | 6/70 [00:05<00:55,  1.16it/s]

0.41950522197617424


 10%|█         | 7/70 [00:05<00:54,  1.17it/s]

0.4146220601267285


 11%|█▏        | 8/70 [00:06<00:52,  1.17it/s]

0.4131338993708293


 13%|█▎        | 9/70 [00:07<00:51,  1.19it/s]

0.40849339299731785


 14%|█▍        | 10/70 [00:08<00:51,  1.17it/s]

0.4085263858238856


 16%|█▌        | 11/70 [00:09<00:49,  1.18it/s]

0.40743467377291787


 17%|█▋        | 12/70 [00:10<00:49,  1.17it/s]

0.40605313579241437


 19%|█▊        | 13/70 [00:11<00:49,  1.15it/s]

0.40541859964529675


 20%|██        | 14/70 [00:11<00:47,  1.19it/s]

0.40599556598398423


 21%|██▏       | 15/70 [00:12<00:44,  1.24it/s]

0.4021446290943358


 23%|██▎       | 16/70 [00:13<00:42,  1.27it/s]

0.4037004742357466


 24%|██▍       | 17/70 [00:14<00:40,  1.31it/s]

0.40098517802026534


 26%|██▌       | 18/70 [00:14<00:38,  1.34it/s]

0.39903075496355694


 27%|██▋       | 19/70 [00:15<00:37,  1.35it/s]

0.39910387496153515


 29%|██▊       | 20/70 [00:16<00:36,  1.36it/s]

0.39866071608331466


 30%|███       | 21/70 [00:16<00:35,  1.37it/s]

0.3994673109716839


 31%|███▏      | 22/70 [00:17<00:35,  1.36it/s]

0.39726430508825517


 33%|███▎      | 23/70 [00:18<00:34,  1.36it/s]

0.3979467517799801


 34%|███▍      | 24/70 [00:19<00:33,  1.37it/s]

0.39778899153073627


 36%|███▌      | 25/70 [00:19<00:32,  1.37it/s]

0.39542681806617314


 37%|███▋      | 26/70 [00:20<00:31,  1.39it/s]

0.39640479783217114


 39%|███▊      | 27/70 [00:21<00:31,  1.37it/s]

0.3943109412988027


 40%|████      | 28/70 [00:22<00:30,  1.38it/s]

0.39435770776536727


 41%|████▏     | 29/70 [00:22<00:29,  1.38it/s]

0.3945995834138658


 43%|████▎     | 30/70 [00:23<00:29,  1.38it/s]

0.39132412605815464


 44%|████▍     | 31/70 [00:24<00:28,  1.39it/s]

0.39337324599425


 46%|████▌     | 32/70 [00:24<00:27,  1.39it/s]

0.39270060261090595


 47%|████▋     | 33/70 [00:25<00:26,  1.39it/s]

0.393835476703114


 49%|████▊     | 34/70 [00:26<00:26,  1.38it/s]

0.392886146903038


 50%|█████     | 35/70 [00:27<00:25,  1.37it/s]

0.3897714051935408


 51%|█████▏    | 36/70 [00:27<00:24,  1.37it/s]

0.3902670558955934


 53%|█████▎    | 37/70 [00:28<00:24,  1.37it/s]

0.3915167699257533


 54%|█████▍    | 38/70 [00:29<00:23,  1.37it/s]

0.39170022971100277


 56%|█████▌    | 39/70 [00:30<00:22,  1.38it/s]

0.3885891768667433


 57%|█████▋    | 40/70 [00:30<00:21,  1.37it/s]

0.38747650053766036


 59%|█████▊    | 41/70 [00:31<00:21,  1.37it/s]

0.3886298007435269


 60%|██████    | 42/70 [00:32<00:20,  1.37it/s]

0.38901952074633706


 61%|██████▏   | 43/70 [00:32<00:19,  1.37it/s]

0.38811126682493424


 63%|██████▎   | 44/70 [00:33<00:18,  1.38it/s]

0.38647430307335323


 64%|██████▍   | 45/70 [00:34<00:20,  1.20it/s]

0.3874223166041904


 66%|██████▌   | 46/70 [00:35<00:18,  1.26it/s]

0.3876214010847939


 67%|██████▋   | 47/70 [00:36<00:17,  1.29it/s]

0.3848078813817766


 69%|██████▊   | 48/70 [00:36<00:16,  1.33it/s]

0.3855394141541587


 70%|███████   | 49/70 [00:37<00:15,  1.34it/s]

0.3864568488465415


 71%|███████▏  | 50/70 [00:38<00:14,  1.36it/s]

0.3854215294122696


 73%|███████▎  | 51/70 [00:39<00:13,  1.37it/s]

0.3861673954460356


 74%|███████▍  | 52/70 [00:39<00:13,  1.37it/s]

0.3847578681177563


 76%|███████▌  | 53/70 [00:40<00:12,  1.37it/s]

0.3837828007009294


 77%|███████▋  | 54/70 [00:41<00:11,  1.37it/s]

0.38219143284691703


 79%|███████▊  | 55/70 [00:42<00:11,  1.30it/s]

0.3823770334323247


 80%|████████  | 56/70 [00:42<00:11,  1.27it/s]

0.3809751388099458


 81%|████████▏ | 57/70 [00:43<00:10,  1.24it/s]

0.3839668383200963


 83%|████████▎ | 58/70 [00:44<00:10,  1.20it/s]

0.3827338715394338


 84%|████████▍ | 59/70 [00:45<00:09,  1.19it/s]

0.38275642196337384


 86%|████████▌ | 60/70 [00:46<00:08,  1.19it/s]

0.3818288180563185


 87%|████████▋ | 61/70 [00:47<00:07,  1.19it/s]

0.38206502464082504


 89%|████████▊ | 62/70 [00:48<00:06,  1.20it/s]

0.3823491351472007


 90%|█████████ | 63/70 [00:48<00:05,  1.20it/s]

0.3820478038655387


 91%|█████████▏| 64/70 [00:49<00:05,  1.19it/s]

0.38162628147337174


 93%|█████████▎| 65/70 [00:50<00:04,  1.14it/s]

0.38049710128042435


 94%|█████████▍| 66/70 [00:51<00:03,  1.17it/s]

0.3789464301533169


 96%|█████████▌| 67/70 [00:52<00:02,  1.23it/s]

0.3795972764492035


 97%|█████████▋| 68/70 [00:52<00:01,  1.27it/s]

0.3798184742530187


 99%|█████████▊| 69/70 [00:53<00:00,  1.30it/s]

0.37852104836040074


100%|██████████| 70/70 [00:54<00:00,  1.29it/s]

0.3805327167113622
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [35]:
model_score4

{'Bleu_1': 0.558752364612179,
 'Bleu_2': 0.4273882495608079,
 'Bleu_3': 0.3464934746358031,
 'Bleu_4': 0.290893255209788,
 'METEOR': 0.258355399398307,
 'ROUGE_L': 0.4800528014335929,
 'CIDEr': 1.5766341092350828,
 'SPICE': 0.3275709420543527,
 'USC_similarity': 0.553531852808785}

In [36]:
caption_model5, model_score5 = cross_validation(cv[4][0], cv[4][1], 5)    

Split 5:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 4/8333 [00:00<04:00, 34.66it/s]

preprocessed words 2657 ==> 905
The vocabulary size is 906.
815 out of 906 words are found in the pre-trained matrix.
The size of embedding_matrix is (906, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [03:21<00:00, 41.33it/s]
  0%|          | 5/2083 [00:00<00:49, 41.83it/s]


Generating set took: 0:03:21.61


100%|██████████| 2083/2083 [00:50<00:00, 41.38it/s]
  0%|          | 0/70 [00:00<?, ?it/s]


Generating set took: 0:00:50.34
Training...


  1%|▏         | 1/70 [00:00<00:50,  1.37it/s]

6.2040150960286455


  3%|▎         | 2/70 [00:01<00:49,  1.37it/s]

4.833920796712239


  4%|▍         | 3/70 [00:02<00:48,  1.37it/s]

4.337032794952393


  6%|▌         | 4/70 [00:02<00:48,  1.37it/s]

3.7826893859439426


  7%|▋         | 5/70 [00:03<00:47,  1.38it/s]

3.2374730110168457


  9%|▊         | 6/70 [00:04<00:46,  1.38it/s]

2.8586499161190457


 10%|█         | 7/70 [00:05<00:45,  1.39it/s]

2.5533256000942655


 11%|█▏        | 8/70 [00:05<00:44,  1.39it/s]

2.2848768764071994


 13%|█▎        | 9/70 [00:06<00:43,  1.39it/s]

2.0772148105833264


 14%|█▍        | 10/70 [00:07<00:43,  1.38it/s]

1.9078042109807332


 16%|█▌        | 11/70 [00:07<00:42,  1.38it/s]

1.7473227447933621


 17%|█▋        | 12/70 [00:08<00:42,  1.38it/s]

1.6175494061575995


 19%|█▊        | 13/70 [00:09<00:41,  1.38it/s]

1.5209975507524278


 20%|██        | 14/70 [00:10<00:40,  1.38it/s]

1.4423127174377441


 21%|██▏       | 15/70 [00:10<00:39,  1.38it/s]

1.3567830589082506


 23%|██▎       | 16/70 [00:11<00:39,  1.38it/s]

1.3179166979259915


 24%|██▍       | 17/70 [00:12<00:38,  1.38it/s]

1.2375210457377963


 26%|██▌       | 18/70 [00:13<00:45,  1.15it/s]

1.1655236879984539


 27%|██▋       | 19/70 [00:14<00:42,  1.21it/s]

1.1191964149475098


 29%|██▊       | 20/70 [00:14<00:39,  1.25it/s]

1.054874214861128


 30%|███       | 21/70 [00:15<00:38,  1.28it/s]

1.015972865952386


 31%|███▏      | 22/70 [00:16<00:36,  1.32it/s]

0.9839590986569723


 33%|███▎      | 23/70 [00:17<00:35,  1.33it/s]

0.9671076734860738


 34%|███▍      | 24/70 [00:17<00:34,  1.35it/s]

0.9437166187498305


 36%|███▌      | 25/70 [00:18<00:33,  1.35it/s]

0.917197479142083


 37%|███▋      | 26/70 [00:19<00:32,  1.35it/s]

0.8759358723958334


 39%|███▊      | 27/70 [00:20<00:32,  1.34it/s]

0.8498010767830743


 40%|████      | 28/70 [00:20<00:31,  1.35it/s]

0.8351727525393168


 41%|████▏     | 29/70 [00:21<00:29,  1.37it/s]

0.812548041343689


 43%|████▎     | 30/70 [00:22<00:29,  1.36it/s]

0.7770011557473077


 44%|████▍     | 31/70 [00:23<00:29,  1.34it/s]

0.7469434440135956


 46%|████▌     | 32/70 [00:23<00:27,  1.36it/s]

0.7229210701253679


 47%|████▋     | 33/70 [00:24<00:27,  1.36it/s]

0.7126219305727217


 49%|████▊     | 34/70 [00:25<00:26,  1.37it/s]

0.6948916978306241


 50%|█████     | 35/70 [00:25<00:25,  1.35it/s]

0.6908977561526828


 51%|█████▏    | 36/70 [00:26<00:25,  1.31it/s]

0.682934625281228


 53%|█████▎    | 37/70 [00:27<00:25,  1.27it/s]

0.6763830747869279


 54%|█████▍    | 38/70 [00:28<00:25,  1.26it/s]

0.6696588397026062


 56%|█████▌    | 39/70 [00:29<00:24,  1.26it/s]

0.6513053708606296


 57%|█████▋    | 40/70 [00:30<00:24,  1.25it/s]

0.6427034404542711


 59%|█████▊    | 41/70 [00:30<00:23,  1.24it/s]

0.658693876531389


 60%|██████    | 42/70 [00:31<00:22,  1.23it/s]

0.6694508956538306


 61%|██████▏   | 43/70 [00:32<00:22,  1.21it/s]

0.6796669363975525


 63%|██████▎   | 44/70 [00:33<00:21,  1.20it/s]

0.6526889635456933


 64%|██████▍   | 45/70 [00:34<00:20,  1.20it/s]

0.6415456831455231


 66%|██████▌   | 46/70 [00:35<00:20,  1.19it/s]

0.633988618850708


 67%|██████▋   | 47/70 [00:35<00:19,  1.20it/s]

0.6417454249329038


 69%|██████▊   | 48/70 [00:36<00:18,  1.21it/s]

0.6130623386965858


 70%|███████   | 49/70 [00:37<00:17,  1.21it/s]

0.5791964928309122


 71%|███████▏  | 50/70 [00:38<00:16,  1.21it/s]

0.5555507838726044


 73%|███████▎  | 51/70 [00:39<00:15,  1.22it/s]

0.5393195350964864


 74%|███████▍  | 52/70 [00:39<00:14,  1.25it/s]

0.5385702351729075


 76%|███████▌  | 53/70 [00:40<00:13,  1.27it/s]

0.5412505500846438


 77%|███████▋  | 54/70 [00:41<00:12,  1.29it/s]

0.5473480721314748


 79%|███████▊  | 55/70 [00:42<00:11,  1.32it/s]

0.54570957687166


 80%|████████  | 56/70 [00:42<00:10,  1.33it/s]

0.5262255933549669


 81%|████████▏ | 57/70 [00:43<00:09,  1.33it/s]

0.5149770478407542


 83%|████████▎ | 58/70 [00:44<00:09,  1.33it/s]

0.5014650556776259


 84%|████████▍ | 59/70 [00:45<00:08,  1.33it/s]

0.4894757999314202


 86%|████████▌ | 60/70 [00:45<00:07,  1.33it/s]

0.4875323673089345


 87%|████████▋ | 61/70 [00:46<00:06,  1.33it/s]

0.48681361145443386


 89%|████████▊ | 62/70 [00:47<00:06,  1.33it/s]

0.479834861225552


 90%|█████████ | 63/70 [00:48<00:05,  1.33it/s]

0.4708118736743927


 91%|█████████▏| 64/70 [00:48<00:04,  1.33it/s]

0.4677945607238346


 93%|█████████▎| 65/70 [00:49<00:03,  1.33it/s]

0.4650390280617608


 94%|█████████▍| 66/70 [00:50<00:03,  1.33it/s]

0.4483097228738997


 96%|█████████▌| 67/70 [00:51<00:02,  1.35it/s]

0.4411499434047275


 97%|█████████▋| 68/70 [00:51<00:01,  1.36it/s]

0.42758140630192226


 99%|█████████▊| 69/70 [00:52<00:00,  1.36it/s]

0.4183428982893626


100%|██████████| 70/70 [00:53<00:00,  1.31it/s]
  0%|          | 0/70 [00:00<?, ?it/s]

0.4088704569472207


  1%|▏         | 1/70 [00:00<00:53,  1.30it/s]

0.39334385262595284


  3%|▎         | 2/70 [00:01<00:52,  1.29it/s]

0.38795631792810226


  4%|▍         | 3/70 [00:02<00:51,  1.31it/s]

0.3797433177630107


  6%|▌         | 4/70 [00:03<00:50,  1.32it/s]

0.3764478961626689


  7%|▋         | 5/70 [00:03<00:49,  1.32it/s]

0.3723942389090856


  9%|▊         | 6/70 [00:04<00:49,  1.30it/s]

0.3724220014280743


 10%|█         | 7/70 [00:05<00:49,  1.26it/s]

0.36805915501382613


 11%|█▏        | 8/70 [00:06<00:50,  1.23it/s]

0.36783203813764787


 13%|█▎        | 9/70 [00:07<00:49,  1.22it/s]

0.36456314225991565


 14%|█▍        | 10/70 [00:07<00:49,  1.21it/s]

0.3638768345117569


 16%|█▌        | 11/70 [00:08<00:49,  1.20it/s]

0.3633192777633667


 17%|█▋        | 12/70 [00:09<00:48,  1.19it/s]

0.3632366011540095


 19%|█▊        | 13/70 [00:10<00:48,  1.19it/s]

0.36095163226127625


 20%|██        | 14/70 [00:11<00:47,  1.19it/s]

0.3607785536183251


 21%|██▏       | 15/70 [00:12<00:45,  1.20it/s]

0.35945892168415916


 23%|██▎       | 16/70 [00:12<00:44,  1.23it/s]

0.35980744494332206


 24%|██▍       | 17/70 [00:13<00:42,  1.25it/s]

0.3582012885146671


 26%|██▌       | 18/70 [00:14<00:47,  1.09it/s]

0.3582020385397805


 27%|██▋       | 19/70 [00:15<00:44,  1.15it/s]

0.35679354435867733


 29%|██▊       | 20/70 [00:16<00:41,  1.20it/s]

0.3542233043246799


 30%|███       | 21/70 [00:17<00:39,  1.23it/s]

0.35518409311771393


 31%|███▏      | 22/70 [00:17<00:38,  1.25it/s]

0.35812195804384017


 33%|███▎      | 23/70 [00:18<00:36,  1.28it/s]

0.35478317075305515


 34%|███▍      | 24/70 [00:19<00:35,  1.30it/s]

0.3551860633823607


 36%|███▌      | 25/70 [00:20<00:34,  1.31it/s]

0.3533464041021135


 37%|███▋      | 26/70 [00:20<00:33,  1.31it/s]

0.3552917821539773


 39%|███▊      | 27/70 [00:21<00:32,  1.32it/s]

0.35386456549167633


 40%|████      | 28/70 [00:22<00:31,  1.32it/s]

0.35445055696699357


 41%|████▏     | 29/70 [00:23<00:31,  1.32it/s]

0.35260841250419617


 43%|████▎     | 30/70 [00:23<00:30,  1.31it/s]

0.352515525288052


 44%|████▍     | 31/70 [00:24<00:30,  1.29it/s]

0.35126659605238175


 46%|████▌     | 32/70 [00:25<00:29,  1.31it/s]

0.3511311726437675


 47%|████▋     | 33/70 [00:26<00:28,  1.30it/s]

0.3499982886844211


 49%|████▊     | 34/70 [00:27<00:27,  1.29it/s]

0.3488519208298789


 50%|█████     | 35/70 [00:27<00:27,  1.29it/s]

0.35038458969857955


 51%|█████▏    | 36/70 [00:28<00:26,  1.30it/s]

0.34958142538865405


 53%|█████▎    | 37/70 [00:29<00:25,  1.31it/s]

0.35157075855467057


 54%|█████▍    | 38/70 [00:30<00:24,  1.31it/s]

0.34971055057313705


 56%|█████▌    | 39/70 [00:30<00:23,  1.31it/s]

0.3494356870651245


 57%|█████▋    | 40/70 [00:31<00:22,  1.31it/s]

0.3496002670791414


 59%|█████▊    | 41/70 [00:32<00:21,  1.33it/s]

0.35044454203711617


 60%|██████    | 42/70 [00:33<00:21,  1.33it/s]

0.3475451601876153


 61%|██████▏   | 43/70 [00:33<00:20,  1.35it/s]

0.3485673765341441


 63%|██████▎   | 44/70 [00:34<00:19,  1.34it/s]

0.34760718378755784


 64%|██████▍   | 45/70 [00:35<00:18,  1.34it/s]

0.3466672930452559


 66%|██████▌   | 46/70 [00:36<00:17,  1.34it/s]

0.345643056763543


 67%|██████▋   | 47/70 [00:36<00:17,  1.35it/s]

0.3471103658278783


 69%|██████▊   | 48/70 [00:37<00:16,  1.36it/s]

0.3467849161889818


 70%|███████   | 49/70 [00:38<00:15,  1.37it/s]

0.3465513437986374


 71%|███████▏  | 50/70 [00:39<00:14,  1.35it/s]

0.34578320052888656


 73%|███████▎  | 51/70 [00:39<00:13,  1.36it/s]

0.344096827838156


 74%|███████▍  | 52/70 [00:40<00:13,  1.35it/s]

0.34440439608361983


 76%|███████▌  | 53/70 [00:41<00:12,  1.35it/s]

0.34324197636710274


 77%|███████▋  | 54/70 [00:42<00:11,  1.34it/s]

0.3447016063663695


 79%|███████▊  | 55/70 [00:42<00:11,  1.33it/s]

0.34486358364423114


 80%|████████  | 56/70 [00:43<00:10,  1.33it/s]

0.3436683714389801


 81%|████████▏ | 57/70 [00:44<00:09,  1.33it/s]

0.34463555614153546


 83%|████████▎ | 58/70 [00:45<00:08,  1.33it/s]

0.34384795857800377


 84%|████████▍ | 59/70 [00:45<00:08,  1.34it/s]

0.34335721532503766


 86%|████████▌ | 60/70 [00:46<00:07,  1.34it/s]

0.3450727164745331


 87%|████████▋ | 61/70 [00:47<00:06,  1.34it/s]

0.3434054106473923


 89%|████████▊ | 62/70 [00:48<00:05,  1.34it/s]

0.3431519882546531


 90%|█████████ | 63/70 [00:48<00:05,  1.34it/s]

0.34176253113481736


 91%|█████████▏| 64/70 [00:49<00:04,  1.34it/s]

0.34245575136608547


 93%|█████████▎| 65/70 [00:50<00:03,  1.34it/s]

0.3406609743833542


 94%|█████████▍| 66/70 [00:50<00:02,  1.35it/s]

0.3421381662289302


 96%|█████████▌| 67/70 [00:51<00:02,  1.35it/s]

0.34202241400877637


 97%|█████████▋| 68/70 [00:52<00:01,  1.35it/s]

0.3416592611206902


 99%|█████████▊| 69/70 [00:53<00:00,  1.35it/s]

0.34234125912189484


100%|██████████| 70/70 [00:53<00:00,  1.30it/s]

0.34015385144286686
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [37]:
model_score5

{'Bleu_1': 0.5577149587750042,
 'Bleu_2': 0.42324342672264254,
 'Bleu_3': 0.34251405439806676,
 'Bleu_4': 0.2862181422837304,
 'METEOR': 0.2462238047452443,
 'ROUGE_L': 0.4649459342312564,
 'CIDEr': 1.5219199940227421,
 'SPICE': 0.3115496001173147,
 'USC_similarity': 0.5410385687746541}

In [38]:
model_scores = defaultdict(list)
for scores in [model_score1, model_score2, model_score3, model_score4, model_score5]:
    for key, value in scores.items():
        model_scores[key].append(value)

In [39]:
model_scores

defaultdict(list,
            {'Bleu_1': [0.5636411749139736,
              0.5619034694892766,
              0.5703853046594727,
              0.558752364612179,
              0.5577149587750042],
             'Bleu_2': [0.4289017507235894,
              0.4273455743118685,
              0.4375965298378617,
              0.4273882495608079,
              0.42324342672264254],
             'Bleu_3': [0.34622082479232946,
              0.34568028068686524,
              0.3551839275213377,
              0.3464934746358031,
              0.34251405439806676],
             'Bleu_4': [0.2899296757283083,
              0.2897783365239755,
              0.29884445857883435,
              0.290893255209788,
              0.2862181422837304],
             'METEOR': [0.2539452072748442,
              0.24975352608753387,
              0.2600124270983407,
              0.258355399398307,
              0.2462238047452443],
             'ROUGE_L': [0.4779333316068181,
              0.4656078656435

In [40]:
tag = '9.3.1'
with open(f'{root_captioning}/fz_notebooks/cv_n{tag}.json', 'w') as fp:
    json.dump(model_scores, fp)