## Image Captioning with Pytorch

The following contents are modified from MDS DSCI 575 lecture 8 demo

In [1]:
import os, sys, json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.model_selection import KFold

START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True


In [2]:
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"
        
if AWS:
    root_captioning = "../../s3"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [5]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [6]:
# get img path and caption list
# # only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
test_paths, test_descriptions, max_length_test = get_img_info('valid')
max_length = max(max_length_train, max_length_test)



In [7]:
all_paths = train_paths.copy()
all_paths.extend(test_paths.copy())
all_paths = np.array(all_paths)

all_descriptions = train_descriptions.copy()
all_descriptions.extend(test_descriptions.copy())
all_descriptions = np.array(all_descriptions)

captions = all_descriptions.copy()
max_length_all = max(max_length_train, max_length_test)
max_length = max_length_all + 2
      
lex = set()
for sen in all_descriptions:
    [lex.update(d.split()) for d in sen]
    
# add a start and stop token at the beginning/end
for v in all_descriptions:
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'
        
print(f'There are {len(all_paths)} images') 
print(f'There are {len(lex)} unique words (vocab)')
print(f'The maximum length of captions with start and stop token is {max_length}.')


There are 10416 images
There are 2912 unique words (vocab)
The maximum length of captions with start and stop token is 36.


In [8]:
all_paths[-1]

'../../s3/valid/rsicd_park_33.jpg'

In [9]:
all_descriptions[-1]

array(['startseq a vast artificial lake was built in the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq'],
      dtype='<U184')

### Loading Glove Embeddings

In [10]:
embeddings_index = {} 
path = os.path.join(root_captioning, 'glove.6B.200d.txt') if AWS\
else os.path.join(root_captioning, 'raw', 'glove.6B.200d.txt')

f = open(
    path, 
    encoding="utf-8"
)

for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print(f'Found {len(embeddings_index)} word vectors.')

400000it [00:22, 17516.41it/s]

Found 400000 word vectors.





In [11]:
def get_vocab(descriptions, word_count_threshold=10):

    captions = []
    for val in descriptions:
        for cap in val:
            captions.append(cap)
    print(f'There are {len(captions)} captions')
    
    word_counts = {}
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
    return vocab

def get_word_dict(vocab):
    
    idxtoword = {}
    wordtoidx = {}

    ix = 1
    for w in vocab:
        wordtoidx[w] = ix
        idxtoword[ix] = w
        ix += 1

    return idxtoword, wordtoidx

def get_vocab_size(idxtoword):
    
    print(f'The vocabulary size is {len(idxtoword) + 1}.')
    return len(idxtoword) + 1


def get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx):

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count =0

    for word, i in wordtoidx.items():

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            count += 1
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
            
    print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')            
    print(f'The size of embedding_matrix is {embedding_matrix.shape}')
    return embedding_matrix

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
class CNNModel(nn.Module):

    def __init__(self, cnn_type, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        if cnn_type == 'vgg16':
            self.model = models.vgg16(pretrained=pretrained)

            # remove the last two layers in classifier
            self.model.classifier = nn.Sequential(
              *list(self.model.classifier.children())[:-2]
            )
            self.input_size = 224     

        # inception v3 expects (299, 299) sized images
        elif cnn_type == 'inception_v3':
            self.model = models.inception_v3(pretrained=pretrained)
            # remove the classification layer
            self.model.fc = nn.Identity()

            # turn off auxiliary output
            self.model.aux_logits = False
            self.input_size = 299

        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
            # set the model to evaluation model
            self.model.eval()

        return self.model(img_input)

In [14]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
              'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train

        self.dropout = nn.Dropout(p=0.5)

        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
 

    def forward(self, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # embed the captions
        embedding = self.dropout(self.embedding(captions))

        outputs, (h, c) = self.lstm(embedding)

        return outputs, (h, c)



In [15]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        feature_size: int
            the number of features in the image matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        if cnn_type == 'vgg16':
            self.feature_size = 4096
        elif cnn_type == 'inception_v3':
            self.feature_size = 2048
        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")  

        self.decoder = RNNModel(
            vocab_size, 
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )
        
        self.dropout = nn.Dropout(p=0.5)
        self.dense1 = nn.Linear(self.feature_size, hidden_size) 
        self.relu1 = nn.ReLU()
          
        self.dense2 = nn.Linear(hidden_size, hidden_size) 
        self.relu2 = nn.ReLU()
        self.dense3 = nn.Linear(hidden_size, vocab_size) 

    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor
            the image feature matrix
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        img_features =\
        self.relu1(
            self.dense1(
                self.dropout(
                    img_features
                )
            )
        )

        decoder_out, _ = self.decoder(captions)

        # add up decoder outputs and image features
        outputs =\
        self.dense3(
            self.relu2(
                self.dense2(
                    decoder_out.add(
                        (img_features.view(img_features.size(0), 1, -1))\
                        .repeat(1, decoder_out.size(1), 1)
                    )
                )
            )
        )

        return outputs

### Train the Neural Network

In [16]:
def train(model, iterator, optimizer, criterion, clip, vocab_size):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs = model(
            img_features.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        )
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [17]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
        ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img = self.imgs[idx]
        img_features, captions = [], []
        for desc in self.descriptions[idx]:
            # convert each word into a list of sequences.
            seq = [self.wordtoidx[word] for word in desc.split(' ')
                  if word in self.wordtoidx]
            # pad the sequence with 0 on the right side
            in_seq = np.pad(
                seq, 
                (0, max_length - len(seq)),
                mode='constant',
                constant_values=(0, 0)
                )

            img_features.append(img)
            captions.append(in_seq)
    
        return (img_features, captions)


In [18]:
def my_collate(batch):
    """
    Processes the batch to return from the dataloader

    Parameters:
    -----------
    batch: tuple
      a batch from the Dataset

    Return:
    --------
    list
        [image feature matrix, captions, the length of each caption]
    """  

    img_features = [item[0] for item in batch]
    captions = [item[1] for item in batch]

    img_features = torch.FloatTensor(list(chain(*img_features)))
    captions = torch.LongTensor(list(chain(*captions)))

    return [img_features, captions]

In [19]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [20]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape)
    # Call model to extract the smaller feature set for the image.
    x = model(img.to(device), False) 
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.squeeze(x)
    return x

In [21]:
def extract_img_features(img_paths, model):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    start = time()
    img_features = []

    for image_path in img_paths:
        img_features.append(
            encode_image(model, image_path).cpu().data.numpy()
    )

    print(f"\nGenerating set took: {hms_string(time()-start)}")

    return img_features

In [22]:
def get_train_test(
    encoder,
    train_paths,
    test_paths,
    cnn_type='inception_v3',
):

    train_img_features = extract_img_features(
        train_paths,
        encoder
    )

    test_img_features = extract_img_features(
        test_paths,
        encoder
    )
    return train_img_features, test_img_features

def get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
):
    train_dataset = SampleDataset(
        train_descriptions,
        train_img_features,
        wordtoidx,
        max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size,
        collate_fn=my_collate
    )
    
    return train_loader

def train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    cnn_type='inception_v3',
    hidden_size=256,
):

    caption_model = CaptionModel(
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=hidden_size,
        embedding_matrix=embedding_matrix, 
        embedding_train=True
    )

    init_weights(
        caption_model,
        embedding_pretrained=True
    )

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(
        caption_model.parameters(), 
        lr=0.01
    )

    clip = 1
    start = time()

    for i in tqdm(range(EPOCHS * 6)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)

    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 6)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)
    return caption_model

In [23]:
def generateCaption(
    model, 
    img_features,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    in_text = START

    for i in range(max_length):

        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = np.pad(sequence, (0, max_length - len(sequence)),
                          mode='constant', constant_values=(0, 0))
        model.eval()
        yhat = model(
            torch.FloatTensor(img_features)\
            .view(-1, model.feature_size).to(device),
            torch.LongTensor(sequence).view(-1, max_length).to(device)
        )

        yhat = yhat.view(-1, vocab_size).argmax(1)
        word = idxtoword[yhat.cpu().data.numpy()[i]]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1 : -1]
    final = ' '.join(final)
    return final

### Evaluation

In [24]:
sys.path.append('../scr/evaluation/')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.usc_sim.usc_sim import usc_sim
import subprocess


def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    """
    # download stanford nlp library
    subprocess.call(['../scr/evaluation/get_stanford_models.sh'])
    
    # format the inputs
    gts = {}
    res = {}

    for imgId in range(len(ref_data)):
        caption_list_sel = []
        for i in range(5):
            lst = {}
            lst['caption'] = ref_data[imgId][i]
            lst['image_id'] = imgId
            lst['id'] = i
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        res[imgId] = [{'caption': results[imgId]}]
        
    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts  = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    
    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),  
        ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...'%(scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
        else:
            score_dict[method] = score
            
    return score_dict


In [25]:
def evaluate_results(
    test_img_features, 
    model,
    ref,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    # generate results
    print('Generating captions...')
    results = {}
    for n in range(len(test_img_features)):
        img_features = test_img_features[n]
        generated = generateCaption(
            model, 
            img_features,
            max_length,
            vocab_size,
            wordtoidx,
            idxtoword
        )
        results[n] = generated
        
    model_score = eval_model(ref, results)

    return model_score

### Cross validation

In [None]:
cnn_type = 'inception_v3'
encoder = CNNModel(cnn_type, pretrained=True)
encoder.to(device)

In [27]:
def cross_validation(train_index, test_index, count):
    print('=' * 60)
    print(f'Split {count}:')
    print(f'Splitting data...')
    
    train_paths, test_paths = all_paths[train_index], all_paths[test_index]
    train_descriptions, test_descriptions = all_descriptions[train_index], all_descriptions[test_index]
    print(f'{len(train_paths)} images for training and {len(test_paths)} images for testing.')
    
    vocab = get_vocab(train_descriptions, word_count_threshold=10)
    idxtoword, wordtoidx = get_word_dict(vocab)
    vocab_size = get_vocab_size(idxtoword)
    embedding_dim = 200
    embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

    print(f'Preparing dataloader...')
    train_img_features, test_img_features = get_train_test(encoder, train_paths, test_paths)

    train_loader = get_train_dataloader(
        train_descriptions, 
        train_img_features,
        wordtoidx,
        max_length,
        batch_size=200
    )

    print(f'Training...')
    caption_model = train_model(
        train_loader,
        vocab_size,
        embedding_dim, 
        embedding_matrix
    )

    
    ref = captions[test_index]
    model_score = evaluate_results(
        test_img_features, 
        caption_model,
        ref,
        max_length,
        vocab_size,
        wordtoidx,
        idxtoword
    )
    
    return caption_model, model_score

In [28]:
cv = KFold(n_splits=5)
cv = [(train_index, test_index) for train_index, test_index in cv.split(all_paths)]  

In [29]:
caption_model1, model_score1 = cross_validation(cv[0][0], cv[0][1], 1)    

Split 1:
Splitting data...
8332 images for training and 2084 images for testing.
There are 41660 captions
preprocessed words 2677 ==> 890
The vocabulary size is 891.
802 out of 891 words are found in the pre-trained matrix.
The size of embedding_matrix is (891, 200)
Preparing dataloader...

Generating set took: 0:03:28.78


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:52.29
Training...


  2%|▏         | 1/60 [00:12<12:14, 12.45s/it]

5.061539445604597


  3%|▎         | 2/60 [00:24<12:01, 12.44s/it]

3.415529041063218


  5%|▌         | 3/60 [00:37<11:48, 12.43s/it]

2.3480141900834584


  7%|▋         | 4/60 [00:49<11:36, 12.44s/it]

1.8497473682676042


  8%|▊         | 5/60 [01:02<11:24, 12.44s/it]

1.6512130158288139


 10%|█         | 6/60 [01:14<11:11, 12.44s/it]

1.5470238299596877


 12%|█▏        | 7/60 [01:27<11:00, 12.45s/it]

1.4730623563130696


 13%|█▎        | 8/60 [01:39<10:47, 12.45s/it]

1.42395814259847


 15%|█▌        | 9/60 [01:52<10:34, 12.45s/it]

1.387621936343965


 17%|█▋        | 10/60 [02:04<10:22, 12.45s/it]

1.3577437627883184


 18%|█▊        | 11/60 [02:16<10:09, 12.44s/it]

1.3264949662344796


 20%|██        | 12/60 [02:29<09:57, 12.44s/it]

1.307163499650501


 22%|██▏       | 13/60 [02:41<09:45, 12.46s/it]

1.2867059849557423


 23%|██▎       | 14/60 [02:54<09:32, 12.45s/it]

1.2675950271742684


 25%|██▌       | 15/60 [03:06<09:20, 12.46s/it]

1.251033638204847


 27%|██▋       | 16/60 [03:19<09:09, 12.49s/it]

1.2414475537481762


 28%|██▊       | 17/60 [03:31<08:56, 12.48s/it]

1.2288984798249745


 30%|███       | 18/60 [03:44<08:43, 12.47s/it]

1.2223421477136158


 32%|███▏      | 19/60 [03:56<08:31, 12.47s/it]

1.2105254105159216


 33%|███▎      | 20/60 [04:09<08:18, 12.47s/it]

1.1960423673902238


 35%|███▌      | 21/60 [04:21<08:06, 12.47s/it]

1.1848573911757696


 37%|███▋      | 22/60 [04:34<07:53, 12.46s/it]

1.1796573797861736


 38%|███▊      | 23/60 [04:46<07:40, 12.46s/it]

1.1710659833181472


 40%|████      | 24/60 [04:58<07:28, 12.46s/it]

1.1619729200998943


 42%|████▏     | 25/60 [05:11<07:15, 12.45s/it]

1.1564839028176808


 43%|████▎     | 26/60 [05:23<07:03, 12.45s/it]

1.1531430199032737


 45%|████▌     | 27/60 [05:36<06:50, 12.44s/it]

1.1501859994161696


 47%|████▋     | 28/60 [05:48<06:38, 12.44s/it]

1.1428108527546836


 48%|████▊     | 29/60 [06:01<06:25, 12.44s/it]

1.1374362778095972


 50%|█████     | 30/60 [06:13<06:13, 12.43s/it]

1.132144502231053


 52%|█████▏    | 31/60 [06:25<06:00, 12.44s/it]

1.130813412723087


 53%|█████▎    | 32/60 [06:38<05:48, 12.44s/it]

1.126885670991171


 55%|█████▌    | 33/60 [06:50<05:35, 12.44s/it]

1.1174832710197993


 57%|█████▋    | 34/60 [07:03<05:23, 12.45s/it]

1.108578724520547


 58%|█████▊    | 35/60 [07:15<05:10, 12.42s/it]

1.102591423761277


 60%|██████    | 36/60 [07:28<04:57, 12.41s/it]

1.0995446684814634


 62%|██████▏   | 37/60 [07:40<04:45, 12.40s/it]

1.094364269858315


 63%|██████▎   | 38/60 [07:52<04:32, 12.39s/it]

1.0922312708128066


 65%|██████▌   | 39/60 [08:05<04:20, 12.40s/it]

1.085567749681927


 67%|██████▋   | 40/60 [08:17<04:07, 12.39s/it]

1.0850058467615218


 68%|██████▊   | 41/60 [08:29<03:55, 12.38s/it]

1.0811359924929482


 70%|███████   | 42/60 [08:42<03:42, 12.39s/it]

1.080277293920517


 72%|███████▏  | 43/60 [08:54<03:30, 12.40s/it]

1.0732553658031283


 73%|███████▎  | 44/60 [09:07<03:18, 12.39s/it]

1.0718884496461778


 75%|███████▌  | 45/60 [09:19<03:05, 12.38s/it]

1.0704144963196345


 77%|███████▋  | 46/60 [09:32<02:54, 12.45s/it]

1.0668837087494987


 78%|███████▊  | 47/60 [09:44<02:41, 12.42s/it]

1.0625016079062508


 80%|████████  | 48/60 [09:56<02:28, 12.40s/it]

1.062633399452482


 82%|████████▏ | 49/60 [10:09<02:16, 12.40s/it]

1.05564622651963


 83%|████████▎ | 50/60 [10:21<02:03, 12.39s/it]

1.053062529790969


 85%|████████▌ | 51/60 [10:33<01:51, 12.38s/it]

1.0512879164445967


 87%|████████▋ | 52/60 [10:46<01:39, 12.38s/it]

1.0531748533248901


 88%|████████▊ | 53/60 [10:58<01:26, 12.37s/it]

1.0570108223529089


 90%|█████████ | 54/60 [11:11<01:14, 12.38s/it]

1.058265433424995


 92%|█████████▏| 55/60 [11:23<01:01, 12.39s/it]

1.0499863099484217


 93%|█████████▎| 56/60 [11:35<00:49, 12.38s/it]

1.0514321497508459


 95%|█████████▌| 57/60 [11:48<00:37, 12.37s/it]

1.0526321416809445


 97%|█████████▋| 58/60 [12:00<00:24, 12.37s/it]

1.0576172499429612


 98%|█████████▊| 59/60 [12:12<00:12, 12.36s/it]

1.0501221021016438


100%|██████████| 60/60 [12:25<00:00, 12.42s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

1.0432242864654178


  2%|▏         | 1/60 [00:12<12:08, 12.35s/it]

1.007885183606829


  3%|▎         | 2/60 [00:24<11:56, 12.36s/it]

0.9892388468696958


  5%|▌         | 3/60 [00:37<11:44, 12.37s/it]

0.980401295991171


  7%|▋         | 4/60 [00:49<11:32, 12.37s/it]

0.9747499284290132


  8%|▊         | 5/60 [01:01<11:20, 12.38s/it]

0.9709003028415498


 10%|█         | 6/60 [01:14<11:07, 12.37s/it]

0.9690824832235064


 12%|█▏        | 7/60 [01:26<10:55, 12.36s/it]

0.9659331086136046


 13%|█▎        | 8/60 [01:38<10:42, 12.36s/it]

0.9635042122432164


 15%|█▌        | 9/60 [01:51<10:30, 12.36s/it]

0.9610357951550257


 17%|█▋        | 10/60 [02:03<10:18, 12.36s/it]

0.9589165974230993


 18%|█▊        | 11/60 [02:16<10:05, 12.36s/it]

0.9571524773325238


 20%|██        | 12/60 [02:28<09:53, 12.37s/it]

0.9545823293072837


 22%|██▏       | 13/60 [02:40<09:41, 12.38s/it]

0.953568377665111


 23%|██▎       | 14/60 [02:53<09:29, 12.38s/it]

0.9525817590100425


 25%|██▌       | 15/60 [03:05<09:16, 12.37s/it]

0.9521897122973487


 27%|██▋       | 16/60 [03:17<09:04, 12.37s/it]

0.9498316134725299


 28%|██▊       | 17/60 [03:30<08:51, 12.36s/it]

0.9493394082500821


 30%|███       | 18/60 [03:42<08:39, 12.36s/it]

0.947900177467437


 32%|███▏      | 19/60 [03:54<08:26, 12.36s/it]

0.9471871143295651


 33%|███▎      | 20/60 [04:07<08:15, 12.38s/it]

0.945726874328795


 35%|███▌      | 21/60 [04:19<08:02, 12.38s/it]

0.9443650600456056


 37%|███▋      | 22/60 [04:32<07:50, 12.39s/it]

0.9451274446078709


 38%|███▊      | 23/60 [04:44<07:38, 12.40s/it]

0.9435074882847923


 40%|████      | 24/60 [04:56<07:26, 12.39s/it]

0.9423665219829196


 42%|████▏     | 25/60 [05:09<07:13, 12.39s/it]

0.9424104733126504


 43%|████▎     | 26/60 [05:21<07:01, 12.39s/it]

0.93997009737151


 45%|████▌     | 27/60 [05:34<06:48, 12.39s/it]

0.9400953026044936


 47%|████▋     | 28/60 [05:46<06:36, 12.39s/it]

0.9390803447791508


 48%|████▊     | 29/60 [05:58<06:23, 12.38s/it]

0.9391720976148333


 50%|█████     | 30/60 [06:11<06:11, 12.39s/it]

0.9377525888738178


 52%|█████▏    | 31/60 [06:23<05:59, 12.39s/it]

0.9377884297143846


 53%|█████▎    | 32/60 [06:36<05:46, 12.39s/it]

0.9369994004567465


 55%|█████▌    | 33/60 [06:48<05:34, 12.38s/it]

0.9352707550639198


 57%|█████▋    | 34/60 [07:00<05:21, 12.38s/it]

0.9343763958840143


 58%|█████▊    | 35/60 [07:13<05:09, 12.39s/it]

0.9337065106346494


 60%|██████    | 36/60 [07:25<04:57, 12.38s/it]

0.9332576209590548


 62%|██████▏   | 37/60 [07:37<04:44, 12.38s/it]

0.9331287131423042


 63%|██████▎   | 38/60 [07:50<04:32, 12.38s/it]

0.9325694754010155


 65%|██████▌   | 39/60 [08:02<04:19, 12.38s/it]

0.9308300827230725


 67%|██████▋   | 40/60 [08:15<04:07, 12.38s/it]

0.9311412814117613


 68%|██████▊   | 41/60 [08:27<03:55, 12.37s/it]

0.9315337581293923


 70%|███████   | 42/60 [08:39<03:42, 12.38s/it]

0.9300665557384491


 72%|███████▏  | 43/60 [08:52<03:30, 12.38s/it]

0.9297991593678793


 73%|███████▎  | 44/60 [09:04<03:18, 12.39s/it]

0.9280553105331603


 75%|███████▌  | 45/60 [09:17<03:05, 12.38s/it]

0.9295281044074467


 77%|███████▋  | 46/60 [09:29<02:53, 12.39s/it]

0.9285156116599128


 78%|███████▊  | 47/60 [09:41<02:41, 12.39s/it]

0.9276002631300971


 80%|████████  | 48/60 [09:54<02:28, 12.39s/it]

0.9266583735034579


 82%|████████▏ | 49/60 [10:06<02:16, 12.41s/it]

0.9271570117700667


 83%|████████▎ | 50/60 [10:19<02:04, 12.40s/it]

0.9266565044720968


 85%|████████▌ | 51/60 [10:31<01:51, 12.40s/it]

0.9262354189441317


 87%|████████▋ | 52/60 [10:43<01:39, 12.40s/it]

0.9251606861750284


 88%|████████▊ | 53/60 [10:56<01:26, 12.39s/it]

0.9239020234062558


 90%|█████████ | 54/60 [11:08<01:14, 12.39s/it]

0.925721226703553


 92%|█████████▏| 55/60 [11:20<01:01, 12.39s/it]

0.9228293938296181


 93%|█████████▎| 56/60 [11:33<00:49, 12.39s/it]

0.923561216819854


 95%|█████████▌| 57/60 [11:45<00:37, 12.39s/it]

0.9213090936342875


 97%|█████████▋| 58/60 [11:58<00:24, 12.38s/it]

0.9228670270670027


 98%|█████████▊| 59/60 [12:10<00:12, 12.38s/it]

0.922098852339245


100%|██████████| 60/60 [12:22<00:00, 12.38s/it]

0.9216009307475317
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [30]:
model_score1

{'Bleu_1': 0.6433619393516047,
 'Bleu_2': 0.5124391566040026,
 'Bleu_3': 0.425767703524256,
 'Bleu_4': 0.36393501416121626,
 'METEOR': 0.29322260324713706,
 'ROUGE_L': 0.5408537882511261,
 'CIDEr': 2.033263457119308,
 'SPICE': 0.38380602790134394,
 'USC_similarity': 0.6090219055836923}

In [31]:
caption_model2, model_score2 = cross_validation(cv[1][0], cv[1][1], 2)    

Split 2:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions
preprocessed words 2691 ==> 898
The vocabulary size is 899.
808 out of 899 words are found in the pre-trained matrix.
The size of embedding_matrix is (899, 200)
Preparing dataloader...

Generating set took: 0:03:17.96


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:49.61
Training...


  2%|▏         | 1/60 [00:12<12:19, 12.54s/it]

4.199908795810881


  3%|▎         | 2/60 [00:25<12:07, 12.54s/it]

2.3847406251089915


  5%|▌         | 3/60 [00:37<11:54, 12.53s/it]

1.849181782631647


  7%|▋         | 4/60 [00:50<11:42, 12.54s/it]

1.61998793057033


  8%|▊         | 5/60 [01:02<11:29, 12.53s/it]

1.4885709938548861


 10%|█         | 6/60 [01:15<11:17, 12.54s/it]

1.4134283775375003


 12%|█▏        | 7/60 [01:27<11:03, 12.53s/it]

1.3557352366901578


 13%|█▎        | 8/60 [01:40<10:50, 12.52s/it]

1.3105252498672122


 15%|█▌        | 9/60 [01:52<10:40, 12.56s/it]

1.2757207183610826


 17%|█▋        | 10/60 [02:05<10:28, 12.57s/it]

1.247224265620822


 18%|█▊        | 11/60 [02:18<10:17, 12.60s/it]

1.2220468492735


 20%|██        | 12/60 [02:30<10:04, 12.60s/it]

1.2004722214880443


 22%|██▏       | 13/60 [02:43<09:51, 12.58s/it]

1.1809076070785522


 23%|██▎       | 14/60 [02:55<09:38, 12.57s/it]

1.1658079482260204


 25%|██▌       | 15/60 [03:08<09:25, 12.56s/it]

1.151860467025212


 27%|██▋       | 16/60 [03:20<09:12, 12.55s/it]

1.134528174286797


 28%|██▊       | 17/60 [03:33<09:00, 12.57s/it]

1.1197065498147691


 30%|███       | 18/60 [03:46<08:47, 12.56s/it]

1.1084173577172416


 32%|███▏      | 19/60 [03:58<08:34, 12.55s/it]

1.0999828846681685


 33%|███▎      | 20/60 [04:11<08:21, 12.55s/it]

1.0898855910414742


 35%|███▌      | 21/60 [04:23<08:09, 12.55s/it]

1.0850516188712347


 37%|███▋      | 22/60 [04:36<07:56, 12.54s/it]

1.0812195099535442


 38%|███▊      | 23/60 [04:48<07:43, 12.54s/it]

1.0724300827298845


 40%|████      | 24/60 [05:01<07:31, 12.54s/it]

1.0677837289514995


 42%|████▏     | 25/60 [05:13<07:18, 12.53s/it]

1.064663046882266


 43%|████▎     | 26/60 [05:26<07:06, 12.53s/it]

1.0562176491533006


 45%|████▌     | 27/60 [05:38<06:53, 12.53s/it]

1.0481104708853222


 47%|████▋     | 28/60 [05:51<06:41, 12.53s/it]

1.0399333834648132


 48%|████▊     | 29/60 [06:04<06:29, 12.57s/it]

1.0385064056941442


 50%|█████     | 30/60 [06:16<06:17, 12.58s/it]

1.029371406350817


 52%|█████▏    | 31/60 [06:29<06:04, 12.58s/it]

1.0290315775644212


 53%|█████▎    | 32/60 [06:41<05:52, 12.58s/it]

1.0216996513661885


 55%|█████▌    | 33/60 [06:54<05:39, 12.59s/it]

1.0158877017952146


 57%|█████▋    | 34/60 [07:07<05:27, 12.59s/it]

1.0111207834311895


 58%|█████▊    | 35/60 [07:19<05:14, 12.59s/it]

1.0043647019636064


 60%|██████    | 36/60 [07:32<05:02, 12.59s/it]

1.0042436875048137


 62%|██████▏   | 37/60 [07:44<04:49, 12.58s/it]

0.999358499333972


 63%|██████▎   | 38/60 [07:57<04:37, 12.60s/it]

1.0040585881187802


 65%|██████▌   | 39/60 [08:09<04:24, 12.60s/it]

0.9984321338789803


 67%|██████▋   | 40/60 [08:22<04:11, 12.59s/it]

0.9990299230530149


 68%|██████▊   | 41/60 [08:35<03:59, 12.61s/it]

0.9988595658824557


 70%|███████   | 42/60 [08:47<03:47, 12.62s/it]

0.9962948092392513


 72%|███████▏  | 43/60 [09:00<03:34, 12.59s/it]

0.9877668888795943


 73%|███████▎  | 44/60 [09:12<03:21, 12.56s/it]

0.9788719131833031


 75%|███████▌  | 45/60 [09:25<03:08, 12.55s/it]

0.9727773141293299


 77%|███████▋  | 46/60 [09:37<02:55, 12.53s/it]

0.964325453553881


 78%|███████▊  | 47/60 [09:50<02:42, 12.53s/it]

0.96128562944276


 80%|████████  | 48/60 [10:02<02:30, 12.53s/it]

0.9601702675932929


 82%|████████▏ | 49/60 [10:15<02:17, 12.52s/it]

0.9547776693389529


 83%|████████▎ | 50/60 [10:27<02:05, 12.51s/it]

0.9521107105981736


 85%|████████▌ | 51/60 [10:40<01:52, 12.51s/it]

0.9496493211814335


 87%|████████▋ | 52/60 [10:52<01:40, 12.52s/it]

0.9493210585344405


 88%|████████▊ | 53/60 [11:05<01:27, 12.51s/it]

0.9528330791564215


 90%|█████████ | 54/60 [11:17<01:15, 12.50s/it]

0.9557704443023318


 92%|█████████▏| 55/60 [11:30<01:02, 12.51s/it]

0.9515706272352309


 93%|█████████▎| 56/60 [11:42<00:50, 12.52s/it]

0.9469188026019505


 95%|█████████▌| 57/60 [11:55<00:37, 12.52s/it]

0.9416095060961587


 97%|█████████▋| 58/60 [12:08<00:25, 12.54s/it]

0.9408696378980365


 98%|█████████▊| 59/60 [12:20<00:12, 12.54s/it]

0.9435774272396451


100%|██████████| 60/60 [12:33<00:00, 12.55s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

0.943797440755935


  2%|▏         | 1/60 [00:12<12:18, 12.51s/it]

0.9077636585349128


  3%|▎         | 2/60 [00:25<12:06, 12.53s/it]

0.8837098933401561


  5%|▌         | 3/60 [00:37<11:53, 12.52s/it]

0.8740094744023823


  7%|▋         | 4/60 [00:50<11:42, 12.54s/it]

0.8676852342628297


  8%|▊         | 5/60 [01:02<11:29, 12.53s/it]

0.8621808120182582


 10%|█         | 6/60 [01:15<11:15, 12.52s/it]

0.8588324543975648


 12%|█▏        | 7/60 [01:27<11:03, 12.53s/it]

0.8557069400946299


 13%|█▎        | 8/60 [01:40<10:51, 12.54s/it]

0.8513509233792623


 15%|█▌        | 9/60 [01:52<10:38, 12.53s/it]

0.8521341028667632


 17%|█▋        | 10/60 [02:05<10:26, 12.52s/it]

0.8483464476608095


 18%|█▊        | 11/60 [02:17<10:14, 12.54s/it]

0.8472937019098372


 20%|██        | 12/60 [02:30<10:02, 12.55s/it]

0.8447115549019405


 22%|██▏       | 13/60 [02:43<09:50, 12.56s/it]

0.8432774004482088


 23%|██▎       | 14/60 [02:55<09:37, 12.55s/it]

0.8426649371782938


 25%|██▌       | 15/60 [03:08<09:24, 12.55s/it]

0.8391042223998478


 27%|██▋       | 16/60 [03:20<09:11, 12.54s/it]

0.8393073450951349


 28%|██▊       | 17/60 [03:33<08:58, 12.52s/it]

0.8378789439087823


 30%|███       | 18/60 [03:45<08:46, 12.54s/it]

0.8379558140323276


 32%|███▏      | 19/60 [03:58<08:33, 12.53s/it]

0.836150430497669


 33%|███▎      | 20/60 [04:10<08:20, 12.51s/it]

0.8344683746496836


 35%|███▌      | 21/60 [04:23<08:08, 12.52s/it]

0.8329724428199586


 37%|███▋      | 22/60 [04:35<07:55, 12.51s/it]

0.8318884798458644


 38%|███▊      | 23/60 [04:48<07:42, 12.51s/it]

0.8299341443039122


 40%|████      | 24/60 [05:00<07:29, 12.50s/it]

0.830556503364018


 42%|████▏     | 25/60 [05:13<07:17, 12.50s/it]

0.8294343267168317


 43%|████▎     | 26/60 [05:25<07:04, 12.50s/it]

0.8279714527584258


 45%|████▌     | 27/60 [05:38<06:52, 12.51s/it]

0.8270357024101984


 47%|████▋     | 28/60 [05:50<06:40, 12.50s/it]

0.8277604821182433


 48%|████▊     | 29/60 [06:03<06:27, 12.50s/it]

0.8258240293888819


 50%|█████     | 30/60 [06:15<06:15, 12.51s/it]

0.8266866022632235


 52%|█████▏    | 31/60 [06:28<06:02, 12.51s/it]

0.8240276958261218


 53%|█████▎    | 32/60 [06:40<05:50, 12.52s/it]

0.8248078737940107


 55%|█████▌    | 33/60 [06:53<05:38, 12.54s/it]

0.8212344845136007


 57%|█████▋    | 34/60 [07:05<05:26, 12.56s/it]

0.8239563987368629


 58%|█████▊    | 35/60 [07:18<05:14, 12.57s/it]

0.821996047383263


 60%|██████    | 36/60 [07:31<05:01, 12.56s/it]

0.8204101622104645


 62%|██████▏   | 37/60 [07:43<04:48, 12.55s/it]

0.8199203709761301


 63%|██████▎   | 38/60 [07:56<04:35, 12.54s/it]

0.8197420693579174


 65%|██████▌   | 39/60 [08:08<04:23, 12.57s/it]

0.8192503807090578


 67%|██████▋   | 40/60 [08:21<04:12, 12.61s/it]

0.8175025667463031


 68%|██████▊   | 41/60 [08:34<03:59, 12.61s/it]

0.8176715118544442


 70%|███████   | 42/60 [08:46<03:47, 12.62s/it]

0.8166467774482


 72%|███████▏  | 43/60 [08:59<03:34, 12.62s/it]

0.8169488438538143


 73%|███████▎  | 44/60 [09:11<03:21, 12.61s/it]

0.8151552166257586


 75%|███████▌  | 45/60 [09:24<03:09, 12.63s/it]

0.8141569452626365


 77%|███████▋  | 46/60 [09:37<02:56, 12.62s/it]

0.8146075932752519


 78%|███████▊  | 47/60 [09:49<02:44, 12.62s/it]

0.8139341643878392


 80%|████████  | 48/60 [10:02<02:31, 12.62s/it]

0.8136789614246005


 82%|████████▏ | 49/60 [10:15<02:19, 12.69s/it]

0.8122156957785288


 83%|████████▎ | 50/60 [10:28<02:07, 12.76s/it]

0.8129460499400184


 85%|████████▌ | 51/60 [10:40<01:54, 12.69s/it]

0.8126505059855325


 87%|████████▋ | 52/60 [10:53<01:41, 12.64s/it]

0.8117500997724987


 88%|████████▊ | 53/60 [11:05<01:28, 12.61s/it]

0.8105763594309489


 90%|█████████ | 54/60 [11:18<01:15, 12.59s/it]

0.8110756930850801


 92%|█████████▏| 55/60 [11:30<01:02, 12.58s/it]

0.8090228594484783


 93%|█████████▎| 56/60 [11:43<00:50, 12.56s/it]

0.8087945239884513


 95%|█████████▌| 57/60 [11:55<00:37, 12.55s/it]

0.8099262473129091


 97%|█████████▋| 58/60 [12:08<00:25, 12.54s/it]

0.8086679662976947


 98%|█████████▊| 59/60 [12:20<00:12, 12.54s/it]

0.8064293634323847


100%|██████████| 60/60 [12:33<00:00, 12.56s/it]

0.8078114716779619
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [32]:
model_score2

{'Bleu_1': 0.6436207135915293,
 'Bleu_2': 0.5200530604186809,
 'Bleu_3': 0.4380441113568216,
 'Bleu_4': 0.37843428605241924,
 'METEOR': 0.29974856806056854,
 'ROUGE_L': 0.5465454126950626,
 'CIDEr': 2.0607765548283257,
 'SPICE': 0.3854655930294456,
 'USC_similarity': 0.6039577824696254}

In [33]:
caption_model3, model_score3 = cross_validation(cv[2][0], cv[2][1], 3)

Split 3:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions
preprocessed words 2661 ==> 895
The vocabulary size is 896.
809 out of 896 words are found in the pre-trained matrix.
The size of embedding_matrix is (896, 200)
Preparing dataloader...

Generating set took: 0:03:19.30


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:49.61
Training...


  2%|▏         | 1/60 [00:12<12:13, 12.44s/it]

4.4357317969912575


  3%|▎         | 2/60 [00:24<12:01, 12.45s/it]

2.2210389517602467


  5%|▌         | 3/60 [00:37<11:49, 12.45s/it]

1.7122943912233626


  7%|▋         | 4/60 [00:49<11:38, 12.47s/it]

1.52012285448256


  8%|▊         | 5/60 [01:02<11:27, 12.49s/it]

1.4227985541025798


 10%|█         | 6/60 [01:14<11:14, 12.50s/it]

1.356384671869732


 12%|█▏        | 7/60 [01:27<11:03, 12.52s/it]

1.304252374739874


 13%|█▎        | 8/60 [01:40<10:51, 12.53s/it]

1.263804983525049


 15%|█▌        | 9/60 [01:52<10:39, 12.53s/it]

1.2380003333091736


 17%|█▋        | 10/60 [02:05<10:26, 12.53s/it]

1.2126920790899367


 18%|█▊        | 11/60 [02:17<10:14, 12.54s/it]

1.1932530062539237


 20%|██        | 12/60 [02:30<10:01, 12.54s/it]

1.1749275128046672


 22%|██▏       | 13/60 [02:42<09:49, 12.55s/it]

1.1569533149401348


 23%|██▎       | 14/60 [02:55<09:37, 12.55s/it]

1.1445105047453017


 25%|██▌       | 15/60 [03:07<09:24, 12.55s/it]

1.1289125155834925


 27%|██▋       | 16/60 [03:20<09:11, 12.54s/it]

1.1155512446448916


 28%|██▊       | 17/60 [03:32<08:58, 12.52s/it]

1.1068176073687417


 30%|███       | 18/60 [03:45<08:45, 12.51s/it]

1.0954716688110715


 32%|███▏      | 19/60 [03:57<08:32, 12.50s/it]

1.0806660850842793


 33%|███▎      | 20/60 [04:10<08:20, 12.50s/it]

1.071215467793601


 35%|███▌      | 21/60 [04:22<08:08, 12.53s/it]

1.0623580061254048


 37%|███▋      | 22/60 [04:35<07:55, 12.52s/it]

1.0544914447125935


 38%|███▊      | 23/60 [04:47<07:43, 12.52s/it]

1.0459020464193254


 40%|████      | 24/60 [05:00<07:30, 12.53s/it]

1.040871025550933


 42%|████▏     | 25/60 [05:12<07:17, 12.51s/it]

1.0384621307963418


 43%|████▎     | 26/60 [05:25<07:05, 12.51s/it]

1.033344730025246


 45%|████▌     | 27/60 [05:37<06:52, 12.51s/it]

1.0278753780183338


 47%|████▋     | 28/60 [05:50<06:40, 12.50s/it]

1.0287365374110995


 48%|████▊     | 29/60 [06:02<06:27, 12.50s/it]

1.0229364803859167


 50%|█████     | 30/60 [06:15<06:14, 12.49s/it]

1.0169581927004314


 52%|█████▏    | 31/60 [06:27<06:02, 12.49s/it]

1.0100744877542769


 53%|█████▎    | 32/60 [06:40<05:49, 12.48s/it]

1.0067670444647472


 55%|█████▌    | 33/60 [06:52<05:36, 12.48s/it]

1.0052917755785442


 57%|█████▋    | 34/60 [07:05<05:24, 12.48s/it]

1.0018676008496965


 58%|█████▊    | 35/60 [07:17<05:12, 12.51s/it]

0.9932512115864527


 60%|██████    | 36/60 [07:30<05:00, 12.51s/it]

0.9898550169808524


 62%|██████▏   | 37/60 [07:42<04:47, 12.51s/it]

0.9873077968756357


 63%|██████▎   | 38/60 [07:55<04:35, 12.52s/it]

0.9862811437674931


 65%|██████▌   | 39/60 [08:07<04:22, 12.52s/it]

0.9867028168269566


 67%|██████▋   | 40/60 [08:20<04:10, 12.51s/it]

0.9902869420392173


 68%|██████▊   | 41/60 [08:33<03:57, 12.52s/it]

0.9971905904156821


 70%|███████   | 42/60 [08:45<03:45, 12.51s/it]

0.9970915473642803


 72%|███████▏  | 43/60 [08:57<03:32, 12.49s/it]

0.9955553852376484


 73%|███████▎  | 44/60 [09:10<03:19, 12.48s/it]

0.9873166992550805


 75%|███████▌  | 45/60 [09:22<03:07, 12.49s/it]

0.986231559798831


 77%|███████▋  | 46/60 [09:35<02:54, 12.49s/it]

0.9915993142695654


 78%|███████▊  | 47/60 [09:47<02:42, 12.49s/it]

0.9899258429095859


 80%|████████  | 48/60 [10:00<02:29, 12.49s/it]

0.9840344786643982


 82%|████████▏ | 49/60 [10:12<02:17, 12.48s/it]

0.9828196309861683


 83%|████████▎ | 50/60 [10:25<02:04, 12.49s/it]

0.9824790372734978


 85%|████████▌ | 51/60 [10:37<01:52, 12.50s/it]

0.9753774589016324


 87%|████████▋ | 52/60 [10:50<01:40, 12.52s/it]

0.9683591354460943


 88%|████████▊ | 53/60 [11:02<01:27, 12.50s/it]

0.9602916850930169


 90%|█████████ | 54/60 [11:15<01:15, 12.50s/it]

0.9526362007572537


 92%|█████████▏| 55/60 [11:27<01:02, 12.50s/it]

0.9511283678667886


 93%|█████████▎| 56/60 [11:40<00:50, 12.50s/it]

0.9495830890678224


 95%|█████████▌| 57/60 [11:52<00:37, 12.50s/it]

0.9498859345912933


 97%|█████████▋| 58/60 [12:05<00:24, 12.50s/it]

0.9497938170319512


 98%|█████████▊| 59/60 [12:17<00:12, 12.49s/it]

0.9521999316556113


100%|██████████| 60/60 [12:30<00:00, 12.51s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

0.9478921975408282


  2%|▏         | 1/60 [00:12<12:16, 12.48s/it]

0.9181417978945232


  3%|▎         | 2/60 [00:24<12:03, 12.48s/it]

0.889737164690381


  5%|▌         | 3/60 [00:37<11:51, 12.48s/it]

0.8796846497626531


  7%|▋         | 4/60 [00:49<11:38, 12.48s/it]

0.8739562403588068


  8%|▊         | 5/60 [01:02<11:26, 12.48s/it]

0.8702858005251203


 10%|█         | 6/60 [01:14<11:13, 12.48s/it]

0.866308256274178


 12%|█▏        | 7/60 [01:27<11:01, 12.48s/it]

0.8641389395509448


 13%|█▎        | 8/60 [01:39<10:49, 12.49s/it]

0.8614139940057483


 15%|█▌        | 9/60 [01:52<10:36, 12.48s/it]

0.858146295661018


 17%|█▋        | 10/60 [02:04<10:24, 12.49s/it]

0.8572258906705039


 18%|█▊        | 11/60 [02:17<10:11, 12.48s/it]

0.8542059035528273


 20%|██        | 12/60 [02:29<09:58, 12.48s/it]

0.8527400266556513


 22%|██▏       | 13/60 [02:42<09:45, 12.47s/it]

0.8519829625175113


 23%|██▎       | 14/60 [02:54<09:33, 12.46s/it]

0.8501151076384953


 25%|██▌       | 15/60 [03:07<09:20, 12.46s/it]

0.8503827651341757


 27%|██▋       | 16/60 [03:19<09:08, 12.46s/it]

0.8467037337166923


 28%|██▊       | 17/60 [03:32<08:55, 12.45s/it]

0.847802133787246


 30%|███       | 18/60 [03:44<08:42, 12.44s/it]

0.8456227836154756


 32%|███▏      | 19/60 [03:56<08:30, 12.44s/it]

0.8442193014281136


 33%|███▎      | 20/60 [04:09<08:17, 12.44s/it]

0.8425658770969936


 35%|███▌      | 21/60 [04:21<08:05, 12.44s/it]

0.843237137510663


 37%|███▋      | 22/60 [04:34<07:52, 12.44s/it]

0.8417965273062388


 38%|███▊      | 23/60 [04:46<07:40, 12.44s/it]

0.8400880367982955


 40%|████      | 24/60 [04:59<07:27, 12.44s/it]

0.8388060870624724


 42%|████▏     | 25/60 [05:11<07:15, 12.45s/it]

0.8376251388163793


 43%|████▎     | 26/60 [05:24<07:03, 12.46s/it]

0.8380152327673775


 45%|████▌     | 27/60 [05:36<06:51, 12.46s/it]

0.8374891834599631


 47%|████▋     | 28/60 [05:48<06:38, 12.46s/it]

0.8360263251122975


 48%|████▊     | 29/60 [06:01<06:26, 12.46s/it]

0.8359311521053314


 50%|█████     | 30/60 [06:14<06:15, 12.51s/it]

0.8351704052516392


 52%|█████▏    | 31/60 [06:26<06:02, 12.51s/it]

0.8341288807846251


 53%|█████▎    | 32/60 [06:39<05:49, 12.50s/it]

0.8338300713471004


 55%|█████▌    | 33/60 [06:51<05:37, 12.49s/it]

0.833379830632891


 57%|█████▋    | 34/60 [07:03<05:24, 12.49s/it]

0.8324419728347233


 58%|█████▊    | 35/60 [07:16<05:12, 12.49s/it]

0.830774941614696


 60%|██████    | 36/60 [07:28<04:59, 12.49s/it]

0.83106672338077


 62%|██████▏   | 37/60 [07:41<04:47, 12.48s/it]

0.830756432953335


 63%|██████▎   | 38/60 [07:53<04:34, 12.49s/it]

0.8297240563801357


 65%|██████▌   | 39/60 [08:06<04:22, 12.52s/it]

0.8290135988167354


 67%|██████▋   | 40/60 [08:19<04:10, 12.54s/it]

0.8272715423788343


 68%|██████▊   | 41/60 [08:31<03:58, 12.55s/it]

0.8286888556821006


 70%|███████   | 42/60 [08:44<03:45, 12.55s/it]

0.8280158113865626


 72%|███████▏  | 43/60 [08:56<03:33, 12.55s/it]

0.825961057628904


 73%|███████▎  | 44/60 [09:09<03:20, 12.55s/it]

0.825083600623267


 75%|███████▌  | 45/60 [09:21<03:08, 12.56s/it]

0.8254053947471437


 77%|███████▋  | 46/60 [09:34<02:55, 12.56s/it]

0.8250888940833864


 78%|███████▊  | 47/60 [09:47<02:43, 12.55s/it]

0.8241074425833566


 80%|████████  | 48/60 [09:59<02:30, 12.55s/it]

0.8247050742308298


 82%|████████▏ | 49/60 [10:12<02:18, 12.57s/it]

0.8250125376951127


 83%|████████▎ | 50/60 [10:24<02:05, 12.57s/it]

0.8227447867393494


 85%|████████▌ | 51/60 [10:37<01:53, 12.56s/it]

0.8220549268381936


 87%|████████▋ | 52/60 [10:49<01:40, 12.55s/it]

0.8222549841517494


 88%|████████▊ | 53/60 [11:02<01:27, 12.55s/it]

0.8206840668405805


 90%|█████████ | 54/60 [11:14<01:15, 12.57s/it]

0.8207033390090579


 92%|█████████▏| 55/60 [11:27<01:02, 12.57s/it]

0.8210242617697943


 93%|█████████▎| 56/60 [11:40<00:50, 12.56s/it]

0.8198776756014142


 95%|█████████▌| 57/60 [11:52<00:37, 12.55s/it]

0.8189336998122079


 97%|█████████▋| 58/60 [12:05<00:25, 12.56s/it]

0.8193422172750745


 98%|█████████▊| 59/60 [12:17<00:12, 12.58s/it]

0.8184238459382739


100%|██████████| 60/60 [12:30<00:00, 12.51s/it]

0.8188428992316836
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [34]:
model_score3

{'Bleu_1': 0.6312827815171983,
 'Bleu_2': 0.5041238851974881,
 'Bleu_3': 0.42174541870958443,
 'Bleu_4': 0.3625381292115647,
 'METEOR': 0.2893236559943383,
 'ROUGE_L': 0.5348855814133673,
 'CIDEr': 2.0233001038244374,
 'SPICE': 0.38643453507233366,
 'USC_similarity': 0.6018355501415963}

In [35]:
caption_model4, model_score4 = cross_validation(cv[3][0], cv[3][1], 4)    

Split 4:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions
preprocessed words 2681 ==> 916
The vocabulary size is 917.
823 out of 917 words are found in the pre-trained matrix.
The size of embedding_matrix is (917, 200)
Preparing dataloader...

Generating set took: 0:03:16.21


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:49.11
Training...


  2%|▏         | 1/60 [00:12<12:25, 12.63s/it]

4.519053691909427


  3%|▎         | 2/60 [00:25<12:11, 12.62s/it]

2.4139422376950583


  5%|▌         | 3/60 [00:37<11:58, 12.60s/it]

1.7863284321058364


  7%|▋         | 4/60 [00:50<11:45, 12.59s/it]

1.5554163030215673


  8%|▊         | 5/60 [01:02<11:32, 12.59s/it]

1.4368208930605935


 10%|█         | 6/60 [01:15<11:19, 12.58s/it]

1.3639103770256042


 12%|█▏        | 7/60 [01:28<11:06, 12.58s/it]

1.3061725667544775


 13%|█▎        | 8/60 [01:40<10:52, 12.56s/it]

1.2690590279442924


 15%|█▌        | 9/60 [01:53<10:40, 12.55s/it]

1.237239386354174


 17%|█▋        | 10/60 [02:05<10:27, 12.54s/it]

1.207058262257349


 18%|█▊        | 11/60 [02:18<10:14, 12.54s/it]

1.182864853313991


 20%|██        | 12/60 [02:30<10:01, 12.52s/it]

1.1623894770940144


 22%|██▏       | 13/60 [02:43<09:48, 12.51s/it]

1.142441763764336


 23%|██▎       | 14/60 [02:55<09:35, 12.51s/it]

1.123567371141343


 25%|██▌       | 15/60 [03:08<09:22, 12.51s/it]

1.1101998445533572


 27%|██▋       | 16/60 [03:20<09:10, 12.51s/it]

1.1050212255546026


 28%|██▊       | 17/60 [03:33<08:58, 12.52s/it]

1.0969439688183011


 30%|███       | 18/60 [03:45<08:45, 12.51s/it]

1.0906242047037398


 32%|███▏      | 19/60 [03:58<08:32, 12.51s/it]

1.0726594343071891


 33%|███▎      | 20/60 [04:10<08:20, 12.51s/it]

1.0645898083845775


 35%|███▌      | 21/60 [04:23<08:08, 12.52s/it]

1.0555482790583657


 37%|███▋      | 22/60 [04:35<07:56, 12.55s/it]

1.0473473284925734


 38%|███▊      | 23/60 [04:48<07:44, 12.55s/it]

1.0395957160563696


 40%|████      | 24/60 [05:00<07:31, 12.55s/it]

1.0317374467849731


 42%|████▏     | 25/60 [05:13<07:18, 12.53s/it]

1.029288269224621


 43%|████▎     | 26/60 [05:26<07:06, 12.53s/it]

1.0199014785743894


 45%|████▌     | 27/60 [05:38<06:53, 12.54s/it]

1.014099962654568


 47%|████▋     | 28/60 [05:51<06:42, 12.56s/it]

1.0070680323101224


 48%|████▊     | 29/60 [06:03<06:29, 12.55s/it]

0.9947453481810433


 50%|█████     | 30/60 [06:16<06:16, 12.54s/it]

0.9864001870155334


 52%|█████▏    | 31/60 [06:28<06:03, 12.55s/it]

0.983230627718426


 53%|█████▎    | 32/60 [06:41<05:51, 12.55s/it]

0.9811587986491975


 55%|█████▌    | 33/60 [06:53<05:38, 12.54s/it]

0.9761438483283633


 57%|█████▋    | 34/60 [07:06<05:25, 12.53s/it]

0.9694148145970845


 58%|█████▊    | 35/60 [07:18<05:13, 12.54s/it]

0.9663075435729254


 60%|██████    | 36/60 [07:31<05:00, 12.54s/it]

0.9626938445227486


 62%|██████▏   | 37/60 [07:43<04:47, 12.52s/it]

0.9626873163949876


 63%|██████▎   | 38/60 [07:56<04:35, 12.51s/it]

0.961218848114922


 65%|██████▌   | 39/60 [08:08<04:22, 12.49s/it]

0.9622208774089813


 67%|██████▋   | 40/60 [08:21<04:09, 12.50s/it]

0.9635921120643616


 68%|██████▊   | 41/60 [08:33<03:57, 12.51s/it]

0.9681816086882636


 70%|███████   | 42/60 [08:46<03:45, 12.50s/it]

0.9651105858030773


 72%|███████▏  | 43/60 [08:58<03:32, 12.50s/it]

0.9624761115937006


 73%|███████▎  | 44/60 [09:11<03:20, 12.51s/it]

0.9586722893374306


 75%|███████▌  | 45/60 [09:23<03:07, 12.51s/it]

0.9493792057037354


 77%|███████▋  | 46/60 [09:36<02:55, 12.51s/it]

0.9417619520709628


 78%|███████▊  | 47/60 [09:48<02:42, 12.50s/it]

0.9358508132752918


 80%|████████  | 48/60 [10:01<02:30, 12.52s/it]

0.9323240305696215


 82%|████████▏ | 49/60 [10:14<02:17, 12.51s/it]

0.9276525534334636


 83%|████████▎ | 50/60 [10:26<02:05, 12.51s/it]

0.9254685200396038


 85%|████████▌ | 51/60 [10:39<01:52, 12.54s/it]

0.9233989076954978


 87%|████████▋ | 52/60 [10:51<01:40, 12.55s/it]

0.9229853181611924


 88%|████████▊ | 53/60 [11:04<01:27, 12.55s/it]

0.9246469012328556


 90%|█████████ | 54/60 [11:16<01:15, 12.54s/it]

0.9246169953119188


 92%|█████████▏| 55/60 [11:29<01:02, 12.55s/it]

0.9199336909112477


 93%|█████████▎| 56/60 [11:41<00:50, 12.54s/it]

0.9206023840677171


 95%|█████████▌| 57/60 [11:54<00:37, 12.54s/it]

0.9181022842725118


 97%|█████████▋| 58/60 [12:06<00:25, 12.54s/it]

0.9206832306725639


 98%|█████████▊| 59/60 [12:19<00:12, 12.54s/it]

0.9198590389319828


100%|██████████| 60/60 [12:32<00:00, 12.53s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

0.9166550806590489


  2%|▏         | 1/60 [00:12<12:19, 12.54s/it]

0.8733527248813993


  3%|▎         | 2/60 [00:25<12:07, 12.55s/it]

0.852819779089519


  5%|▌         | 3/60 [00:37<11:55, 12.55s/it]

0.8440144572939191


  7%|▋         | 4/60 [00:50<11:41, 12.53s/it]

0.8385409670216697


  8%|▊         | 5/60 [01:02<11:29, 12.54s/it]

0.8339821852388836


 10%|█         | 6/60 [01:15<11:17, 12.54s/it]

0.831652120465324


 12%|█▏        | 7/60 [01:27<11:04, 12.55s/it]

0.8277944028377533


 13%|█▎        | 8/60 [01:40<10:52, 12.55s/it]

0.824514301050277


 15%|█▌        | 9/60 [01:52<10:40, 12.55s/it]

0.8207776546478271


 17%|█▋        | 10/60 [02:05<10:27, 12.55s/it]

0.8194777724288759


 18%|█▊        | 11/60 [02:18<10:16, 12.57s/it]

0.8191014655998775


 20%|██        | 12/60 [02:30<10:02, 12.56s/it]

0.8177459935347239


 22%|██▏       | 13/60 [02:43<09:50, 12.57s/it]

0.8153455399331593


 23%|██▎       | 14/60 [02:55<09:38, 12.57s/it]

0.8125223375502086


 25%|██▌       | 15/60 [03:08<09:25, 12.56s/it]

0.8105664111319042


 27%|██▋       | 16/60 [03:20<09:11, 12.54s/it]

0.8115312315168834


 28%|██▊       | 17/60 [03:33<08:59, 12.55s/it]

0.810076212599164


 30%|███       | 18/60 [03:45<08:46, 12.53s/it]

0.8096178727490562


 32%|███▏      | 19/60 [03:58<08:34, 12.54s/it]

0.8073254710152036


 33%|███▎      | 20/60 [04:10<08:21, 12.54s/it]

0.8073609897068569


 35%|███▌      | 21/60 [04:23<08:09, 12.54s/it]

0.805347128992989


 37%|███▋      | 22/60 [04:36<07:56, 12.54s/it]

0.8048046571867806


 38%|███▊      | 23/60 [04:48<07:43, 12.53s/it]

0.8043346149580819


 40%|████      | 24/60 [05:01<07:31, 12.53s/it]

0.8033751958892459


 42%|████▏     | 25/60 [05:13<07:18, 12.54s/it]

0.8015066470418658


 43%|████▎     | 26/60 [05:26<07:06, 12.53s/it]

0.8002029132275355


 45%|████▌     | 27/60 [05:38<06:54, 12.55s/it]

0.7994028017634437


 47%|████▋     | 28/60 [05:51<06:42, 12.58s/it]

0.7984624944982075


 48%|████▊     | 29/60 [06:04<06:30, 12.59s/it]

0.7984229383014497


 50%|█████     | 30/60 [06:16<06:17, 12.59s/it]

0.7986970103922344


 52%|█████▏    | 31/60 [06:29<06:04, 12.59s/it]

0.7980703555402302


 53%|█████▎    | 32/60 [06:41<05:51, 12.57s/it]

0.7958196713810876


 55%|█████▌    | 33/60 [06:54<05:39, 12.57s/it]

0.795806219180425


 57%|█████▋    | 34/60 [07:06<05:26, 12.57s/it]

0.7951243874572572


 58%|█████▊    | 35/60 [07:19<05:14, 12.56s/it]

0.7951311653568631


 60%|██████    | 36/60 [07:31<05:00, 12.54s/it]

0.7935927098705655


 62%|██████▏   | 37/60 [07:44<04:48, 12.53s/it]

0.792754624571119


 63%|██████▎   | 38/60 [07:56<04:35, 12.54s/it]

0.7927000522613525


 65%|██████▌   | 39/60 [08:09<04:23, 12.55s/it]

0.7928065345400855


 67%|██████▋   | 40/60 [08:22<04:11, 12.56s/it]

0.7927972376346588


 68%|██████▊   | 41/60 [08:34<03:58, 12.55s/it]

0.7897349099318186


 70%|███████   | 42/60 [08:47<03:45, 12.54s/it]

0.7902057142484755


 72%|███████▏  | 43/60 [08:59<03:33, 12.54s/it]

0.7891460628736586


 73%|███████▎  | 44/60 [09:12<03:20, 12.54s/it]

0.7890806893507639


 75%|███████▌  | 45/60 [09:24<03:08, 12.55s/it]

0.788690406651724


 77%|███████▋  | 46/60 [09:37<02:55, 12.57s/it]

0.7876905671187809


 78%|███████▊  | 47/60 [09:50<02:43, 12.58s/it]

0.7871469174112592


 80%|████████  | 48/60 [10:02<02:30, 12.58s/it]

0.7881639032136827


 82%|████████▏ | 49/60 [10:15<02:18, 12.59s/it]

0.7870297815118518


 83%|████████▎ | 50/60 [10:27<02:06, 12.62s/it]

0.7852940587770372


 85%|████████▌ | 51/60 [10:40<01:53, 12.61s/it]

0.7851598007338387


 87%|████████▋ | 52/60 [10:53<01:40, 12.61s/it]

0.7849789545649574


 88%|████████▊ | 53/60 [11:05<01:28, 12.62s/it]

0.7838150830495925


 90%|█████████ | 54/60 [11:18<01:15, 12.63s/it]

0.782554356824784


 92%|█████████▏| 55/60 [11:31<01:03, 12.63s/it]

0.782193049078896


 93%|█████████▎| 56/60 [11:43<00:50, 12.61s/it]

0.7827613055706024


 95%|█████████▌| 57/60 [11:56<00:37, 12.62s/it]

0.7831255793571472


 97%|█████████▋| 58/60 [12:08<00:25, 12.63s/it]

0.7829933521293458


 98%|█████████▊| 59/60 [12:21<00:12, 12.62s/it]

0.7801376865023658


100%|██████████| 60/60 [12:34<00:00, 12.57s/it]

0.7822408860638028
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [36]:
model_score4

{'Bleu_1': 0.6301631546644149,
 'Bleu_2': 0.5004495400753254,
 'Bleu_3': 0.41538561758785875,
 'Bleu_4': 0.3536924159464762,
 'METEOR': 0.2882922885119187,
 'ROUGE_L': 0.5350697599834644,
 'CIDEr': 2.053736396326587,
 'SPICE': 0.38647351623042264,
 'USC_similarity': 0.6040975116287187}

In [37]:
caption_model5, model_score5 = cross_validation(cv[4][0], cv[4][1], 5)    

Split 5:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions
preprocessed words 2706 ==> 901
The vocabulary size is 902.
814 out of 902 words are found in the pre-trained matrix.
The size of embedding_matrix is (902, 200)
Preparing dataloader...

Generating set took: 0:03:18.19


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:50.12
Training...


  2%|▏         | 1/60 [00:12<12:21, 12.57s/it]

3.9844275883265903


  3%|▎         | 2/60 [00:25<12:08, 12.56s/it]

2.182256653195336


  5%|▌         | 3/60 [00:37<11:54, 12.54s/it]

1.7017710095360166


  7%|▋         | 4/60 [00:50<11:42, 12.54s/it]

1.5124754820551192


  8%|▊         | 5/60 [01:02<11:29, 12.54s/it]

1.4137320177895683


 10%|█         | 6/60 [01:15<11:16, 12.54s/it]

1.3485796593484425


 12%|█▏        | 7/60 [01:27<11:04, 12.54s/it]

1.298544012364887


 13%|█▎        | 8/60 [01:40<10:52, 12.55s/it]

1.264942467212677


 15%|█▌        | 9/60 [01:52<10:39, 12.54s/it]

1.2300103391919817


 17%|█▋        | 10/60 [02:05<10:28, 12.57s/it]

1.2001391847928364


 18%|█▊        | 11/60 [02:18<10:16, 12.59s/it]

1.1769546071688335


 20%|██        | 12/60 [02:30<10:03, 12.57s/it]

1.1560814210346766


 22%|██▏       | 13/60 [02:43<09:51, 12.58s/it]

1.1412709795293354


 23%|██▎       | 14/60 [02:55<09:37, 12.56s/it]

1.127429722320466


 25%|██▌       | 15/60 [03:08<09:24, 12.54s/it]

1.1165304950305395


 27%|██▋       | 16/60 [03:20<09:11, 12.53s/it]

1.102703603960219


 28%|██▊       | 17/60 [03:33<08:58, 12.52s/it]

1.0908478555225192


 30%|███       | 18/60 [03:45<08:45, 12.52s/it]

1.0901136767296564


 32%|███▏      | 19/60 [03:58<08:33, 12.53s/it]

1.077969681648981


 33%|███▎      | 20/60 [04:10<08:21, 12.53s/it]

1.0686297288962774


 35%|███▌      | 21/60 [04:23<08:08, 12.54s/it]

1.0591148081279935


 37%|███▋      | 22/60 [04:35<07:56, 12.54s/it]

1.039245999994732


 38%|███▊      | 23/60 [04:48<07:44, 12.56s/it]

1.032161758059547


 40%|████      | 24/60 [05:01<07:32, 12.56s/it]

1.0294883378914423


 42%|████▏     | 25/60 [05:13<07:19, 12.56s/it]

1.0320031841595967


 43%|████▎     | 26/60 [05:26<07:06, 12.56s/it]

1.0324960507097698


 45%|████▌     | 27/60 [05:38<06:54, 12.55s/it]

1.0377905879701887


 47%|████▋     | 28/60 [05:51<06:41, 12.55s/it]

1.0366438315028237


 48%|████▊     | 29/60 [06:03<06:29, 12.55s/it]

1.0154377250444322


 50%|█████     | 30/60 [06:16<06:16, 12.55s/it]

0.9985571730704534


 52%|█████▏    | 31/60 [06:28<06:03, 12.54s/it]

0.9923622735909053


 53%|█████▎    | 32/60 [06:41<05:50, 12.53s/it]

0.983619864497866


 55%|█████▌    | 33/60 [06:53<05:38, 12.53s/it]

0.9788032330217815


 57%|█████▋    | 34/60 [07:06<05:25, 12.52s/it]

0.9751856043225243


 58%|█████▊    | 35/60 [07:19<05:13, 12.53s/it]

0.9710898285820371


 60%|██████    | 36/60 [07:31<05:00, 12.52s/it]

0.9702428125199818


 62%|██████▏   | 37/60 [07:44<04:47, 12.52s/it]

0.9654769329797654


 63%|██████▎   | 38/60 [07:56<04:35, 12.51s/it]

0.9585083850792476


 65%|██████▌   | 39/60 [08:09<04:23, 12.53s/it]

0.9565660698073251


 67%|██████▋   | 40/60 [08:21<04:10, 12.54s/it]

0.9566146844909305


 68%|██████▊   | 41/60 [08:34<03:57, 12.53s/it]

0.9569431869756608


 70%|███████   | 42/60 [08:46<03:45, 12.53s/it]

0.9496218448593503


 72%|███████▏  | 43/60 [08:59<03:32, 12.53s/it]

0.9481523476895832


 73%|███████▎  | 44/60 [09:11<03:20, 12.53s/it]

0.9482155271938869


 75%|███████▌  | 45/60 [09:24<03:07, 12.53s/it]

0.942461005278996


 77%|███████▋  | 46/60 [09:36<02:55, 12.52s/it]

0.9462826010726747


 78%|███████▊  | 47/60 [09:49<02:42, 12.52s/it]

0.9427227661723182


 80%|████████  | 48/60 [10:01<02:30, 12.51s/it]

0.9500186684585753


 82%|████████▏ | 49/60 [10:14<02:17, 12.52s/it]

0.9423977023079282


 83%|████████▎ | 50/60 [10:26<02:05, 12.52s/it]

0.9416256646315256


 85%|████████▌ | 51/60 [10:39<01:52, 12.52s/it]

0.9440082027798608


 87%|████████▋ | 52/60 [10:51<01:40, 12.53s/it]

0.948592198746545


 88%|████████▊ | 53/60 [11:04<01:27, 12.54s/it]

0.951965209983644


 90%|█████████ | 54/60 [11:17<01:15, 12.55s/it]

0.9463102576278505


 92%|█████████▏| 55/60 [11:29<01:02, 12.54s/it]

0.9446165817124503


 93%|█████████▎| 56/60 [11:42<00:50, 12.54s/it]

0.9420983351412273


 95%|█████████▌| 57/60 [11:54<00:37, 12.54s/it]

0.933978449730646


 97%|█████████▋| 58/60 [12:07<00:25, 12.54s/it]

0.9265071252981821


 98%|█████████▊| 59/60 [12:19<00:12, 12.54s/it]

0.924352921190716


100%|██████████| 60/60 [12:32<00:00, 12.54s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

0.9206460004761106


  2%|▏         | 1/60 [00:12<12:24, 12.63s/it]

0.8858289406413123


  3%|▎         | 2/60 [00:25<12:11, 12.61s/it]

0.8610530013129825


  5%|▌         | 3/60 [00:37<11:57, 12.58s/it]

0.8531838740621295


  7%|▋         | 4/60 [00:50<11:45, 12.60s/it]

0.8448880740574428


  8%|▊         | 5/60 [01:02<11:33, 12.60s/it]

0.8414615719091325


 10%|█         | 6/60 [01:15<11:20, 12.60s/it]

0.8357898749056316


 12%|█▏        | 7/60 [01:28<11:08, 12.61s/it]

0.8325146720522926


 13%|█▎        | 8/60 [01:40<10:55, 12.60s/it]

0.8327181821777707


 15%|█▌        | 9/60 [01:53<10:42, 12.61s/it]

0.829515685637792


 17%|█▋        | 10/60 [02:05<10:29, 12.60s/it]

0.8269067633719671


 18%|█▊        | 11/60 [02:18<10:17, 12.61s/it]

0.8256266926016126


 20%|██        | 12/60 [02:31<10:05, 12.62s/it]

0.821827171813874


 22%|██▏       | 13/60 [02:43<09:53, 12.63s/it]

0.8222022411369142


 23%|██▎       | 14/60 [02:56<09:40, 12.63s/it]

0.8207952238264538


 25%|██▌       | 15/60 [03:09<09:27, 12.62s/it]

0.8196302765891665


 27%|██▋       | 16/60 [03:21<09:15, 12.63s/it]

0.819577815986815


 28%|██▊       | 17/60 [03:34<09:02, 12.62s/it]

0.8169416274343219


 30%|███       | 18/60 [03:47<08:50, 12.62s/it]

0.8160192696821122


 32%|███▏      | 19/60 [03:59<08:37, 12.62s/it]

0.8145713295255389


 33%|███▎      | 20/60 [04:12<08:25, 12.63s/it]

0.8135690206573123


 35%|███▌      | 21/60 [04:24<08:12, 12.63s/it]

0.8118762927395957


 37%|███▋      | 22/60 [04:37<07:59, 12.62s/it]

0.8118106595107487


 38%|███▊      | 23/60 [04:50<07:46, 12.61s/it]

0.810247065055938


 40%|████      | 24/60 [05:02<07:33, 12.60s/it]

0.8104720825240725


 42%|████▏     | 25/60 [05:15<07:21, 12.61s/it]

0.8093752577191308


 43%|████▎     | 26/60 [05:27<07:09, 12.63s/it]

0.8077478877135685


 45%|████▌     | 27/60 [05:40<06:56, 12.63s/it]

0.8074100925808861


 47%|████▋     | 28/60 [05:53<06:43, 12.62s/it]

0.8075669038863409


 48%|████▊     | 29/60 [06:05<06:31, 12.64s/it]

0.8059626562254769


 50%|█████     | 30/60 [06:18<06:19, 12.64s/it]

0.8052430919238499


 52%|█████▏    | 31/60 [06:31<06:06, 12.63s/it]

0.8044157908076331


 53%|█████▎    | 32/60 [06:43<05:53, 12.63s/it]

0.8048407137393951


 55%|█████▌    | 33/60 [06:56<05:41, 12.66s/it]

0.8022250064781734


 57%|█████▋    | 34/60 [07:09<05:28, 12.64s/it]

0.8024984271753401


 58%|█████▊    | 35/60 [07:21<05:15, 12.64s/it]

0.8009394180207026


 60%|██████    | 36/60 [07:34<05:03, 12.65s/it]

0.8011688817115057


 62%|██████▏   | 37/60 [07:47<04:51, 12.66s/it]

0.800724820012138


 63%|██████▎   | 38/60 [07:59<04:38, 12.66s/it]

0.7999868463902247


 65%|██████▌   | 39/60 [08:12<04:28, 12.79s/it]

0.79963580483482


 67%|██████▋   | 40/60 [08:25<04:15, 12.75s/it]

0.7986252563340324


 68%|██████▊   | 41/60 [08:38<04:01, 12.71s/it]

0.7975092544442132


 70%|███████   | 42/60 [08:50<03:48, 12.69s/it]

0.7965319695926848


 72%|███████▏  | 43/60 [09:03<03:35, 12.68s/it]

0.7970192233721415


 73%|███████▎  | 44/60 [09:15<03:22, 12.64s/it]

0.7971439333189101


 75%|███████▌  | 45/60 [09:28<03:10, 12.68s/it]

0.7951693648383731


 77%|███████▋  | 46/60 [09:41<02:57, 12.64s/it]

0.7946193871043977


 78%|███████▊  | 47/60 [09:53<02:44, 12.63s/it]

0.7945783464681535


 80%|████████  | 48/60 [10:06<02:31, 12.62s/it]

0.7940892350106012


 82%|████████▏ | 49/60 [10:19<02:18, 12.60s/it]

0.795435851528531


 83%|████████▎ | 50/60 [10:31<02:06, 12.61s/it]

0.79458078174364


 85%|████████▌ | 51/60 [10:44<01:53, 12.62s/it]

0.7935351048197065


 87%|████████▋ | 52/60 [10:56<01:40, 12.59s/it]

0.7927218618847075


 88%|████████▊ | 53/60 [11:09<01:28, 12.58s/it]

0.7927321380092984


 90%|█████████ | 54/60 [11:21<01:15, 12.56s/it]

0.7918172776699066


 92%|█████████▏| 55/60 [11:34<01:02, 12.55s/it]

0.7921817245937529


 93%|█████████▎| 56/60 [11:46<00:50, 12.54s/it]

0.7906341793991271


 95%|█████████▌| 57/60 [11:59<00:37, 12.53s/it]

0.7902341002509707


 97%|█████████▋| 58/60 [12:11<00:25, 12.52s/it]

0.7895029627141499


 98%|█████████▊| 59/60 [12:24<00:12, 12.53s/it]

0.7895291646321615


100%|██████████| 60/60 [12:36<00:00, 12.62s/it]

0.7887433497678666
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [38]:
model_score5

{'Bleu_1': 0.6035051833945143,
 'Bleu_2': 0.4749573110110273,
 'Bleu_3': 0.3941459799954521,
 'Bleu_4': 0.3362415713922167,
 'METEOR': 0.27340836080120834,
 'ROUGE_L': 0.5112500175434576,
 'CIDEr': 1.9049426172351707,
 'SPICE': 0.3601463937667336,
 'USC_similarity': 0.5774092229335572}

In [39]:
model_scores = defaultdict(list)
for scores in [model_score1, model_score2, model_score3, model_score4, model_score5]:
    for key, value in scores.items():
        model_scores[key].append(value)

In [40]:
model_scores

defaultdict(list,
            {'Bleu_1': [0.6433619393516047,
              0.6436207135915293,
              0.6312827815171983,
              0.6301631546644149,
              0.6035051833945143],
             'Bleu_2': [0.5124391566040026,
              0.5200530604186809,
              0.5041238851974881,
              0.5004495400753254,
              0.4749573110110273],
             'Bleu_3': [0.425767703524256,
              0.4380441113568216,
              0.42174541870958443,
              0.41538561758785875,
              0.3941459799954521],
             'Bleu_4': [0.36393501416121626,
              0.37843428605241924,
              0.3625381292115647,
              0.3536924159464762,
              0.3362415713922167],
             'METEOR': [0.29322260324713706,
              0.29974856806056854,
              0.2893236559943383,
              0.2882922885119187,
              0.27340836080120834],
             'ROUGE_L': [0.5408537882511261,
              0.5465454126

In [41]:
tag = '9.1.1'
with open(f'{root_captioning}/fz_notebooks/cv_n{tag}.json', 'w') as fp:
    json.dump(model_scores, fp)

In [41]:
train_index

array([ 2084,  2085,  2086, ..., 10413, 10414, 10415])

In [31]:
train_paths, test_paths = all_paths[:8332], all_paths[8332:]
train_descriptions, test_descriptions = all_descriptions[:8332], all_descriptions[8332:]
vocab = get_vocab(train_descriptions, word_count_threshold=10)
idxtoword, wordtoidx = get_word_dict(vocab)
vocab_size = get_vocab_size(idxtoword)
embedding_dim = 200
embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

train_img_features, test_img_features = get_train_test(encoder, train_paths, test_paths)

train_loader = get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
)


There are 41660 captions
preprocessed words 2704 ==> 901
The vocabulary size is 902.
814 out of 902 words are found in the pre-trained matrix.
The size of embedding_matrix is (902, 200)

Generating set took: 0:03:11.51

Generating set took: 0:00:48.08


In [32]:
caption_model = train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix
)

  2%|▏         | 1/60 [00:12<12:06, 12.32s/it]

5.042335720289321


  3%|▎         | 2/60 [00:24<11:53, 12.30s/it]

2.716521155266535


  5%|▌         | 3/60 [00:36<11:40, 12.29s/it]

1.9405687252680461


  7%|▋         | 4/60 [00:49<11:28, 12.29s/it]

1.6280740783328103


  8%|▊         | 5/60 [01:01<11:15, 12.28s/it]

1.4868501680237907


 10%|█         | 6/60 [01:13<11:03, 12.30s/it]

1.4017295837402344


 12%|█▏        | 7/60 [01:25<10:51, 12.28s/it]

1.344799177987235


 13%|█▎        | 8/60 [01:38<10:38, 12.28s/it]

1.3073496165729703


 15%|█▌        | 9/60 [01:50<10:27, 12.29s/it]

1.2670587244487943


 17%|█▋        | 10/60 [02:02<10:15, 12.31s/it]

1.2383008968262446


 18%|█▊        | 11/60 [02:15<10:02, 12.29s/it]

1.215726443699428


 20%|██        | 12/60 [02:27<09:50, 12.30s/it]

1.1909155277978807


 22%|██▏       | 13/60 [02:39<09:38, 12.30s/it]

1.1727187179383778


 23%|██▎       | 14/60 [02:52<09:26, 12.32s/it]

1.157722146738143


 25%|██▌       | 15/60 [03:04<09:14, 12.33s/it]

1.1440089572043646


 27%|██▋       | 16/60 [03:16<09:01, 12.32s/it]

1.134343574444453


 28%|██▊       | 17/60 [03:29<08:50, 12.33s/it]

1.1173948915231795


 30%|███       | 18/60 [03:41<08:38, 12.34s/it]

1.1015570263067882


 32%|███▏      | 19/60 [03:53<08:26, 12.35s/it]

1.0928500081811632


 33%|███▎      | 20/60 [04:06<08:14, 12.35s/it]

1.0839472171806155


 35%|███▌      | 21/60 [04:18<08:01, 12.35s/it]

1.07615613085883


 37%|███▋      | 22/60 [04:30<07:49, 12.34s/it]

1.0671265891620092


 38%|███▊      | 23/60 [04:43<07:36, 12.35s/it]

1.062000572681427


 40%|████      | 24/60 [04:55<07:26, 12.39s/it]

1.056591849951517


 42%|████▏     | 25/60 [05:08<07:13, 12.39s/it]

1.0506715022382282


 43%|████▎     | 26/60 [05:20<07:01, 12.40s/it]

1.044960244780495


 45%|████▌     | 27/60 [05:33<06:49, 12.41s/it]

1.037071848199481


 47%|████▋     | 28/60 [05:45<06:37, 12.41s/it]

1.031579981247584


 48%|████▊     | 29/60 [05:57<06:24, 12.42s/it]

1.0258051653703053


 50%|█████     | 30/60 [06:10<06:11, 12.40s/it]

1.0200702093896412


 52%|█████▏    | 31/60 [06:22<05:59, 12.39s/it]

1.0167458880515325


 53%|█████▎    | 32/60 [06:34<05:46, 12.39s/it]

1.0202275897775377


 55%|█████▌    | 33/60 [06:47<05:34, 12.38s/it]

1.015879118726367


 57%|█████▋    | 34/60 [06:59<05:22, 12.39s/it]

1.0076816635472434


 58%|█████▊    | 35/60 [07:12<05:09, 12.38s/it]

0.9982786717868987


 60%|██████    | 36/60 [07:24<04:56, 12.37s/it]

0.9918509111517951


 62%|██████▏   | 37/60 [07:36<04:44, 12.36s/it]

0.9880256808939434


 63%|██████▎   | 38/60 [07:49<04:31, 12.36s/it]

0.9884116081964403


 65%|██████▌   | 39/60 [08:01<04:19, 12.34s/it]

0.9876186563855126


 67%|██████▋   | 40/60 [08:13<04:06, 12.33s/it]

0.9840199564184461


 68%|██████▊   | 41/60 [08:26<03:54, 12.33s/it]

0.9807431896527609


 70%|███████   | 42/60 [08:38<03:41, 12.31s/it]

0.980425748087111


 72%|███████▏  | 43/60 [08:50<03:29, 12.30s/it]

0.9777440371967497


 73%|███████▎  | 44/60 [09:03<03:17, 12.34s/it]

0.9693419266314733


 75%|███████▌  | 45/60 [09:15<03:05, 12.34s/it]

0.9693884466375623


 77%|███████▋  | 46/60 [09:27<02:52, 12.35s/it]

0.9624513898577008


 78%|███████▊  | 47/60 [09:40<02:40, 12.34s/it]

0.96063913050152


 80%|████████  | 48/60 [09:52<02:28, 12.34s/it]

0.9609340386731284


 82%|████████▏ | 49/60 [10:04<02:15, 12.32s/it]

0.9602608538809276


 83%|████████▎ | 50/60 [10:16<02:03, 12.30s/it]

0.9597069904917762


 85%|████████▌ | 51/60 [10:29<01:50, 12.29s/it]

0.9538493440264747


 87%|████████▋ | 52/60 [10:41<01:38, 12.27s/it]

0.9516833978039878


 88%|████████▊ | 53/60 [10:53<01:25, 12.28s/it]

0.9451062764440264


 90%|█████████ | 54/60 [11:06<01:13, 12.28s/it]

0.941167645511173


 92%|█████████▏| 55/60 [11:18<01:01, 12.28s/it]

0.9400020965508052


 93%|█████████▎| 56/60 [11:30<00:49, 12.29s/it]

0.9366258396988824


 95%|█████████▌| 57/60 [11:42<00:36, 12.28s/it]

0.9339136708350408


 97%|█████████▋| 58/60 [11:55<00:24, 12.29s/it]

0.9317708143166133


 98%|█████████▊| 59/60 [12:07<00:12, 12.30s/it]

0.9308102599212101


100%|██████████| 60/60 [12:19<00:00, 12.33s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

0.9308774599007198


  2%|▏         | 1/60 [00:12<12:04, 12.29s/it]

0.9019688268502554


  3%|▎         | 2/60 [00:24<11:53, 12.31s/it]

0.8781182368596395


  5%|▌         | 3/60 [00:36<11:42, 12.32s/it]

0.8702043678079333


  7%|▋         | 4/60 [00:49<11:30, 12.33s/it]

0.8643359456743512


  8%|▊         | 5/60 [01:01<11:19, 12.35s/it]

0.8597883340858278


 10%|█         | 6/60 [01:14<11:06, 12.34s/it]

0.8549575975963047


 12%|█▏        | 7/60 [01:26<10:54, 12.35s/it]

0.8515847708497729


 13%|█▎        | 8/60 [01:38<10:42, 12.35s/it]

0.8493059887772515


 15%|█▌        | 9/60 [01:51<10:31, 12.38s/it]

0.8471084747995649


 17%|█▋        | 10/60 [02:03<10:19, 12.40s/it]

0.8451624072733379


 18%|█▊        | 11/60 [02:16<10:08, 12.41s/it]

0.844211588303248


 20%|██        | 12/60 [02:28<09:56, 12.42s/it]

0.8417508786632901


 22%|██▏       | 13/60 [02:41<09:45, 12.45s/it]

0.8408075854891822


 23%|██▎       | 14/60 [02:53<09:31, 12.43s/it]

0.83997120034127


 25%|██▌       | 15/60 [03:05<09:18, 12.41s/it]

0.8372574902716137


 27%|██▋       | 16/60 [03:18<09:05, 12.40s/it]

0.8375340132486253


 28%|██▊       | 17/60 [03:30<08:52, 12.39s/it]

0.8345083196957906


 30%|███       | 18/60 [03:42<08:40, 12.39s/it]

0.8333869079748789


 32%|███▏      | 19/60 [03:55<08:28, 12.41s/it]

0.8334306350776127


 33%|███▎      | 20/60 [04:07<08:16, 12.42s/it]

0.8326169365928286


 35%|███▌      | 21/60 [04:20<08:04, 12.42s/it]

0.8313993158794585


 37%|███▋      | 22/60 [04:32<07:50, 12.38s/it]

0.8315674832889012


 38%|███▊      | 23/60 [04:44<07:37, 12.37s/it]

0.8298902979918888


 40%|████      | 24/60 [04:57<07:25, 12.39s/it]

0.8290105646564847


 42%|████▏     | 25/60 [05:09<07:13, 12.39s/it]

0.8277963485036578


 43%|████▎     | 26/60 [05:22<07:01, 12.40s/it]

0.8279832757654644


 45%|████▌     | 27/60 [05:34<06:49, 12.40s/it]

0.8265467910539537


 47%|████▋     | 28/60 [05:46<06:36, 12.38s/it]

0.8256107937721979


 48%|████▊     | 29/60 [05:59<06:24, 12.40s/it]

0.8245173054082053


 50%|█████     | 30/60 [06:11<06:12, 12.40s/it]

0.8253041520005181


 52%|█████▏    | 31/60 [06:24<05:59, 12.40s/it]

0.8237993731385186


 53%|█████▎    | 32/60 [06:36<05:47, 12.40s/it]

0.8229540345214662


 55%|█████▌    | 33/60 [06:48<05:34, 12.40s/it]

0.8226570089658102


 57%|█████▋    | 34/60 [07:01<05:21, 12.38s/it]

0.8215971432981037


 58%|█████▊    | 35/60 [07:13<05:08, 12.35s/it]

0.8206390752678826


 60%|██████    | 36/60 [07:25<04:55, 12.33s/it]

0.8208597983632769


 62%|██████▏   | 37/60 [07:38<04:43, 12.31s/it]

0.8185488496507917


 63%|██████▎   | 38/60 [07:50<04:30, 12.30s/it]

0.8178979002294087


 65%|██████▌   | 39/60 [08:02<04:18, 12.32s/it]

0.819680582909357


 67%|██████▋   | 40/60 [08:15<04:06, 12.32s/it]

0.8188874593802861


 68%|██████▊   | 41/60 [08:27<03:54, 12.33s/it]

0.8181406387260982


 70%|███████   | 42/60 [08:39<03:41, 12.32s/it]

0.818142450991131


 72%|███████▏  | 43/60 [08:52<03:29, 12.33s/it]

0.8173704331829434


 73%|███████▎  | 44/60 [09:04<03:17, 12.33s/it]

0.8163428178855351


 75%|███████▌  | 45/60 [09:16<03:05, 12.37s/it]

0.816184903894152


 77%|███████▋  | 46/60 [09:29<02:53, 12.38s/it]

0.8143918287186396


 78%|███████▊  | 47/60 [09:41<02:40, 12.38s/it]

0.8145896806603387


 80%|████████  | 48/60 [09:54<02:28, 12.38s/it]

0.8137880095413753


 82%|████████▏ | 49/60 [10:06<02:16, 12.39s/it]

0.8146908339999971


 83%|████████▎ | 50/60 [10:18<02:03, 12.36s/it]

0.8137789354437873


 85%|████████▌ | 51/60 [10:31<01:51, 12.38s/it]

0.8123417922428676


 87%|████████▋ | 52/60 [10:43<01:38, 12.37s/it]

0.8141340088276636


 88%|████████▊ | 53/60 [10:55<01:26, 12.39s/it]

0.81120517991838


 90%|█████████ | 54/60 [11:08<01:14, 12.39s/it]

0.8115059321834928


 92%|█████████▏| 55/60 [11:20<01:01, 12.38s/it]

0.8119404330140069


 93%|█████████▎| 56/60 [11:32<00:49, 12.35s/it]

0.811742195061275


 95%|█████████▌| 57/60 [11:45<00:36, 12.33s/it]

0.8095639631861732


 97%|█████████▋| 58/60 [11:57<00:24, 12.34s/it]

0.8095434875715346


 98%|█████████▊| 59/60 [12:09<00:12, 12.33s/it]

0.8095497531550271


100%|██████████| 60/60 [12:22<00:00, 12.37s/it]

0.8088172078132629





In [44]:
model_score = evaluate_results(
    test_img_features, 
    caption_model,
    captions[8332:],
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
)


Generating captions...
tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [45]:
results = defaultdict(list)
for key, value in model_score.items():
    results[key].append(value)

In [36]:
results

defaultdict(list,
            {'Bleu_1': [0.5243844696969449],
             'Bleu_2': [0.36773382448793585],
             'Bleu_3': [0.28169001186113957],
             'Bleu_4': [0.2252378577258682],
             'METEOR': [0.21494328519587247],
             'ROUGE_L': [0.4091754918572505],
             'CIDEr': [1.0331197511250043],
             'SPICE': [0.2592197534586306],
             'USC_similarity': [0.4807721203469248]})

In [46]:
results

defaultdict(list,
            {'Bleu_1': [0.5557901680271453],
             'Bleu_2': [0.4472982119806543],
             'Bleu_3': [0.3772014078949239],
             'Bleu_4': [0.32655692714139967],
             'METEOR': [0.23561333021819827],
             'ROUGE_L': [0.4829312753952617],
             'CIDEr': [1.8552872986743107],
             'SPICE': [0.24283571565742926],
             'USC_similarity': [0.5491626800486601]})