## Image Captioning with Pytorch

The following contents are modified from MDS DSCI 575 lecture 8 demo

In [1]:
import os, sys, json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.model_selection import KFold

START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True


In [2]:
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"
        
if AWS:
    root_captioning = "../../s3"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [5]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [6]:
# get img path and caption list
# # only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
test_paths, test_descriptions, max_length_test = get_img_info('valid')
max_length = max(max_length_train, max_length_test)



In [7]:
all_paths = train_paths.copy()
all_paths.extend(test_paths.copy())
all_paths = np.array(all_paths)

all_descriptions = train_descriptions.copy()
all_descriptions.extend(test_descriptions.copy())
all_descriptions = np.array(all_descriptions)

captions = all_descriptions.copy()
max_length_all = max(max_length_train, max_length_test)
max_length = max_length_all + 2
      
lex = set()
for sen in all_descriptions:
    [lex.update(d.split()) for d in sen]
    
# add a start and stop token at the beginning/end
for v in all_descriptions:
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'
        
print(f'There are {len(all_paths)} images') 
print(f'There are {len(lex)} unique words (vocab)')
print(f'The maximum length of captions with start and stop token is {max_length}.')


There are 10416 images
There are 2912 unique words (vocab)
The maximum length of captions with start and stop token is 36.


In [8]:
all_paths[-1]

'../../s3/valid/rsicd_park_33.jpg'

In [9]:
all_descriptions[-1]

array(['startseq a vast artificial lake was built in the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq'],
      dtype='<U184')

### Loading Glove Embeddings

In [10]:
embeddings_index = {} 
path = os.path.join(root_captioning, 'glove.6B.200d.txt') if AWS\
else os.path.join(root_captioning, 'raw', 'glove.6B.200d.txt')

f = open(
    path, 
    encoding="utf-8"
)

for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print(f'Found {len(embeddings_index)} word vectors.')

400000it [00:22, 17547.21it/s]

Found 400000 word vectors.





In [11]:
def get_vocab(descriptions, word_count_threshold=10):

    captions = []
    for val in descriptions:
        for cap in val:
            captions.append(cap)
    print(f'There are {len(captions)} captions')
    
    word_counts = {}
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
    return vocab

def get_word_dict(vocab):
    
    idxtoword = {}
    wordtoidx = {}

    ix = 1
    for w in vocab:
        wordtoidx[w] = ix
        idxtoword[ix] = w
        ix += 1

    return idxtoword, wordtoidx

def get_vocab_size(idxtoword):
    
    print(f'The vocabulary size is {len(idxtoword) + 1}.')
    return len(idxtoword) + 1


def get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx):

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count = 0

    for word, i in wordtoidx.items():

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            count += 1
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
            
    print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')            
    print(f'The size of embedding_matrix is {embedding_matrix.shape}')
    return embedding_matrix

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
class CNNModel(nn.Module):

    def __init__(self, cnn_type, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        if cnn_type == 'vgg16':
            self.model = models.vgg16(pretrained=pretrained)

            # remove the last two layers in classifier
            self.model.classifier = nn.Sequential(
              *list(self.model.classifier.children())[:-2]
            )
            self.input_size = 224     

        # inception v3 expects (299, 299) sized images
        elif cnn_type == 'inception_v3':
            self.model = models.inception_v3(pretrained=pretrained)
            # remove the classification layer
            self.model.fc = nn.Identity()

            # turn off auxiliary output
            self.model.aux_logits = False
            self.input_size = 299

        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
            # set the model to evaluation model
            self.model.eval()

        return self.model(img_input)

In [14]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
              'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train

        self.dropout = nn.Dropout(p=0.5)

        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
 

    def forward(self, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # embed the captions
        embedding = self.dropout(self.embedding(captions))

        outputs, (h, c) = self.lstm(embedding)

        return outputs, (h, c)



In [15]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        feature_size: int
            the number of features in the image matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        if cnn_type == 'vgg16':
            self.feature_size = 4096
        elif cnn_type == 'inception_v3':
            self.feature_size = 2048
        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")  

        self.decoder = RNNModel(
            vocab_size, 
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )
        
        self.dropout = nn.Dropout(p=0.5)
        self.dense1 = nn.Linear(self.feature_size, hidden_size) 
        self.relu1 = nn.ReLU()
          
        self.dense2 = nn.Linear(hidden_size, hidden_size) 
        self.relu2 = nn.ReLU()
        self.dense3 = nn.Linear(hidden_size, vocab_size) 

    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor
            the image feature matrix
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        img_features =\
        self.relu1(
            self.dense1(
                self.dropout(
                    img_features
                )
            )
        )

        decoder_out, _ = self.decoder(captions)

        # add up decoder outputs and image features
        outputs =\
        self.dense3(
            self.relu2(
                self.dense2(
                    decoder_out.add(
                        (img_features.view(img_features.size(0), 1, -1))\
                        .repeat(1, decoder_out.size(1), 1)
                    )
                )
            )
        )

        return outputs

### Train the Neural Network

In [16]:
def train(model, iterator, optimizer, criterion, clip, vocab_size):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs = model(
            img_features.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        )
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [17]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
    ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        # return len(self.descriptions)
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img = self.imgs[idx // 5]
        # convert each word into a list of sequences.
        seq = [self.wordtoidx[word] for word 
               in self.descriptions[idx // 5][idx % 5].split(' ')
               if word in self.wordtoidx]
        # pad the sequence with 0 on the right side
        in_seq = np.pad(
            seq, 
            (0, max_length - len(seq)),
            mode='constant',
            constant_values=(0, 0)
            )

        return img, in_seq


In [18]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [19]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape)
    # Call model to extract the smaller feature set for the image.
    x = model(img.to(device), False) 
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.squeeze(x)
    return x

In [20]:
def extract_img_features(img_paths, model):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    start = time()
    img_features = []

    for image_path in img_paths:
        img_features.append(
            encode_image(model, image_path).cpu().data.numpy()
    )

    print(f"\nGenerating set took: {hms_string(time()-start)}")

    return img_features

In [21]:
def get_train_test(
    encoder,
    train_paths,
    test_paths,
    cnn_type='inception_v3',
):

    train_img_features = extract_img_features(
        train_paths,
        encoder
    )

    test_img_features = extract_img_features(
        test_paths,
        encoder
    )
    return train_img_features, test_img_features

def get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
):
    train_dataset = SampleDataset(
        train_descriptions,
        train_img_features,
        wordtoidx,
        max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size
    )
    
    return train_loader

def train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    cnn_type='inception_v3',
    hidden_size=256,
):

    caption_model = CaptionModel(
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=hidden_size,
        embedding_matrix=embedding_matrix, 
        embedding_train=False
    )

    init_weights(
        caption_model,
        embedding_pretrained=True
    )

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(
        caption_model.parameters(), 
        lr=0.01
    )

    clip = 1
    start = time()

    for i in tqdm(range(EPOCHS * 6)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)

    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 6)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)
    return caption_model

In [22]:
def generateCaption(
    model, 
    img_features,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    in_text = START

    for i in range(max_length):

        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = np.pad(sequence, (0, max_length - len(sequence)),
                          mode='constant', constant_values=(0, 0))
        model.eval()
        yhat = model(
            torch.FloatTensor(img_features)\
            .view(-1, model.feature_size).to(device),
            torch.LongTensor(sequence).view(-1, max_length).to(device)
        )

        yhat = yhat.view(-1, vocab_size).argmax(1)
        word = idxtoword[yhat.cpu().data.numpy()[i]]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1 : -1]
    final = ' '.join(final)
    return final

### Evaluation

In [23]:
sys.path.append('../scr/evaluation/')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.usc_sim.usc_sim import usc_sim
import subprocess


def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    """
    # download stanford nlp library
    subprocess.call(['../scr/evaluation/get_stanford_models.sh'])
    
    # format the inputs
    gts = {}
    res = {}

    for imgId in range(len(ref_data)):
        caption_list_sel = []
        for i in range(5):
            lst = {}
            lst['caption'] = ref_data[imgId][i]
            lst['image_id'] = imgId
            lst['id'] = i
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        res[imgId] = [{'caption': results[imgId]}]
        
    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts  = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    
    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),  
        ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...'%(scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
        else:
            score_dict[method] = score
            
    return score_dict


In [24]:
def evaluate_results(
    test_img_features, 
    model,
    ref,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    # generate results
    print('Generating captions...')
    results = {}
    for n in range(len(test_img_features)):
        img_features = test_img_features[n]
        generated = generateCaption(
            model, 
            img_features,
            max_length,
            vocab_size,
            wordtoidx,
            idxtoword
        )
        results[n] = generated
        
    model_score = eval_model(ref, results)

    return model_score

### Cross validation

In [None]:
cnn_type = 'inception_v3'
encoder = CNNModel(cnn_type, pretrained=True)
encoder.to(device)

In [26]:
def cross_validation(train_index, test_index, count):
    print('=' * 60)
    print(f'Split {count}:')
    print(f'Splitting data...')
    
    train_paths, test_paths = all_paths[train_index], all_paths[test_index]
    train_descriptions, test_descriptions = all_descriptions[train_index], all_descriptions[test_index]
    print(f'{len(train_paths)} images for training and {len(test_paths)} images for testing.')
    
    vocab = get_vocab(train_descriptions, word_count_threshold=10)
    idxtoword, wordtoidx = get_word_dict(vocab)
    vocab_size = get_vocab_size(idxtoword)
    embedding_dim = 200
    embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

    print(f'Preparing dataloader...')
    train_img_features, test_img_features = get_train_test(encoder, train_paths, test_paths)

    train_loader = get_train_dataloader(
        train_descriptions, 
        train_img_features,
        wordtoidx,
        max_length,
        batch_size=1000
    )

    print(f'Training...')
    caption_model = train_model(
        train_loader,
        vocab_size,
        embedding_dim, 
        embedding_matrix
    )

    
    ref = captions[test_index]
    model_score = evaluate_results(
        test_img_features, 
        caption_model,
        ref,
        max_length,
        vocab_size,
        wordtoidx,
        idxtoword
    )
    
    return caption_model, model_score

In [27]:
cv = KFold(n_splits=5)
cv = [(train_index, test_index) for train_index, test_index in cv.split(all_paths)]  

In [28]:
caption_model1, model_score1 = cross_validation(cv[0][0], cv[0][1], 1)    

Split 1:
Splitting data...
8332 images for training and 2084 images for testing.
There are 41660 captions
preprocessed words 2677 ==> 890
The vocabulary size is 891.
802 out of 891 words are found in the pre-trained matrix.
The size of embedding_matrix is (891, 200)
Preparing dataloader...

Generating set took: 0:03:46.08


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:56.33
Training...


  2%|▏         | 1/60 [00:00<00:43,  1.36it/s]

6.832412666744656


  3%|▎         | 2/60 [00:01<00:42,  1.37it/s]

5.045856952667236


  5%|▌         | 3/60 [00:02<00:41,  1.39it/s]

4.665201716952854


  7%|▋         | 4/60 [00:02<00:40,  1.40it/s]

4.4976259337531195


  8%|▊         | 5/60 [00:03<00:38,  1.42it/s]

4.366277323828803


 10%|█         | 6/60 [00:04<00:38,  1.40it/s]

4.203340848286946


 12%|█▏        | 7/60 [00:04<00:37,  1.42it/s]

3.984089348051283


 13%|█▎        | 8/60 [00:05<00:36,  1.42it/s]

3.735374371210734


 15%|█▌        | 9/60 [00:06<00:35,  1.42it/s]

3.4494495656755237


 17%|█▋        | 10/60 [00:07<00:34,  1.43it/s]

3.16546450720893


 18%|█▊        | 11/60 [00:07<00:34,  1.42it/s]

2.9739466773139105


 20%|██        | 12/60 [00:08<00:33,  1.42it/s]

2.844272428088718


 22%|██▏       | 13/60 [00:09<00:33,  1.42it/s]

2.7501567469702826


 23%|██▎       | 14/60 [00:09<00:32,  1.43it/s]

2.679328759511312


 25%|██▌       | 15/60 [00:10<00:31,  1.41it/s]

2.6192585362328424


 27%|██▋       | 16/60 [00:11<00:31,  1.40it/s]

2.572774330774943


 28%|██▊       | 17/60 [00:12<00:30,  1.40it/s]

2.5418820116255016


 30%|███       | 18/60 [00:12<00:29,  1.40it/s]

2.488095839818319


 32%|███▏      | 19/60 [00:13<00:29,  1.40it/s]

2.453247308731079


 33%|███▎      | 20/60 [00:14<00:28,  1.40it/s]

2.423310544755724


 35%|███▌      | 21/60 [00:14<00:27,  1.41it/s]

2.400469700495402


 37%|███▋      | 22/60 [00:15<00:26,  1.43it/s]

2.3687525855170355


 38%|███▊      | 23/60 [00:16<00:26,  1.39it/s]

2.349749114778307


 40%|████      | 24/60 [00:16<00:25,  1.40it/s]

2.323310719596015


 42%|████▏     | 25/60 [00:17<00:25,  1.40it/s]

2.2969721688164606


 43%|████▎     | 26/60 [00:18<00:24,  1.39it/s]

2.2721397082010903


 45%|████▌     | 27/60 [00:19<00:23,  1.39it/s]

2.252852095497979


 47%|████▋     | 28/60 [00:20<00:25,  1.28it/s]

2.2403679158952503


 48%|████▊     | 29/60 [00:20<00:23,  1.30it/s]

2.213851081000434


 50%|█████     | 30/60 [00:21<00:22,  1.31it/s]

2.2014306121402316


 52%|█████▏    | 31/60 [00:22<00:21,  1.34it/s]

2.1815412971708508


 53%|█████▎    | 32/60 [00:23<00:20,  1.36it/s]

2.164588729540507


 55%|█████▌    | 33/60 [00:23<00:19,  1.36it/s]

2.151034196217855


 57%|█████▋    | 34/60 [00:24<00:18,  1.39it/s]

2.137797792752584


 58%|█████▊    | 35/60 [00:25<00:17,  1.41it/s]

2.122876617643568


 60%|██████    | 36/60 [00:25<00:16,  1.41it/s]

2.1113280985090466


 62%|██████▏   | 37/60 [00:26<00:16,  1.43it/s]

2.102520373132494


 63%|██████▎   | 38/60 [00:27<00:15,  1.44it/s]

2.0896215306388006


 65%|██████▌   | 39/60 [00:27<00:14,  1.42it/s]

2.0862324237823486


 67%|██████▋   | 40/60 [00:28<00:14,  1.42it/s]

2.071008231904772


 68%|██████▊   | 41/60 [00:29<00:13,  1.43it/s]

2.057939304245843


 70%|███████   | 42/60 [00:30<00:12,  1.40it/s]

2.0403859350416393


 72%|███████▏  | 43/60 [00:30<00:12,  1.40it/s]

2.030710140864054


 73%|███████▎  | 44/60 [00:31<00:11,  1.42it/s]

2.0291981961992054


 75%|███████▌  | 45/60 [00:32<00:10,  1.41it/s]

2.009916557206048


 77%|███████▋  | 46/60 [00:32<00:09,  1.41it/s]

2.0106710460450916


 78%|███████▊  | 47/60 [00:33<00:09,  1.41it/s]

2.0070531765619912


 80%|████████  | 48/60 [00:34<00:08,  1.42it/s]

1.997168673409356


 82%|████████▏ | 49/60 [00:34<00:07,  1.42it/s]

1.9707506630155776


 83%|████████▎ | 50/60 [00:35<00:07,  1.41it/s]

1.9594879945119221


 85%|████████▌ | 51/60 [00:36<00:06,  1.42it/s]

1.9589494864145915


 87%|████████▋ | 52/60 [00:37<00:05,  1.41it/s]

1.9556055201424494


 88%|████████▊ | 53/60 [00:37<00:04,  1.42it/s]

1.9538569847742717


 90%|█████████ | 54/60 [00:38<00:04,  1.41it/s]

1.935389518737793


 92%|█████████▏| 55/60 [00:39<00:03,  1.40it/s]

1.9405285252465143


 93%|█████████▎| 56/60 [00:40<00:03,  1.30it/s]

1.9237134721544054


 95%|█████████▌| 57/60 [00:40<00:02,  1.33it/s]

1.9098552862803142


 97%|█████████▋| 58/60 [00:41<00:01,  1.35it/s]

1.8922153181499906


 98%|█████████▊| 59/60 [00:42<00:00,  1.36it/s]

1.8696208794911702


100%|██████████| 60/60 [00:43<00:00,  1.40it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

1.8517011139127943


  2%|▏         | 1/60 [00:00<00:40,  1.47it/s]

1.8120644489924114


  3%|▎         | 2/60 [00:01<00:39,  1.45it/s]

1.8005098236931696


  5%|▌         | 3/60 [00:02<00:39,  1.46it/s]

1.7886999977959528


  7%|▋         | 4/60 [00:02<00:38,  1.46it/s]

1.7807283798853557


  8%|▊         | 5/60 [00:03<00:38,  1.44it/s]

1.7766426934136286


 10%|█         | 6/60 [00:04<00:37,  1.43it/s]

1.7725627952151828


 12%|█▏        | 7/60 [00:04<00:37,  1.42it/s]

1.7725704113642375


 13%|█▎        | 8/60 [00:05<00:36,  1.41it/s]

1.7719478607177734


 15%|█▌        | 9/60 [00:06<00:36,  1.40it/s]

1.764702558517456


 17%|█▋        | 10/60 [00:07<00:35,  1.42it/s]

1.7635152339935303


 18%|█▊        | 11/60 [00:07<00:34,  1.42it/s]

1.762580884827508


 20%|██        | 12/60 [00:08<00:33,  1.43it/s]

1.762547042634752


 22%|██▏       | 13/60 [00:09<00:34,  1.38it/s]

1.7621402475568984


 23%|██▎       | 14/60 [00:09<00:33,  1.37it/s]

1.7586484750111897


 25%|██▌       | 15/60 [00:10<00:32,  1.37it/s]

1.7559188471900091


 27%|██▋       | 16/60 [00:11<00:31,  1.38it/s]

1.7574272155761719


 28%|██▊       | 17/60 [00:12<00:30,  1.39it/s]

1.7580433421664767


 30%|███       | 18/60 [00:12<00:30,  1.39it/s]

1.7523724105623033


 32%|███▏      | 19/60 [00:13<00:31,  1.29it/s]

1.7532278166876898


 33%|███▎      | 20/60 [00:14<00:30,  1.31it/s]

1.7555302778879802


 35%|███▌      | 21/60 [00:15<00:29,  1.33it/s]

1.754157198799981


 37%|███▋      | 22/60 [00:15<00:28,  1.34it/s]

1.7509904040230646


 38%|███▊      | 23/60 [00:16<00:27,  1.34it/s]

1.7473224798838298


 40%|████      | 24/60 [00:17<00:26,  1.36it/s]

1.7514598369598389


 42%|████▏     | 25/60 [00:18<00:25,  1.37it/s]

1.7500399218665228


 43%|████▎     | 26/60 [00:18<00:24,  1.36it/s]

1.7496010727352567


 45%|████▌     | 27/60 [00:19<00:24,  1.37it/s]

1.7464117606480916


 47%|████▋     | 28/60 [00:20<00:23,  1.38it/s]

1.7474920882119074


 48%|████▊     | 29/60 [00:20<00:22,  1.40it/s]

1.7457094060050116


 50%|█████     | 30/60 [00:21<00:21,  1.40it/s]

1.7482218345006306


 52%|█████▏    | 31/60 [00:22<00:20,  1.42it/s]

1.7447067101796467


 53%|█████▎    | 32/60 [00:23<00:19,  1.44it/s]

1.7465289698706732


 55%|█████▌    | 33/60 [00:23<00:18,  1.43it/s]

1.7422371837827895


 57%|█████▋    | 34/60 [00:24<00:18,  1.42it/s]

1.7431287633048163


 58%|█████▊    | 35/60 [00:25<00:17,  1.43it/s]

1.744883139928182


 60%|██████    | 36/60 [00:25<00:16,  1.44it/s]

1.7446642716725667


 62%|██████▏   | 37/60 [00:26<00:16,  1.43it/s]

1.7424198521508112


 63%|██████▎   | 38/60 [00:27<00:15,  1.44it/s]

1.7429620689815946


 65%|██████▌   | 39/60 [00:27<00:14,  1.45it/s]

1.7400247388415866


 67%|██████▋   | 40/60 [00:28<00:14,  1.41it/s]

1.7405037350124783


 68%|██████▊   | 41/60 [00:29<00:13,  1.40it/s]

1.7396146059036255


 70%|███████   | 42/60 [00:30<00:12,  1.41it/s]

1.7399814393785265


 72%|███████▏  | 43/60 [00:30<00:12,  1.40it/s]

1.7343065738677979


 73%|███████▎  | 44/60 [00:31<00:11,  1.40it/s]

1.739209360546536


 75%|███████▌  | 45/60 [00:32<00:10,  1.41it/s]

1.7385296159320407


 77%|███████▋  | 46/60 [00:32<00:09,  1.40it/s]

1.7365938027699788


 78%|███████▊  | 47/60 [00:33<00:09,  1.42it/s]

1.733570721414354


 80%|████████  | 48/60 [00:34<00:08,  1.43it/s]

1.736678123474121


 82%|████████▏ | 49/60 [00:35<00:07,  1.42it/s]

1.736898316277398


 83%|████████▎ | 50/60 [00:35<00:07,  1.41it/s]

1.734258082177904


 85%|████████▌ | 51/60 [00:36<00:06,  1.39it/s]

1.7308242453469171


 87%|████████▋ | 52/60 [00:37<00:05,  1.38it/s]

1.735577556822035


 88%|████████▊ | 53/60 [00:37<00:05,  1.38it/s]

1.735016730096605


 90%|█████████ | 54/60 [00:38<00:04,  1.40it/s]

1.7317974832322862


 92%|█████████▏| 55/60 [00:39<00:03,  1.39it/s]

1.7340356641345553


 93%|█████████▎| 56/60 [00:40<00:02,  1.39it/s]

1.7331297000249226


 95%|█████████▌| 57/60 [00:40<00:02,  1.39it/s]

1.7308407492107816


 97%|█████████▋| 58/60 [00:41<00:01,  1.39it/s]

1.7323204941219754


 98%|█████████▊| 59/60 [00:42<00:00,  1.39it/s]

1.7300310267342462


100%|██████████| 60/60 [00:42<00:00,  1.40it/s]

1.7302672730551825
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [29]:
model_score1

{'Bleu_1': 0.44370052024426854,
 'Bleu_2': 0.27169985240406924,
 'Bleu_3': 0.18494146269162032,
 'Bleu_4': 0.13498287773923187,
 'METEOR': 0.167418534027843,
 'ROUGE_L': 0.3427375729530091,
 'CIDEr': 0.5061170014989845,
 'SPICE': 0.1683808590775102,
 'USC_similarity': 0.40798978003174524}

In [30]:
caption_model2, model_score2 = cross_validation(cv[1][0], cv[1][1], 2)    

Split 2:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions
preprocessed words 2691 ==> 898
The vocabulary size is 899.
808 out of 899 words are found in the pre-trained matrix.
The size of embedding_matrix is (899, 200)
Preparing dataloader...

Generating set took: 0:03:51.33


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:57.23
Training...


  2%|▏         | 1/60 [00:00<00:42,  1.39it/s]

5.645769543117947


  3%|▎         | 2/60 [00:01<00:41,  1.38it/s]

4.577365398406982


  5%|▌         | 3/60 [00:02<00:40,  1.40it/s]

4.231279797024197


  7%|▋         | 4/60 [00:02<00:40,  1.40it/s]

3.8304900328318277


  8%|▊         | 5/60 [00:03<00:39,  1.39it/s]

3.4063424269358316


 10%|█         | 6/60 [00:04<00:39,  1.38it/s]

2.982609272003174


 12%|█▏        | 7/60 [00:05<00:38,  1.38it/s]

2.656289021174113


 13%|█▎        | 8/60 [00:05<00:37,  1.37it/s]

2.432561159133911


 15%|█▌        | 9/60 [00:06<00:36,  1.38it/s]

2.2456947565078735


 17%|█▋        | 10/60 [00:07<00:36,  1.38it/s]

2.0732989973492093


 18%|█▊        | 11/60 [00:07<00:35,  1.36it/s]

1.9347041580412123


 20%|██        | 12/60 [00:08<00:35,  1.37it/s]

1.8248562415440877


 22%|██▏       | 13/60 [00:09<00:34,  1.36it/s]

1.729492677582635


 23%|██▎       | 14/60 [00:10<00:33,  1.37it/s]

1.6577041943868


 25%|██▌       | 15/60 [00:10<00:33,  1.35it/s]

1.5897743834389582


 27%|██▋       | 16/60 [00:11<00:31,  1.38it/s]

1.5513840383953519


 28%|██▊       | 17/60 [00:12<00:30,  1.39it/s]

1.4897398286395602


 30%|███       | 18/60 [00:13<00:30,  1.38it/s]

1.4420792526668973


 32%|███▏      | 19/60 [00:13<00:29,  1.37it/s]

1.3962931897905138


 33%|███▎      | 20/60 [00:14<00:29,  1.36it/s]

1.338678088453081


 35%|███▌      | 21/60 [00:15<00:28,  1.36it/s]

1.29021022717158


 37%|███▋      | 22/60 [00:16<00:27,  1.36it/s]

1.250702612929874


 38%|███▊      | 23/60 [00:16<00:26,  1.38it/s]

1.212013218137953


 40%|████      | 24/60 [00:17<00:26,  1.38it/s]

1.1803772913085089


 42%|████▏     | 25/60 [00:18<00:25,  1.39it/s]

1.1567895081308153


 43%|████▎     | 26/60 [00:18<00:24,  1.39it/s]

1.135439587963952


 45%|████▌     | 27/60 [00:19<00:24,  1.37it/s]

1.120755586359236


 47%|████▋     | 28/60 [00:20<00:23,  1.37it/s]

1.1017268498738606


 48%|████▊     | 29/60 [00:21<00:22,  1.36it/s]

1.0739420851071675


 50%|█████     | 30/60 [00:21<00:22,  1.36it/s]

1.0529018375608656


 52%|█████▏    | 31/60 [00:22<00:21,  1.36it/s]

1.034998145368364


 53%|█████▎    | 32/60 [00:23<00:20,  1.36it/s]

1.0307372676001654


 55%|█████▌    | 33/60 [00:24<00:19,  1.38it/s]

1.0077004697587755


 57%|█████▋    | 34/60 [00:24<00:18,  1.40it/s]

0.9801545408036973


 58%|█████▊    | 35/60 [00:25<00:17,  1.41it/s]

0.9644158946143256


 60%|██████    | 36/60 [00:26<00:18,  1.27it/s]

0.9396494560771518


 62%|██████▏   | 37/60 [00:27<00:17,  1.30it/s]

0.9224826825989617


 63%|██████▎   | 38/60 [00:27<00:16,  1.32it/s]

0.9236365755399069


 65%|██████▌   | 39/60 [00:28<00:15,  1.33it/s]

0.9291349781884087


 67%|██████▋   | 40/60 [00:29<00:14,  1.34it/s]

0.916340708732605


 68%|██████▊   | 41/60 [00:30<00:13,  1.36it/s]

0.9004723164770339


 70%|███████   | 42/60 [00:30<00:13,  1.37it/s]

0.8814195858107673


 72%|███████▏  | 43/60 [00:31<00:12,  1.36it/s]

0.8821589218245612


 73%|███████▎  | 44/60 [00:32<00:11,  1.35it/s]

0.8783423635694716


 75%|███████▌  | 45/60 [00:32<00:10,  1.37it/s]

0.880482965045505


 77%|███████▋  | 46/60 [00:33<00:10,  1.39it/s]

0.8714231650034586


 78%|███████▊  | 47/60 [00:34<00:09,  1.37it/s]

0.8548703127437167


 80%|████████  | 48/60 [00:35<00:08,  1.38it/s]

0.8378965689076318


 82%|████████▏ | 49/60 [00:35<00:08,  1.37it/s]

0.8309518727991316


 83%|████████▎ | 50/60 [00:36<00:07,  1.36it/s]

0.8155329293674893


 85%|████████▌ | 51/60 [00:37<00:06,  1.35it/s]

0.8096889555454254


 87%|████████▋ | 52/60 [00:38<00:05,  1.34it/s]

0.8003793259461721


 88%|████████▊ | 53/60 [00:38<00:05,  1.34it/s]

0.7858746151129404


 90%|█████████ | 54/60 [00:39<00:04,  1.35it/s]

0.7840284407138824


 92%|█████████▏| 55/60 [00:40<00:03,  1.34it/s]

0.7889034052689871


 93%|█████████▎| 56/60 [00:41<00:02,  1.34it/s]

0.7992852562003665


 95%|█████████▌| 57/60 [00:41<00:02,  1.35it/s]

0.7872316208150651


 97%|█████████▋| 58/60 [00:42<00:01,  1.36it/s]

0.776119546757804


 98%|█████████▊| 59/60 [00:43<00:00,  1.37it/s]

0.7522256010108523


100%|██████████| 60/60 [00:44<00:00,  1.36it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

0.72407665848732


  2%|▏         | 1/60 [00:00<00:41,  1.42it/s]

0.7029354804091983


  3%|▎         | 2/60 [00:01<00:40,  1.42it/s]

0.688002735376358


  5%|▌         | 3/60 [00:02<00:40,  1.40it/s]

0.6699600650204552


  7%|▋         | 4/60 [00:02<00:39,  1.41it/s]

0.6580461661020914


  8%|▊         | 5/60 [00:03<00:39,  1.40it/s]

0.6510186460283067


 10%|█         | 6/60 [00:04<00:38,  1.40it/s]

0.6471731430954404


 12%|█▏        | 7/60 [00:05<00:39,  1.35it/s]

0.6378854678736793


 13%|█▎        | 8/60 [00:05<00:38,  1.35it/s]

0.6396526197592417


 15%|█▌        | 9/60 [00:06<00:37,  1.36it/s]

0.6358244982030656


 17%|█▋        | 10/60 [00:07<00:37,  1.35it/s]

0.6319818662272559


 18%|█▊        | 11/60 [00:08<00:35,  1.36it/s]

0.6347409652339088


 20%|██        | 12/60 [00:08<00:35,  1.37it/s]

0.6355269683731927


 22%|██▏       | 13/60 [00:09<00:34,  1.36it/s]

0.6322841081354353


 23%|██▎       | 14/60 [00:10<00:34,  1.33it/s]

0.6293753484884897


 25%|██▌       | 15/60 [00:11<00:33,  1.33it/s]

0.6291298502021365


 27%|██▋       | 16/60 [00:11<00:33,  1.33it/s]

0.6303117242124345


 28%|██▊       | 17/60 [00:12<00:32,  1.33it/s]

0.6307353807820214


 30%|███       | 18/60 [00:13<00:31,  1.32it/s]

0.6240404380692376


 32%|███▏      | 19/60 [00:14<00:33,  1.21it/s]

0.6232460472318861


 33%|███▎      | 20/60 [00:15<00:32,  1.24it/s]

0.626682225200865


 35%|███▌      | 21/60 [00:15<00:30,  1.27it/s]

0.6274046434296502


 37%|███▋      | 22/60 [00:16<00:29,  1.30it/s]

0.621992439031601


 38%|███▊      | 23/60 [00:17<00:28,  1.31it/s]

0.6220714184972975


 40%|████      | 24/60 [00:18<00:27,  1.32it/s]

0.6213991641998291


 42%|████▏     | 25/60 [00:18<00:26,  1.33it/s]

0.6202345258659787


 43%|████▎     | 26/60 [00:19<00:25,  1.33it/s]

0.6175257894727919


 45%|████▌     | 27/60 [00:20<00:24,  1.33it/s]

0.6184669633706411


 47%|████▋     | 28/60 [00:20<00:23,  1.34it/s]

0.6187964744038053


 48%|████▊     | 29/60 [00:21<00:22,  1.35it/s]

0.6180184053050147


 50%|█████     | 30/60 [00:22<00:21,  1.36it/s]

0.6183225810527802


 52%|█████▏    | 31/60 [00:23<00:21,  1.38it/s]

0.6167570882373385


 53%|█████▎    | 32/60 [00:23<00:20,  1.38it/s]

0.61326378915045


 55%|█████▌    | 33/60 [00:24<00:19,  1.37it/s]

0.6140708062383864


 57%|█████▋    | 34/60 [00:25<00:19,  1.36it/s]

0.6174717247486115


 58%|█████▊    | 35/60 [00:26<00:18,  1.35it/s]

0.6125698089599609


 60%|██████    | 36/60 [00:26<00:17,  1.36it/s]

0.6086746785375807


 62%|██████▏   | 37/60 [00:27<00:16,  1.36it/s]

0.6108434100945791


 63%|██████▎   | 38/60 [00:28<00:16,  1.35it/s]

0.6122548745738136


 65%|██████▌   | 39/60 [00:29<00:15,  1.37it/s]

0.6122669147120582


 67%|██████▋   | 40/60 [00:29<00:14,  1.38it/s]

0.6088969111442566


 68%|██████▊   | 41/60 [00:30<00:13,  1.36it/s]

0.6078337033589681


 70%|███████   | 42/60 [00:31<00:13,  1.37it/s]

0.6078641480869718


 72%|███████▏  | 43/60 [00:31<00:12,  1.38it/s]

0.6102014912499322


 73%|███████▎  | 44/60 [00:32<00:11,  1.37it/s]

0.6076502667533027


 75%|███████▌  | 45/60 [00:33<00:11,  1.36it/s]

0.6077095435725318


 77%|███████▋  | 46/60 [00:34<00:10,  1.36it/s]

0.6101879676183065


 78%|███████▊  | 47/60 [00:34<00:09,  1.33it/s]

0.6088969873057472


 80%|████████  | 48/60 [00:35<00:09,  1.33it/s]

0.6061706278059218


 82%|████████▏ | 49/60 [00:36<00:08,  1.32it/s]

0.6069632371266683


 83%|████████▎ | 50/60 [00:37<00:07,  1.32it/s]

0.604370031091902


 85%|████████▌ | 51/60 [00:37<00:06,  1.34it/s]

0.6053571965959337


 87%|████████▋ | 52/60 [00:38<00:05,  1.35it/s]

0.6042705608738793


 88%|████████▊ | 53/60 [00:39<00:05,  1.36it/s]

0.6049499346150292


 90%|█████████ | 54/60 [00:40<00:04,  1.24it/s]

0.6031258470482297


 92%|█████████▏| 55/60 [00:41<00:03,  1.27it/s]

0.6038433445824517


 93%|█████████▎| 56/60 [00:41<00:03,  1.31it/s]

0.6021944648689694


 95%|█████████▌| 57/60 [00:42<00:02,  1.32it/s]

0.6024720536337959


 97%|█████████▋| 58/60 [00:43<00:01,  1.34it/s]

0.6006739702489641


 98%|█████████▊| 59/60 [00:44<00:00,  1.34it/s]

0.6018064154518975


100%|██████████| 60/60 [00:44<00:00,  1.34it/s]

0.5998433629671732
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [31]:
model_score2

{'Bleu_1': 0.585825922931798,
 'Bleu_2': 0.4526184568433141,
 'Bleu_3': 0.3700476747313412,
 'Bleu_4': 0.31303120977143245,
 'METEOR': 0.2658044587147694,
 'ROUGE_L': 0.49357493700548283,
 'CIDEr': 1.622259648009643,
 'SPICE': 0.33325997980186284,
 'USC_similarity': 0.5608009282757085}

In [32]:
caption_model3, model_score3 = cross_validation(cv[2][0], cv[2][1], 3)    

Split 3:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions
preprocessed words 2661 ==> 895
The vocabulary size is 896.
809 out of 896 words are found in the pre-trained matrix.
The size of embedding_matrix is (896, 200)
Preparing dataloader...

Generating set took: 0:03:59.54


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:59.69
Training...


  2%|▏         | 1/60 [00:00<00:44,  1.32it/s]

5.7362955411275225


  3%|▎         | 2/60 [00:01<00:43,  1.33it/s]

4.445639981163873


  5%|▌         | 3/60 [00:02<00:42,  1.33it/s]

4.107169363233778


  7%|▋         | 4/60 [00:02<00:41,  1.35it/s]

3.724518961376614


  8%|▊         | 5/60 [00:03<00:40,  1.35it/s]

3.2531770600212946


 10%|█         | 6/60 [00:04<00:39,  1.37it/s]

2.830841965145535


 12%|█▏        | 7/60 [00:05<00:38,  1.37it/s]

2.5055078665415444


 13%|█▎        | 8/60 [00:05<00:37,  1.37it/s]

2.2743659019470215


 15%|█▌        | 9/60 [00:06<00:37,  1.37it/s]

2.0920553869671292


 17%|█▋        | 10/60 [00:07<00:36,  1.37it/s]

1.9434380398856268


 18%|█▊        | 11/60 [00:08<00:35,  1.38it/s]

1.8220186630884807


 20%|██        | 12/60 [00:08<00:34,  1.38it/s]

1.7208101219601102


 22%|██▏       | 13/60 [00:09<00:33,  1.39it/s]

1.638394170337253


 23%|██▎       | 14/60 [00:10<00:34,  1.35it/s]

1.5691128174463909


 25%|██▌       | 15/60 [00:11<00:33,  1.32it/s]

1.5035124487347074


 27%|██▋       | 16/60 [00:11<00:32,  1.34it/s]

1.4401509761810303


 28%|██▊       | 17/60 [00:12<00:32,  1.34it/s]

1.383111596107483


 30%|███       | 18/60 [00:13<00:30,  1.36it/s]

1.3110091355111864


 32%|███▏      | 19/60 [00:13<00:29,  1.37it/s]

1.2514982024828594


 33%|███▎      | 20/60 [00:14<00:29,  1.35it/s]

1.2002763284577265


 35%|███▌      | 21/60 [00:15<00:29,  1.31it/s]

1.1581339240074158


 37%|███▋      | 22/60 [00:16<00:29,  1.28it/s]

1.1271691123644512


 38%|███▊      | 23/60 [00:17<00:28,  1.29it/s]

1.096757749716441


 40%|████      | 24/60 [00:17<00:27,  1.32it/s]

1.0628068910704718


 42%|████▏     | 25/60 [00:18<00:26,  1.34it/s]

1.0397584570778742


 43%|████▎     | 26/60 [00:19<00:24,  1.36it/s]

1.017675558725993


 45%|████▌     | 27/60 [00:19<00:24,  1.37it/s]

0.9990520742204454


 47%|████▋     | 28/60 [00:20<00:23,  1.37it/s]

0.9937889178593954


 48%|████▊     | 29/60 [00:21<00:22,  1.38it/s]

0.9804694387647841


 50%|█████     | 30/60 [00:22<00:21,  1.39it/s]

0.9649782247013516


 52%|█████▏    | 31/60 [00:22<00:20,  1.41it/s]

0.9662823213471307


 53%|█████▎    | 32/60 [00:23<00:20,  1.40it/s]

0.9290447367562188


 55%|█████▌    | 33/60 [00:24<00:19,  1.41it/s]

0.903111113442315


 57%|█████▋    | 34/60 [00:24<00:18,  1.41it/s]

0.8782157566812303


 58%|█████▊    | 35/60 [00:25<00:17,  1.41it/s]

0.8630613750881619


 60%|██████    | 36/60 [00:26<00:16,  1.41it/s]

0.8654500709639655


 62%|██████▏   | 37/60 [00:27<00:16,  1.40it/s]

0.8669471409585741


 63%|██████▎   | 38/60 [00:27<00:15,  1.38it/s]

0.8261048561996884


 65%|██████▌   | 39/60 [00:28<00:15,  1.39it/s]

0.8104843729072146


 67%|██████▋   | 40/60 [00:29<00:14,  1.40it/s]

0.7994894219769372


 68%|██████▊   | 41/60 [00:29<00:13,  1.39it/s]

0.7912202013863457


 70%|███████   | 42/60 [00:30<00:13,  1.38it/s]

0.8017164700561099


 72%|███████▏  | 43/60 [00:31<00:12,  1.38it/s]

0.809173627032174


 73%|███████▎  | 44/60 [00:32<00:11,  1.37it/s]

0.7743808262877994


 75%|███████▌  | 45/60 [00:32<00:10,  1.38it/s]

0.7404337128003439


 77%|███████▋  | 46/60 [00:33<00:10,  1.38it/s]

0.7096789909733666


 78%|███████▊  | 47/60 [00:34<00:09,  1.40it/s]

0.6988999446233114


 80%|████████  | 48/60 [00:35<00:08,  1.39it/s]

0.6857397920555539


 82%|████████▏ | 49/60 [00:35<00:07,  1.38it/s]

0.6829068859418234


 83%|████████▎ | 50/60 [00:36<00:07,  1.40it/s]

0.68275562259886


 85%|████████▌ | 51/60 [00:37<00:06,  1.41it/s]

0.6825738814142015


 87%|████████▋ | 52/60 [00:37<00:05,  1.42it/s]

0.6923580335246192


 88%|████████▊ | 53/60 [00:38<00:04,  1.40it/s]

0.6919178035524156


 90%|█████████ | 54/60 [00:39<00:04,  1.41it/s]

0.6995372970898946


 92%|█████████▏| 55/60 [00:40<00:03,  1.40it/s]

0.6886107093758054


 93%|█████████▎| 56/60 [00:40<00:02,  1.40it/s]

0.6803766853279538


 95%|█████████▌| 57/60 [00:41<00:02,  1.39it/s]

0.6694871650801765


 97%|█████████▋| 58/60 [00:42<00:01,  1.38it/s]

0.6567570964495341


 98%|█████████▊| 59/60 [00:42<00:00,  1.38it/s]

0.6669737266169654


100%|██████████| 60/60 [00:43<00:00,  1.37it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

0.6951323482725356


  2%|▏         | 1/60 [00:00<00:41,  1.44it/s]

0.6871179507838355


  3%|▎         | 2/60 [00:01<00:40,  1.42it/s]

0.6501864691575369


  5%|▌         | 3/60 [00:02<00:40,  1.40it/s]

0.6223072012265524


  7%|▋         | 4/60 [00:02<00:40,  1.39it/s]

0.6035942898856269


  8%|▊         | 5/60 [00:03<00:39,  1.39it/s]

0.595614986287223


 10%|█         | 6/60 [00:04<00:38,  1.40it/s]

0.5899471276336246


 12%|█▏        | 7/60 [00:05<00:37,  1.41it/s]

0.585107210609648


 13%|█▎        | 8/60 [00:05<00:37,  1.40it/s]

0.5812973115179274


 15%|█▌        | 9/60 [00:06<00:37,  1.38it/s]

0.5751340985298157


 17%|█▋        | 10/60 [00:07<00:36,  1.39it/s]

0.5783393846617805


 18%|█▊        | 11/60 [00:07<00:34,  1.40it/s]

0.5743687715795305


 20%|██        | 12/60 [00:08<00:33,  1.41it/s]

0.5754245155387454


 22%|██▏       | 13/60 [00:09<00:33,  1.40it/s]

0.568336695432663


 23%|██▎       | 14/60 [00:10<00:32,  1.41it/s]

0.569262703259786


 25%|██▌       | 15/60 [00:10<00:31,  1.42it/s]

0.5681127541595035


 27%|██▋       | 16/60 [00:11<00:31,  1.40it/s]

0.5668472250302633


 28%|██▊       | 17/60 [00:12<00:31,  1.37it/s]

0.5631752643320296


 30%|███       | 18/60 [00:12<00:31,  1.35it/s]

0.5612537893984053


 32%|███▏      | 19/60 [00:13<00:30,  1.35it/s]

0.5617475973235236


 33%|███▎      | 20/60 [00:14<00:29,  1.36it/s]

0.5599437488449944


 35%|███▌      | 21/60 [00:15<00:28,  1.36it/s]

0.5613202684455447


 37%|███▋      | 22/60 [00:15<00:27,  1.36it/s]

0.5576883090866936


 38%|███▊      | 23/60 [00:16<00:26,  1.38it/s]

0.5592848228083717


 40%|████      | 24/60 [00:17<00:25,  1.40it/s]

0.5577635202142928


 42%|████▏     | 25/60 [00:18<00:25,  1.38it/s]

0.5532437397374047


 43%|████▎     | 26/60 [00:18<00:24,  1.40it/s]

0.5572730037901137


 45%|████▌     | 27/60 [00:19<00:23,  1.39it/s]

0.5541394551595052


 47%|████▋     | 28/60 [00:20<00:23,  1.39it/s]

0.5531167818440331


 48%|████▊     | 29/60 [00:20<00:22,  1.39it/s]

0.5545135769579146


 50%|█████     | 30/60 [00:21<00:21,  1.40it/s]

0.5534688631693522


 52%|█████▏    | 31/60 [00:22<00:20,  1.41it/s]

0.5526725583606296


 53%|█████▎    | 32/60 [00:23<00:19,  1.42it/s]

0.5499558813042111


 55%|█████▌    | 33/60 [00:23<00:18,  1.43it/s]

0.5497921208540598


 57%|█████▋    | 34/60 [00:24<00:18,  1.41it/s]

0.548799455165863


 58%|█████▊    | 35/60 [00:25<00:17,  1.39it/s]

0.5485701892111037


 60%|██████    | 36/60 [00:26<00:19,  1.24it/s]

0.547674857907825


 62%|██████▏   | 37/60 [00:26<00:18,  1.28it/s]

0.546385344531801


 63%|██████▎   | 38/60 [00:27<00:16,  1.30it/s]

0.54680292473899


 65%|██████▌   | 39/60 [00:28<00:15,  1.34it/s]

0.5479786760277219


 67%|██████▋   | 40/60 [00:29<00:14,  1.35it/s]

0.5464102824529012


 68%|██████▊   | 41/60 [00:29<00:14,  1.35it/s]

0.5471263461642795


 70%|███████   | 42/60 [00:30<00:13,  1.37it/s]

0.5442341201835208


 72%|███████▏  | 43/60 [00:31<00:12,  1.39it/s]

0.5465599795182546


 73%|███████▎  | 44/60 [00:31<00:11,  1.39it/s]

0.5439131690396203


 75%|███████▌  | 45/60 [00:32<00:10,  1.37it/s]

0.5413195225927565


 77%|███████▋  | 46/60 [00:33<00:10,  1.36it/s]

0.5408879088030921


 78%|███████▊  | 47/60 [00:34<00:09,  1.35it/s]

0.5428700380855136


 80%|████████  | 48/60 [00:34<00:08,  1.35it/s]

0.5417144993940989


 82%|████████▏ | 49/60 [00:35<00:08,  1.37it/s]

0.5418592095375061


 83%|████████▎ | 50/60 [00:36<00:07,  1.36it/s]

0.5403436521689097


 85%|████████▌ | 51/60 [00:37<00:06,  1.37it/s]

0.5371318757534027


 87%|████████▋ | 52/60 [00:37<00:05,  1.37it/s]

0.5391531354851193


 88%|████████▊ | 53/60 [00:38<00:05,  1.37it/s]

0.5360083050198026


 90%|█████████ | 54/60 [00:39<00:04,  1.38it/s]

0.5376760330465105


 92%|█████████▏| 55/60 [00:40<00:03,  1.37it/s]

0.5375023649798499


 93%|█████████▎| 56/60 [00:40<00:02,  1.39it/s]

0.5365623434384664


 95%|█████████▌| 57/60 [00:41<00:02,  1.38it/s]

0.5362632638878293


 97%|█████████▋| 58/60 [00:42<00:01,  1.38it/s]

0.5363567305935754


 98%|█████████▊| 59/60 [00:42<00:00,  1.39it/s]

0.5374708871046702


100%|██████████| 60/60 [00:43<00:00,  1.38it/s]

0.53573563363817
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [33]:
model_score3

{'Bleu_1': 0.5512507902103968,
 'Bleu_2': 0.4113024085775112,
 'Bleu_3': 0.3269619479028768,
 'Bleu_4': 0.268903885803986,
 'METEOR': 0.24111665356645956,
 'ROUGE_L': 0.45651094923800817,
 'CIDEr': 1.3523910336536624,
 'SPICE': 0.3009601259339427,
 'USC_similarity': 0.5323188310313346}

In [34]:
caption_model4, model_score4 = cross_validation(cv[3][0], cv[3][1], 4)    

Split 4:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions
preprocessed words 2681 ==> 916
The vocabulary size is 917.
823 out of 917 words are found in the pre-trained matrix.
The size of embedding_matrix is (917, 200)
Preparing dataloader...

Generating set took: 0:03:50.55


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:56.83
Training...


  2%|▏         | 1/60 [00:00<00:43,  1.36it/s]

7.52719439400567


  3%|▎         | 2/60 [00:01<00:42,  1.37it/s]

5.124629179636638


  5%|▌         | 3/60 [00:02<00:42,  1.35it/s]

4.630448235405816


  7%|▋         | 4/60 [00:03<00:43,  1.28it/s]

4.443623065948486


  8%|▊         | 5/60 [00:03<00:44,  1.23it/s]

4.098739412095812


 10%|█         | 6/60 [00:04<00:44,  1.22it/s]

3.5398416784074573


 12%|█▏        | 7/60 [00:05<00:42,  1.26it/s]

3.0822820133633084


 13%|█▎        | 8/60 [00:06<00:40,  1.30it/s]

2.769583993487888


 15%|█▌        | 9/60 [00:07<00:38,  1.32it/s]

2.4936217731899686


 17%|█▋        | 10/60 [00:07<00:37,  1.33it/s]

2.2801890505684748


 18%|█▊        | 11/60 [00:08<00:36,  1.34it/s]

2.113480885823568


 20%|██        | 12/60 [00:09<00:35,  1.37it/s]

1.9760841925938923


 22%|██▏       | 13/60 [00:09<00:33,  1.38it/s]

1.8689353730943468


 23%|██▎       | 14/60 [00:10<00:33,  1.37it/s]

1.7703835699293349


 25%|██▌       | 15/60 [00:11<00:33,  1.36it/s]

1.6915104389190674


 27%|██▋       | 16/60 [00:12<00:31,  1.38it/s]

1.6259613302018907


 28%|██▊       | 17/60 [00:12<00:30,  1.39it/s]

1.5697189039654202


 30%|███       | 18/60 [00:13<00:30,  1.38it/s]

1.5218022134568956


 32%|███▏      | 19/60 [00:14<00:29,  1.37it/s]

1.473482608795166


 33%|███▎      | 20/60 [00:14<00:29,  1.37it/s]

1.410090724627177


 35%|███▌      | 21/60 [00:15<00:28,  1.39it/s]

1.3659803138838873


 37%|███▋      | 22/60 [00:16<00:28,  1.35it/s]

1.3228942420747545


 38%|███▊      | 23/60 [00:17<00:26,  1.37it/s]

1.286439643965827


 40%|████      | 24/60 [00:17<00:26,  1.37it/s]

1.2507940000957913


 42%|████▏     | 25/60 [00:18<00:25,  1.37it/s]

1.217521157529619


 43%|████▎     | 26/60 [00:19<00:24,  1.37it/s]

1.199440015686883


 45%|████▌     | 27/60 [00:20<00:24,  1.37it/s]

1.1868186725510492


 47%|████▋     | 28/60 [00:20<00:23,  1.38it/s]

1.1838947468333774


 48%|████▊     | 29/60 [00:21<00:22,  1.39it/s]

1.1622988846566942


 50%|█████     | 30/60 [00:22<00:21,  1.38it/s]

1.1597675416204665


 52%|█████▏    | 31/60 [00:22<00:21,  1.38it/s]

1.154880662759145


 53%|█████▎    | 32/60 [00:23<00:20,  1.39it/s]

1.1214596496687994


 55%|█████▌    | 33/60 [00:24<00:19,  1.39it/s]

1.0763050715128581


 57%|█████▋    | 34/60 [00:25<00:18,  1.37it/s]

1.0617028872172039


 58%|█████▊    | 35/60 [00:25<00:18,  1.36it/s]

1.0322838094499376


 60%|██████    | 36/60 [00:26<00:17,  1.37it/s]

1.017735136879815


 62%|██████▏   | 37/60 [00:27<00:16,  1.38it/s]

1.0151416394445631


 63%|██████▎   | 38/60 [00:28<00:15,  1.39it/s]

0.9946948091189066


 65%|██████▌   | 39/60 [00:28<00:15,  1.38it/s]

0.9856408105956184


 67%|██████▋   | 40/60 [00:29<00:14,  1.37it/s]

0.9526637130313449


 68%|██████▊   | 41/60 [00:30<00:13,  1.38it/s]

0.9335499736997817


 70%|███████   | 42/60 [00:30<00:13,  1.38it/s]

0.9199918905893961


 72%|███████▏  | 43/60 [00:31<00:12,  1.38it/s]

0.906757652759552


 73%|███████▎  | 44/60 [00:32<00:11,  1.38it/s]

0.9007538821962144


 75%|███████▌  | 45/60 [00:33<00:11,  1.36it/s]

0.8926220138867696


 77%|███████▋  | 46/60 [00:33<00:10,  1.38it/s]

0.8872076173623403


 78%|███████▊  | 47/60 [00:34<00:09,  1.39it/s]

0.8766678074995676


 80%|████████  | 48/60 [00:35<00:08,  1.38it/s]

0.8706563744280074


 82%|████████▏ | 49/60 [00:35<00:07,  1.40it/s]

0.8647748596138425


 83%|████████▎ | 50/60 [00:36<00:07,  1.39it/s]

0.8516865736908383


 85%|████████▌ | 51/60 [00:37<00:06,  1.38it/s]

0.8413117263052199


 87%|████████▋ | 52/60 [00:38<00:05,  1.39it/s]

0.8357494605912102


 88%|████████▊ | 53/60 [00:38<00:05,  1.40it/s]

0.8275300363699595


 90%|█████████ | 54/60 [00:39<00:04,  1.41it/s]

0.8143937985102335


 92%|█████████▏| 55/60 [00:40<00:03,  1.41it/s]

0.79911208152771


 93%|█████████▎| 56/60 [00:41<00:02,  1.38it/s]

0.7933916548887888


 95%|█████████▌| 57/60 [00:41<00:02,  1.40it/s]

0.7877837419509888


 97%|█████████▋| 58/60 [00:42<00:01,  1.39it/s]

0.7880937622653114


 98%|█████████▊| 59/60 [00:43<00:00,  1.38it/s]

0.7891570793257819


100%|██████████| 60/60 [00:43<00:00,  1.37it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

0.8102025555239784


  2%|▏         | 1/60 [00:00<00:42,  1.38it/s]

0.7745937009652456


  3%|▎         | 2/60 [00:01<00:43,  1.35it/s]

0.7611450784736209


  5%|▌         | 3/60 [00:02<00:44,  1.29it/s]

0.7395907044410706


  7%|▋         | 4/60 [00:03<00:42,  1.31it/s]

0.7238430049684312


  8%|▊         | 5/60 [00:03<00:41,  1.34it/s]

0.7217708064450158


 10%|█         | 6/60 [00:04<00:39,  1.35it/s]

0.7154238190915849


 12%|█▏        | 7/60 [00:05<00:39,  1.36it/s]

0.7104326221677992


 13%|█▎        | 8/60 [00:05<00:38,  1.36it/s]

0.7061129808425903


 15%|█▌        | 9/60 [00:06<00:37,  1.37it/s]

0.7035863399505615


 17%|█▋        | 10/60 [00:07<00:36,  1.38it/s]

0.7010635733604431


 18%|█▊        | 11/60 [00:08<00:35,  1.38it/s]

0.6959320935938094


 20%|██        | 12/60 [00:09<00:39,  1.20it/s]

0.6956551571687063


 22%|██▏       | 13/60 [00:09<00:37,  1.26it/s]

0.694623738527298


 23%|██▎       | 14/60 [00:10<00:35,  1.29it/s]

0.6940913299719492


 25%|██▌       | 15/60 [00:11<00:33,  1.33it/s]

0.6919056077798208


 27%|██▋       | 16/60 [00:12<00:33,  1.32it/s]

0.6915580067369673


 28%|██▊       | 17/60 [00:12<00:31,  1.35it/s]

0.6885861688190036


 30%|███       | 18/60 [00:13<00:31,  1.35it/s]

0.6890061232778761


 32%|███▏      | 19/60 [00:14<00:30,  1.36it/s]

0.685779912604226


 33%|███▎      | 20/60 [00:15<00:29,  1.36it/s]

0.6835002501805624


 35%|███▌      | 21/60 [00:15<00:28,  1.37it/s]

0.685383806626002


 37%|███▋      | 22/60 [00:16<00:27,  1.37it/s]

0.6819849610328674


 38%|███▊      | 23/60 [00:17<00:27,  1.37it/s]

0.6816395819187164


 40%|████      | 24/60 [00:17<00:26,  1.35it/s]

0.6826610962549845


 42%|████▏     | 25/60 [00:18<00:25,  1.35it/s]

0.6794578201240964


 43%|████▎     | 26/60 [00:19<00:25,  1.36it/s]

0.6789345641930898


 45%|████▌     | 27/60 [00:20<00:24,  1.36it/s]

0.679856174521976


 47%|████▋     | 28/60 [00:20<00:23,  1.37it/s]

0.677162786324819


 48%|████▊     | 29/60 [00:21<00:22,  1.37it/s]

0.6768087612258064


 50%|█████     | 30/60 [00:22<00:21,  1.37it/s]

0.6762401660283407


 52%|█████▏    | 31/60 [00:23<00:21,  1.36it/s]

0.6732338534461128


 53%|█████▎    | 32/60 [00:23<00:20,  1.35it/s]

0.6758947968482971


 55%|█████▌    | 33/60 [00:24<00:19,  1.35it/s]

0.6771324806743197


 57%|█████▋    | 34/60 [00:25<00:19,  1.36it/s]

0.672588845094045


 58%|█████▊    | 35/60 [00:26<00:18,  1.36it/s]

0.6708869304921892


 60%|██████    | 36/60 [00:26<00:17,  1.36it/s]

0.670576787657208


 62%|██████▏   | 37/60 [00:27<00:16,  1.38it/s]

0.6717774305078719


 63%|██████▎   | 38/60 [00:28<00:16,  1.37it/s]

0.6730947924984826


 65%|██████▌   | 39/60 [00:28<00:15,  1.38it/s]

0.6703572041458554


 67%|██████▋   | 40/60 [00:29<00:14,  1.40it/s]

0.6714962290392982


 68%|██████▊   | 41/60 [00:30<00:13,  1.40it/s]

0.6706723670164744


 70%|███████   | 42/60 [00:31<00:12,  1.39it/s]

0.6687364677588145


 72%|███████▏  | 43/60 [00:31<00:12,  1.40it/s]

0.6675511002540588


 73%|███████▎  | 44/60 [00:32<00:11,  1.39it/s]

0.667930308315489


 75%|███████▌  | 45/60 [00:33<00:10,  1.39it/s]

0.6678218113051521


 77%|███████▋  | 46/60 [00:33<00:10,  1.37it/s]

0.6645684043566386


 78%|███████▊  | 47/60 [00:34<00:09,  1.32it/s]

0.6643427544169955


 80%|████████  | 48/60 [00:35<00:08,  1.33it/s]

0.6677676207489438


 82%|████████▏ | 49/60 [00:36<00:08,  1.34it/s]

0.6644431187046899


 83%|████████▎ | 50/60 [00:37<00:07,  1.34it/s]

0.6652893523375193


 85%|████████▌ | 51/60 [00:37<00:06,  1.34it/s]

0.6641656226581998


 87%|████████▋ | 52/60 [00:38<00:06,  1.30it/s]

0.6609613729847802


 88%|████████▊ | 53/60 [00:39<00:05,  1.26it/s]

0.6658707559108734


 90%|█████████ | 54/60 [00:40<00:04,  1.28it/s]

0.6628141403198242


 92%|█████████▏| 55/60 [00:40<00:03,  1.31it/s]

0.6635143359502157


 93%|█████████▎| 56/60 [00:41<00:02,  1.34it/s]

0.6663196914725833


 95%|█████████▌| 57/60 [00:42<00:02,  1.36it/s]

0.6626432306236691


 97%|█████████▋| 58/60 [00:43<00:01,  1.38it/s]

0.6630722913477156


 98%|█████████▊| 59/60 [00:43<00:00,  1.39it/s]

0.6590823332468668


100%|██████████| 60/60 [00:44<00:00,  1.35it/s]

0.6582882271872627
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [35]:
model_score4

{'Bleu_1': 0.5796030116358394,
 'Bleu_2': 0.44605851412593095,
 'Bleu_3': 0.3637675366511005,
 'Bleu_4': 0.3063890291961494,
 'METEOR': 0.26040915987569246,
 'ROUGE_L': 0.485221125401786,
 'CIDEr': 1.639486639660922,
 'SPICE': 0.33785721270330155,
 'USC_similarity': 0.5646504696606124}

In [36]:
caption_model5, model_score5 = cross_validation(cv[4][0], cv[4][1], 5)    

Split 5:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions
preprocessed words 2706 ==> 901
The vocabulary size is 902.
814 out of 902 words are found in the pre-trained matrix.
The size of embedding_matrix is (902, 200)
Preparing dataloader...

Generating set took: 0:03:52.00


  0%|          | 0/60 [00:00<?, ?it/s]


Generating set took: 0:00:57.12
Training...


  2%|▏         | 1/60 [00:00<00:43,  1.36it/s]

5.741277641720242


  3%|▎         | 2/60 [00:01<00:42,  1.36it/s]

4.56071122487386


  5%|▌         | 3/60 [00:02<00:41,  1.37it/s]

4.128212849299113


  7%|▋         | 4/60 [00:02<00:40,  1.37it/s]

3.648891713884142


  8%|▊         | 5/60 [00:03<00:40,  1.37it/s]

3.181062009599474


 10%|█         | 6/60 [00:04<00:39,  1.37it/s]

2.794436322318183


 12%|█▏        | 7/60 [00:05<00:38,  1.39it/s]

2.5194973945617676


 13%|█▎        | 8/60 [00:05<00:37,  1.38it/s]

2.2853175136778088


 15%|█▌        | 9/60 [00:06<00:36,  1.38it/s]

2.095007724232144


 17%|█▋        | 10/60 [00:07<00:36,  1.39it/s]

1.9370323684480455


 18%|█▊        | 11/60 [00:07<00:35,  1.38it/s]

1.8103412124845717


 20%|██        | 12/60 [00:08<00:34,  1.40it/s]

1.7002645068698459


 22%|██▏       | 13/60 [00:09<00:34,  1.38it/s]

1.6111289660135906


 23%|██▎       | 14/60 [00:10<00:33,  1.39it/s]

1.5514089398913913


 25%|██▌       | 15/60 [00:10<00:32,  1.39it/s]

1.4899570279651217


 27%|██▋       | 16/60 [00:11<00:32,  1.37it/s]

1.4293912649154663


 28%|██▊       | 17/60 [00:12<00:31,  1.36it/s]

1.3823989629745483


 30%|███       | 18/60 [00:13<00:30,  1.36it/s]

1.3369109895494249


 32%|███▏      | 19/60 [00:13<00:30,  1.36it/s]

1.277735710144043


 33%|███▎      | 20/60 [00:14<00:29,  1.35it/s]

1.2403669489754572


 35%|███▌      | 21/60 [00:15<00:30,  1.29it/s]

1.2414651115735371


 37%|███▋      | 22/60 [00:16<00:28,  1.32it/s]

1.1944323314560785


 38%|███▊      | 23/60 [00:16<00:28,  1.32it/s]

1.1323551734288533


 40%|████      | 24/60 [00:17<00:26,  1.34it/s]

1.089998847908444


 42%|████▏     | 25/60 [00:18<00:26,  1.33it/s]

1.0580456919140286


 43%|████▎     | 26/60 [00:19<00:25,  1.34it/s]

1.0365640189912584


 45%|████▌     | 27/60 [00:19<00:24,  1.34it/s]

1.0148691799905565


 47%|████▋     | 28/60 [00:20<00:23,  1.34it/s]

0.9986842407120599


 48%|████▊     | 29/60 [00:21<00:23,  1.35it/s]

0.9892022344801161


 50%|█████     | 30/60 [00:22<00:21,  1.37it/s]

0.9711843596564399


 52%|█████▏    | 31/60 [00:22<00:20,  1.39it/s]

0.9521377616458468


 53%|█████▎    | 32/60 [00:23<00:20,  1.37it/s]

0.945308181974623


 55%|█████▌    | 33/60 [00:24<00:19,  1.37it/s]

0.9354171885384454


 57%|█████▋    | 34/60 [00:24<00:19,  1.36it/s]

0.9255112674501207


 58%|█████▊    | 35/60 [00:25<00:18,  1.38it/s]

0.8947739203770956


 60%|██████    | 36/60 [00:26<00:17,  1.37it/s]

0.868588795264562


 62%|██████▏   | 37/60 [00:27<00:16,  1.37it/s]

0.8416806095176272


 63%|██████▎   | 38/60 [00:27<00:15,  1.39it/s]

0.8321205013328128


 65%|██████▌   | 39/60 [00:28<00:15,  1.37it/s]

0.8351529406176673


 67%|██████▋   | 40/60 [00:29<00:14,  1.37it/s]

0.8401046329074435


 68%|██████▊   | 41/60 [00:30<00:13,  1.37it/s]

0.8616936571068234


 70%|███████   | 42/60 [00:30<00:13,  1.36it/s]

0.8876435226864285


 72%|███████▏  | 43/60 [00:31<00:12,  1.37it/s]

0.8786770701408386


 73%|███████▎  | 44/60 [00:32<00:11,  1.35it/s]

0.8397060500250922


 75%|███████▌  | 45/60 [00:33<00:11,  1.35it/s]

0.8268968198034499


 77%|███████▋  | 46/60 [00:33<00:10,  1.29it/s]

0.7965229517883725


 78%|███████▊  | 47/60 [00:34<00:09,  1.31it/s]

0.7706766492790647


 80%|████████  | 48/60 [00:35<00:08,  1.34it/s]

0.7435933119720883


 82%|████████▏ | 49/60 [00:36<00:08,  1.35it/s]

0.7344736092620425


 83%|████████▎ | 50/60 [00:36<00:07,  1.36it/s]

0.735601845714781


 85%|████████▌ | 51/60 [00:37<00:06,  1.36it/s]

0.7288972900973426


 87%|████████▋ | 52/60 [00:38<00:05,  1.35it/s]

0.7250240743160248


 88%|████████▊ | 53/60 [00:38<00:05,  1.36it/s]

0.7363492018646665


 90%|█████████ | 54/60 [00:39<00:04,  1.36it/s]

0.7555989027023315


 92%|█████████▏| 55/60 [00:40<00:03,  1.38it/s]

0.7468681765927209


 93%|█████████▎| 56/60 [00:41<00:02,  1.39it/s]

0.7096663514773051


 95%|█████████▌| 57/60 [00:41<00:02,  1.38it/s]

0.6877372860908508


 97%|█████████▋| 58/60 [00:42<00:01,  1.39it/s]

0.672898444864485


 98%|█████████▊| 59/60 [00:43<00:00,  1.39it/s]

0.6780783765845828


100%|██████████| 60/60 [00:44<00:00,  1.35it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

0.6774551967779795


  2%|▏         | 1/60 [00:00<00:43,  1.37it/s]

0.6640706956386566


  3%|▎         | 2/60 [00:01<00:42,  1.37it/s]

0.64215773012903


  5%|▌         | 3/60 [00:02<00:41,  1.38it/s]

0.6187707483768463


  7%|▋         | 4/60 [00:02<00:40,  1.37it/s]

0.6029715140660604


  8%|▊         | 5/60 [00:03<00:39,  1.38it/s]

0.5932192438178592


 10%|█         | 6/60 [00:04<00:39,  1.36it/s]

0.5840161508984036


 12%|█▏        | 7/60 [00:05<00:38,  1.36it/s]

0.5782058073414696


 13%|█▎        | 8/60 [00:05<00:38,  1.37it/s]

0.5764071610238817


 15%|█▌        | 9/60 [00:06<00:37,  1.36it/s]

0.5721287495560117


 17%|█▋        | 10/60 [00:07<00:36,  1.38it/s]

0.5713270008563995


 18%|█▊        | 11/60 [00:07<00:35,  1.40it/s]

0.567888832754559


 20%|██        | 12/60 [00:08<00:34,  1.38it/s]

0.5671586791674296


 22%|██▏       | 13/60 [00:09<00:33,  1.40it/s]

0.5635672907034556


 23%|██▎       | 14/60 [00:10<00:33,  1.39it/s]

0.5651365849706862


 25%|██▌       | 15/60 [00:10<00:32,  1.38it/s]

0.5611238578955332


 27%|██▋       | 16/60 [00:11<00:31,  1.38it/s]

0.5599059893025292


 28%|██▊       | 17/60 [00:12<00:31,  1.38it/s]

0.5593868527147505


 30%|███       | 18/60 [00:13<00:30,  1.38it/s]

0.556883285442988


 32%|███▏      | 19/60 [00:13<00:29,  1.38it/s]

0.5617972248130374


 33%|███▎      | 20/60 [00:14<00:29,  1.38it/s]

0.5591660704877641


 35%|███▌      | 21/60 [00:15<00:28,  1.37it/s]

0.5562405718697442


 37%|███▋      | 22/60 [00:15<00:27,  1.37it/s]

0.5544161763456132


 38%|███▊      | 23/60 [00:16<00:26,  1.38it/s]

0.5544184413221147


 40%|████      | 24/60 [00:17<00:25,  1.39it/s]

0.5521129237280952


 42%|████▏     | 25/60 [00:18<00:25,  1.37it/s]

0.5521030326684316


 43%|████▎     | 26/60 [00:18<00:25,  1.35it/s]

0.5505931377410889


 45%|████▌     | 27/60 [00:19<00:24,  1.35it/s]

0.5524012214607663


 47%|████▋     | 28/60 [00:20<00:23,  1.35it/s]

0.5510298775302039


 48%|████▊     | 29/60 [00:21<00:22,  1.36it/s]

0.5488188531663682


 50%|█████     | 30/60 [00:21<00:21,  1.36it/s]

0.5475831164254082


 52%|█████▏    | 31/60 [00:22<00:20,  1.39it/s]

0.5485957761605581


 53%|█████▎    | 32/60 [00:23<00:20,  1.38it/s]

0.5490890774461958


 55%|█████▌    | 33/60 [00:23<00:19,  1.38it/s]

0.5466204384962717


 57%|█████▋    | 34/60 [00:24<00:18,  1.38it/s]

0.5471294224262238


 58%|█████▊    | 35/60 [00:25<00:18,  1.38it/s]

0.5464851491981082


 60%|██████    | 36/60 [00:26<00:17,  1.37it/s]

0.5464287565814124


 62%|██████▏   | 37/60 [00:26<00:16,  1.36it/s]

0.544079883231057


 63%|██████▎   | 38/60 [00:27<00:15,  1.38it/s]

0.5474189519882202


 65%|██████▌   | 39/60 [00:28<00:15,  1.38it/s]

0.5444265272882249


 67%|██████▋   | 40/60 [00:29<00:14,  1.37it/s]

0.5407533877425723


 68%|██████▊   | 41/60 [00:29<00:13,  1.36it/s]

0.5410366621282365


 70%|███████   | 42/60 [00:30<00:13,  1.35it/s]

0.5402789778179593


 72%|███████▏  | 43/60 [00:31<00:12,  1.37it/s]

0.5395753516091241


 73%|███████▎  | 44/60 [00:32<00:11,  1.39it/s]

0.5388572613398234


 75%|███████▌  | 45/60 [00:32<00:10,  1.38it/s]

0.5397464599874284


 77%|███████▋  | 46/60 [00:33<00:10,  1.40it/s]

0.5412185258335538


 78%|███████▊  | 47/60 [00:34<00:09,  1.40it/s]

0.5389618939823575


 80%|████████  | 48/60 [00:34<00:08,  1.40it/s]

0.5392910109625922


 82%|████████▏ | 49/60 [00:35<00:07,  1.39it/s]

0.5362198816405402


 83%|████████▎ | 50/60 [00:36<00:07,  1.32it/s]

0.534106559223599


 85%|████████▌ | 51/60 [00:37<00:06,  1.29it/s]

0.5337804787688785


 87%|████████▋ | 52/60 [00:37<00:06,  1.32it/s]

0.5373607377211252


 88%|████████▊ | 53/60 [00:38<00:05,  1.36it/s]

0.536438912153244


 90%|█████████ | 54/60 [00:39<00:04,  1.36it/s]

0.5359966589344872


 92%|█████████▏| 55/60 [00:40<00:03,  1.36it/s]

0.5339375270737542


 93%|█████████▎| 56/60 [00:40<00:02,  1.37it/s]

0.533739858203464


 95%|█████████▌| 57/60 [00:41<00:02,  1.38it/s]

0.5360089904732175


 97%|█████████▋| 58/60 [00:42<00:01,  1.38it/s]

0.5307264493571388


 98%|█████████▊| 59/60 [00:42<00:00,  1.39it/s]

0.53357951508628


100%|██████████| 60/60 [00:43<00:00,  1.37it/s]

0.5310547086927626
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [37]:
model_score5

{'Bleu_1': 0.5698063698877773,
 'Bleu_2': 0.4364792688567016,
 'Bleu_3': 0.35501624589403963,
 'Bleu_4': 0.2977226778566265,
 'METEOR': 0.2560261543785319,
 'ROUGE_L': 0.4800315771359558,
 'CIDEr': 1.6053627244544755,
 'SPICE': 0.3238653743761547,
 'USC_similarity': 0.553482708636809}

In [38]:
model_scores = defaultdict(list)
for scores in [model_score1, model_score2, model_score3, model_score4, model_score5]:
    for key, value in scores.items():
        model_scores[key].append(value)

In [39]:
model_scores

defaultdict(list,
            {'Bleu_1': [0.44370052024426854,
              0.585825922931798,
              0.5512507902103968,
              0.5796030116358394,
              0.5698063698877773],
             'Bleu_2': [0.27169985240406924,
              0.4526184568433141,
              0.4113024085775112,
              0.44605851412593095,
              0.4364792688567016],
             'Bleu_3': [0.18494146269162032,
              0.3700476747313412,
              0.3269619479028768,
              0.3637675366511005,
              0.35501624589403963],
             'Bleu_4': [0.13498287773923187,
              0.31303120977143245,
              0.268903885803986,
              0.3063890291961494,
              0.2977226778566265],
             'METEOR': [0.167418534027843,
              0.2658044587147694,
              0.24111665356645956,
              0.26040915987569246,
              0.2560261543785319],
             'ROUGE_L': [0.3427375729530091,
              0.4935749370

In [None]:
tag = '9.1.2'
with open(f'{root_captioning}/fz_notebooks/cv_n{tag}.json', 'w') as fp:
    json.dump(model_scores, fp)