## Image Captioning with Pytorch

The following contents are modified from MDS DSCI 575 lecture 8 demo

In [1]:
import os, sys, json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.model_selection import KFold

sys.path.append('../../scr/evaluation')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.usc_sim.usc_sim import usc_sim
import subprocess


START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True


In [2]:
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"
        
if AWS:
    root_captioning = "../../data"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [5]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [6]:
# get img path and caption list
# # only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
test_paths, test_descriptions, max_length_test = get_img_info('valid')
max_length = max(max_length_train, max_length_test)



In [7]:
all_paths = train_paths.copy()
all_paths.extend(test_paths.copy())
all_paths = np.array(all_paths)

all_descriptions = train_descriptions.copy()
all_descriptions.extend(test_descriptions.copy())
all_descriptions = np.array(all_descriptions)

captions = all_descriptions.copy()
max_length_all = max(max_length_train, max_length_test)
max_length = max_length_all + 2
      
lex = set()
for sen in all_descriptions:
    [lex.update(d.split()) for d in sen]
    
# add a start and stop token at the beginning/end
for v in all_descriptions:
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'
        
print(f'There are {len(all_paths)} images') 
print(f'There are {len(lex)} unique words (vocab)')
print(f'The maximum length of captions with start and stop token is {max_length}.')


There are 10416 images
There are 2912 unique words (vocab)
The maximum length of captions with start and stop token is 36.


In [8]:
all_paths[-1]

'../../s3/valid/rsicd_park_33.jpg'

In [9]:
all_descriptions[-1]

array(['startseq a vast artificial lake was built in the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq there are many residential areas near the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq',
       'startseq a vast artificial lake was built in the park . endseq'],
      dtype='<U184')

### Loading Wikipedia2vec Embeddings

In [10]:
# read the embedding matrix 
with open(f'{root_captioning}/enwiki_20180420_2338_words_500d.json', 'r', encoding='utf-8') as file:
    embeddings_index = json.load(file)

In [11]:
def get_vocab(descriptions, word_count_threshold=10):

    captions = []
    for val in descriptions:
        for cap in val:
            captions.append(cap)
    print(f'There are {len(captions)} captions')
    
    word_counts = {}
    nsents = 0
    for sent in captions:
        nsents += 1
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
    return vocab

def get_word_dict(vocab):
    
    idxtoword = {}
    wordtoidx = {}

    ix = 1
    for w in vocab:
        wordtoidx[w] = ix
        idxtoword[ix] = w
        ix += 1

    return idxtoword, wordtoidx

def get_vocab_size(idxtoword):
    
    print(f'The vocabulary size is {len(idxtoword) + 1}.')
    return len(idxtoword) + 1


def get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx):

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count = 0

    for word, i in wordtoidx.items():

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            count += 1
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector
            
    print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')            
    print(f'The size of embedding_matrix is {embedding_matrix.shape}')
    return embedding_matrix

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
# inception v3 expects (299, 299) sized images
inc_model =\
models.inception_v3(
    pretrained=True,
    aux_logits=False
).to(device)

In [14]:
class CNNModel(nn.Module):

    def __init__(self, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()
        
        # remove the classification layer
        self.model =\
        nn.Sequential(
            *(list(inc_model.children())[: 3]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(inc_model.children())[3: 5]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(inc_model.children())[5: -1])
        )

        self.input_size = 299

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
          # set the model to evaluation model
          self.model.eval()

        # N x 3 x 299 x 299
        features = self.model(img_input)
        # N x 2048 x 8 x 8

        return features

In [15]:
class AttentionModel(nn.Module):

    def __init__(self, feature_size, embedding_dim, hidden_size=256):
        """
        Initializes a AttentionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(AttentionModel, self).__init__()

        self.W_sh = nn.Linear(hidden_size, hidden_size)
        self.W_attr = nn.Linear(1000, hidden_size)

    def forward(self, img_features, img_sf, h):
        """
        forward of the AttentionModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """

        # N x feature_size
        attr = self.W_attr(img_sf).unsqueeze(1).repeat(1, img_features.size(1), 1)
        # N x 64 x hidden_size

        # attention scoring function s(W_sh[h, W_attr(attr)])
        # W_sh[h, W_attr(attr)]
        energy =\
        torch.tanh(self.W_sh(
            img_features + attr
        )).permute(0, 2, 1)
        # N x hidden_size x 64

        # s(W_sh[h, W_attr(attr)])
        # torch.bmm takes 3D tensors
        attention = torch.bmm(h.unsqueeze(1), energy)
        # N x 1 x 64

        attention_weights = F.softmax(attention, dim=2).squeeze(1)
        # N x 64

        return attention_weights

In [16]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        feature_size,
        vocab_size,
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        feature_size: int
            the number of features in the image matrix
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()

        self.feature_size = feature_size
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dense1 = nn.Linear(1000, hidden_size)
        self.dense2 = nn.Linear(1000, hidden_size)
        self.dense3 = nn.Linear(feature_size, hidden_size)

        self.embedding =\
        nn.Embedding(
            vocab_size,
            embedding_dim, 
            padding_idx=0
        )

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
                'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train

        self.attention =\
        AttentionModel(
            feature_size,
            embedding_dim,
            hidden_size
        )

        self.lstm =\
        nn.LSTMCell(
            embedding_dim, 
            hidden_size, 
            bias=True
        )
      
    def forward(self, img_features, img_sf, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        img_features: torch.Tensor 
            the image feature matrix
            (N x feature_size(2048) x 8 x 8)
        captions: torch.Tensor 
            the padded caption matrix
            (N x seq_len)

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # N = batch_size
        batch_size = captions.size(0)
        seq_len = captions.size(1)

        # N x feature_size(2048) x 8 x 8
        img_features = img_features.view(
            batch_size, self.feature_size, -1
        ).permute(0, 2, 1)
        # N x 64 x feature_size(2048)

        # N x 64 x feature_size(2048)
        h = self.dense1(img_sf)
        c = self.dense2(img_sf)
        # N x hidden_size     

        # N x seq_len
        embed = self.dropout(self.embedding(captions))
        # N x seq_len x embedding_dim 

        # N x 64 x feature_size(2048)
        img_features = self.dropout(self.relu(self.dense3(img_features)))
        # N x 64 x embedding_dim

        outputs =\
        torch.zeros(
            batch_size,
            seq_len, 
            self.hidden_size
        ).to(device)

        all_attention_weights =\
        torch.zeros(
            batch_size,
            seq_len, 
            img_features.shape[1]
        ).to(device)
        
        for i in range(seq_len):
            
            attention_weights = self.attention(img_features, img_sf, h)
            # N x 64

            # weighted sum of img_features
            weighted = (img_features * attention_weights.unsqueeze(2)).sum(dim=1)
            # N x embedding_dim

            h, c =\
            self.lstm(
                embed[:, i, :] + weighted, 
                (h, c)
            )
            # h: N x hidden_size
            # c: N x hidden_size
 
            output =\
            self.dense(
                h + embed[:, i, :] + weighted
                
            )
            outputs[:, i, :] = output
            all_attention_weights[:, i, :] = attention_weights.squeeze()

        return outputs, all_attention_weights



In [17]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        self.feature_size = 2048

        self.decoder = RNNModel(
            self.feature_size,
            vocab_size, 
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )


        self.dropout = nn.Dropout(p=0.5)        
        self.relu2 = nn.ReLU()
#         self.dense2 = nn.Linear(hidden_size, hidden_size) 
        self.dense3 = nn.Linear(hidden_size, vocab_size) 

    # def forward(self, captions):
    def forward(self, img_features, img_sf, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor 
            the image feature matrix
            (N x feature_size(2048) x 8 x 8)
        captions: torch.Tensor 
            the padded caption matrix
            (N x seq_len)

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """
    
        decoder_out, all_attention_weights = self.decoder(img_features, img_sf, captions)

        # add up decoder outputs and image features
        outputs =\
        self.dense3(
            self.relu2(
                    self.dropout(decoder_out)
                    
                )
            )
        

        return outputs, all_attention_weights

### Train the Neural Network

In [18]:
def train(model, iterator, optimizer, criterion, clip, vocab_size):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, img_sf, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs, all_attention_weights = model(
            img_features.to(device),
            img_sf.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        ) + ((1. - all_attention_weights.sum(dim=1)) ** 2).mean()
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [19]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
    ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        # return len(self.descriptions)
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img1, img2 = self.imgs[idx // 5]
        # convert each word into a list of sequences.
        seq = [self.wordtoidx[word] for word 
               in self.descriptions[idx // 5][idx % 5].split(' ')
               if word in self.wordtoidx]
        # pad the sequence with 0 on the right side
        in_seq = np.pad(
            seq, 
            (0, max_length - len(seq)),
            mode='constant',
            constant_values=(0, 0)
            )

        return img1, img2, in_seq


In [20]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [21]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape).to(device)
    # Call model to extract the smaller feature set for the image.
    x1 = model(img, False).squeeze()
    inc_model.eval()
    x2 = inc_model(img).squeeze()

    return x1, x2

In [22]:
def extract_img_features(img_paths, model):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    start = time()
    img_features = []

    for image_path in tqdm(img_paths):
        x1, x2 = encode_image(model, image_path)
        img_features.append(
            [x1.cpu().data.numpy(), x2.cpu().data.numpy()]
        )

    print(f"\nGenerating set took: {hms_string(time()-start)}")

    return img_features

In [23]:
def get_train_test(
    encoder,
    train_paths,
    test_paths
):

    train_img_features = extract_img_features(
        train_paths,
        encoder
    )

    test_img_features = extract_img_features(
        test_paths,
        encoder
    )
    return train_img_features, test_img_features

def get_train_dataloader(
    train_descriptions, 
    train_img_features,
    wordtoidx,
    max_length,
    batch_size=200
):
    train_dataset = SampleDataset(
        train_descriptions,
        train_img_features,
        wordtoidx,
        max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size
    )
    
    return train_loader

def train_model(
    train_loader,
    vocab_size,
    embedding_dim, 
    embedding_matrix,
    hidden_size=256,
):

    caption_model = CaptionModel(
        vocab_size, 
        embedding_dim, 
        hidden_size=500,
        embedding_matrix=embedding_matrix, 
        embedding_train=True
    )

    init_weights(
        caption_model,
        embedding_pretrained=True
    )

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(
        caption_model.parameters(), 
        lr=0.01
    )

    clip = 1
    start = time()

    for i in tqdm(range(EPOCHS * 3)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)

    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 3)):

        loss = train(caption_model, train_loader, optimizer, criterion, clip, vocab_size)
        print(loss)
    return caption_model

In [24]:
def generateCaption(
    model, 
    img_features,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    in_text = START

    for i in range(max_length):

        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = np.pad(sequence, (0, max_length - len(sequence)),
                          mode='constant', constant_values=(0, 0))
        model.eval()
        yhat, _ = model(
            torch.FloatTensor(img_features[0])\
            .view(-1, model.feature_size).to(device),
            torch.FloatTensor(img_features[1])\
            .view(1, 1000).to(device),
            torch.LongTensor(sequence).view(-1, max_length).to(device)
        )

        
        yhat = yhat.view(-1, vocab_size).argmax(1)
        word = idxtoword[yhat.cpu().data.numpy()[i]]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1 : -1]
    final = ' '.join(final)
    return final

### Evaluation

In [25]:
def eval_model(ref_data, results):
    """
    Computes evaluation metrics of the model results against the human annotated captions
    
    Parameters:
    ------------
    ref_data: dict
        a dictionary containing human annotated captions, with image name as key and a list of human annotated captions as values
    
    results: dict
        a dictionary containing model generated caption, with image name as key and a generated caption as value
        
    Returns:
    ------------
    score_dict: a dictionary containing the overall average score for the model
    """
    # download stanford nlp library
    subprocess.call(['../../scr/evaluation/get_stanford_models.sh'])
    
    # format the inputs
    gts = {}
    res = {}

    for imgId in range(len(ref_data)):
        caption_list_sel = []
        for i in range(5):
            lst = {}
            lst['caption'] = ref_data[imgId][i]
            lst['image_id'] = imgId
            lst['id'] = i
            caption_list_sel.append(lst)
        gts[imgId] = caption_list_sel

        res[imgId] = [{'caption': results[imgId]}]
        
    # tokenize
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts  = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)
    
    # compute scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(),"METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        (Spice(), "SPICE"),
        (usc_sim(), "USC_similarity"),  
        ]
    score_dict = {}
    for scorer, method in scorers:
        print('computing %s score...'%(scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                score_dict[m] = sc
        else:
            score_dict[method] = score
            
    return score_dict


In [26]:
def evaluate_results(
    test_img_features, 
    model,
    ref,
    max_length,
    vocab_size,
    wordtoidx,
    idxtoword
):
    # generate results
    print('Generating captions...')
    results = {}
    for n in range(len(test_img_features)):
        img_features = test_img_features[n]
        generated = generateCaption(
            model, 
            img_features,
            max_length,
            vocab_size,
            wordtoidx,
            idxtoword
        )
        results[n] = generated
        
    model_score = eval_model(ref, results)

    return model_score

### Cross validation

In [27]:
cnn_type = 'inception_v3'
encoder = CNNModel(pretrained=True)
encoder.to(device)

CNNModel(
  (model): Sequential(
    (0): BasicConv2d(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicConv2d(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): BasicConv2d(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): BasicConv2d(
      (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (5): BasicConv2d(
      (conv): Conv2d(80, 192, kernel_size=(3, 3), stride=(1, 1

In [28]:
def cross_validation(train_index, test_index, count):
    print('=' * 60)
    print(f'Split {count}:')
    print(f'Splitting data...')
    
    train_paths, test_paths = all_paths[train_index], all_paths[test_index]
    train_descriptions, test_descriptions = all_descriptions[train_index], all_descriptions[test_index]
    print(f'{len(train_paths)} images for training and {len(test_paths)} images for testing.')
    
    vocab = get_vocab(train_descriptions, word_count_threshold=10)
    idxtoword, wordtoidx = get_word_dict(vocab)
    vocab_size = get_vocab_size(idxtoword)
    embedding_dim = 500
    embedding_matrix = get_embeddings(embeddings_index, vocab_size, embedding_dim, wordtoidx) 

    print(f'Preparing dataloader...')
    train_img_features, test_img_features = get_train_test(encoder, train_paths, test_paths)

    train_loader = get_train_dataloader(
        train_descriptions, 
        train_img_features,
        wordtoidx,
        max_length,
        batch_size=1000
    )

    print(f'Training...')
    caption_model = train_model(
        train_loader,
        vocab_size,
        embedding_dim, 
        embedding_matrix,
        hidden_size=500
    )

    
    ref = captions[test_index]
    model_score = evaluate_results(
        test_img_features, 
        caption_model,
        ref,
        max_length,
        vocab_size,
        wordtoidx,
        idxtoword
    )
    
    return caption_model, model_score

In [29]:
cv = KFold(n_splits=5, random_state=123, shuffle=True)
cv = [(train_index, test_index) for train_index, test_index in cv.split(all_paths)]  

In [30]:
caption_model1, model_score1 = cross_validation(cv[0][0], cv[0][1], 1)    

Split 1:
Splitting data...
8332 images for training and 2084 images for testing.
There are 41660 captions


  0%|          | 0/8332 [00:00<?, ?it/s]

preprocessed words 2659 ==> 884
The vocabulary size is 885.
793 out of 885 words are found in the pre-trained matrix.
The size of embedding_matrix is (885, 500)
Preparing dataloader...


100%|██████████| 8332/8332 [06:20<00:00, 21.89it/s]
  0%|          | 3/2084 [00:00<01:41, 20.58it/s]


Generating set took: 0:06:20.58


100%|██████████| 2084/2084 [01:42<00:00, 20.23it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:01:43.00
Training...


  3%|▎         | 1/30 [00:06<03:22,  6.97s/it]

14.241856469048393


  7%|▋         | 2/30 [00:13<03:14,  6.95s/it]

5.502065870496962


 10%|█         | 3/30 [00:20<03:07,  6.94s/it]

4.509341875712077


 13%|█▎        | 4/30 [00:27<03:00,  6.93s/it]

3.5813718107011585


 17%|█▋        | 5/30 [00:34<02:53,  6.93s/it]

2.969064580069648


 20%|██        | 6/30 [00:41<02:46,  6.92s/it]

2.6460874610477023


 23%|██▎       | 7/30 [00:48<02:40,  6.96s/it]

2.425721221499973


 27%|██▋       | 8/30 [00:55<02:32,  6.95s/it]

2.226936181386312


 30%|███       | 9/30 [01:02<02:25,  6.94s/it]

2.068994919459025


 33%|███▎      | 10/30 [01:09<02:18,  6.94s/it]

1.9438629150390625


 37%|███▋      | 11/30 [01:16<02:11,  6.94s/it]

1.8063469727834065


 40%|████      | 12/30 [01:23<02:04,  6.93s/it]

1.702781253390842


 43%|████▎     | 13/30 [01:30<01:57,  6.93s/it]

1.616714358329773


 47%|████▋     | 14/30 [01:37<01:50,  6.93s/it]

1.5472846296098497


 50%|█████     | 15/30 [01:43<01:43,  6.93s/it]

1.4778325292799208


 53%|█████▎    | 16/30 [01:50<01:36,  6.92s/it]

1.4234713580873277


 57%|█████▋    | 17/30 [01:57<01:29,  6.91s/it]

1.377953913476732


 60%|██████    | 18/30 [02:04<01:22,  6.90s/it]

1.3232045968373616


 63%|██████▎   | 19/30 [02:11<01:15,  6.90s/it]

1.2819222145610385


 67%|██████▋   | 20/30 [02:18<01:09,  6.90s/it]

1.2366146842638652


 70%|███████   | 21/30 [02:25<01:02,  6.92s/it]

1.1998345586988661


 73%|███████▎  | 22/30 [02:32<00:55,  6.92s/it]

1.1714219119813707


 77%|███████▋  | 23/30 [02:39<00:48,  6.93s/it]

1.1305961807568867


 80%|████████  | 24/30 [02:46<00:41,  6.93s/it]

1.0841115315755208


 83%|████████▎ | 25/30 [02:53<00:34,  6.93s/it]

1.0600732498698764


 87%|████████▋ | 26/30 [03:00<00:27,  6.92s/it]

1.037146011988322


 90%|█████████ | 27/30 [03:06<00:20,  6.92s/it]

1.0202982756826613


 93%|█████████▎| 28/30 [03:13<00:13,  6.92s/it]

0.9997446139653524


 97%|█████████▋| 29/30 [03:20<00:06,  6.92s/it]

0.9931271473566691


100%|██████████| 30/30 [03:27<00:00,  6.92s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

0.9903223315874735


  3%|▎         | 1/30 [00:06<03:19,  6.89s/it]

0.9437605076366


  7%|▋         | 2/30 [00:13<03:13,  6.90s/it]

0.9287349515491061


 10%|█         | 3/30 [00:20<03:06,  6.89s/it]

0.9069548712836372


 13%|█▎        | 4/30 [00:27<02:59,  6.89s/it]

0.8974574605623881


 17%|█▋        | 5/30 [00:34<02:52,  6.90s/it]

0.8817481398582458


 20%|██        | 6/30 [00:41<02:45,  6.90s/it]

0.8751171032587687


 23%|██▎       | 7/30 [00:48<02:38,  6.90s/it]

0.869039879904853


 27%|██▋       | 8/30 [00:55<02:31,  6.90s/it]

0.8643866115146213


 30%|███       | 9/30 [01:02<02:25,  6.91s/it]

0.8619369864463806


 33%|███▎      | 10/30 [01:09<02:18,  6.91s/it]

0.8548122578197055


 37%|███▋      | 11/30 [01:15<02:11,  6.91s/it]

0.8504064016871982


 40%|████      | 12/30 [01:22<02:04,  6.91s/it]

0.8494090636571249


 43%|████▎     | 13/30 [01:29<01:57,  6.91s/it]

0.8407705492443509


 47%|████▋     | 14/30 [01:36<01:50,  6.91s/it]

0.8394289215405782


 50%|█████     | 15/30 [01:43<01:43,  6.90s/it]

0.8413042691018846


 53%|█████▎    | 16/30 [01:50<01:36,  6.90s/it]

0.8324873381190829


 57%|█████▋    | 17/30 [01:57<01:29,  6.91s/it]

0.8260330491595798


 60%|██████    | 18/30 [02:04<01:22,  6.91s/it]

0.8281701670752631


 63%|██████▎   | 19/30 [02:11<01:15,  6.91s/it]

0.8299438953399658


 67%|██████▋   | 20/30 [02:18<01:09,  6.91s/it]

0.8213530447747972


 70%|███████   | 21/30 [02:25<01:02,  6.91s/it]

0.8225940002335442


 73%|███████▎  | 22/30 [02:32<00:55,  6.96s/it]

0.8216497434510125


 77%|███████▋  | 23/30 [02:39<00:48,  6.95s/it]

0.8150665428903368


 80%|████████  | 24/30 [02:45<00:41,  6.94s/it]

0.8165645864274766


 83%|████████▎ | 25/30 [02:52<00:34,  6.94s/it]

0.8133929239379035


 87%|████████▋ | 26/30 [02:59<00:27,  6.94s/it]

0.8109083043204414


 90%|█████████ | 27/30 [03:06<00:20,  6.95s/it]

0.8103513320287069


 93%|█████████▎| 28/30 [03:13<00:13,  6.95s/it]

0.8093924853536818


 97%|█████████▋| 29/30 [03:20<00:06,  6.95s/it]

0.8061017658975389


100%|██████████| 30/30 [03:27<00:00,  6.92s/it]

0.8082359631856283
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [31]:
model_score1

{'Bleu_1': 0.5534123965092614,
 'Bleu_2': 0.4179403265571349,
 'Bleu_3': 0.3352368897321505,
 'Bleu_4': 0.2783509536825586,
 'METEOR': 0.2455818221305221,
 'ROUGE_L': 0.45918757045295083,
 'CIDEr': 1.4322018320637138,
 'SPICE': 0.3068188612643051,
 'USC_similarity': 0.5323410785649751}

In [32]:
caption_model2, model_score2 = cross_validation(cv[1][0], cv[1][1], 2)    

Split 2:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 3/8333 [00:00<06:36, 21.03it/s]

preprocessed words 2688 ==> 916
The vocabulary size is 917.
819 out of 917 words are found in the pre-trained matrix.
The size of embedding_matrix is (917, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [06:38<00:00, 20.92it/s]
  0%|          | 3/2083 [00:00<01:38, 21.03it/s]


Generating set took: 0:06:38.39


100%|██████████| 2083/2083 [01:42<00:00, 20.32it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:01:42.53
Training...


  3%|▎         | 1/30 [00:06<03:22,  6.98s/it]

14.161600642734104


  7%|▋         | 2/30 [00:13<03:14,  6.95s/it]

5.387671947479248


 10%|█         | 3/30 [00:20<03:07,  6.93s/it]

4.553518189324273


 13%|█▎        | 4/30 [00:27<03:00,  6.93s/it]

3.7121134863959417


 17%|█▋        | 5/30 [00:34<02:53,  6.94s/it]

3.0764889452192516


 20%|██        | 6/30 [00:41<02:46,  6.95s/it]

2.7114881144629583


 23%|██▎       | 7/30 [00:48<02:40,  6.97s/it]

2.4850462012820773


 27%|██▋       | 8/30 [00:55<02:33,  6.97s/it]

2.295097748438517


 30%|███       | 9/30 [01:02<02:26,  6.96s/it]

2.1230044629838734


 33%|███▎      | 10/30 [01:09<02:19,  6.96s/it]

1.9813730319341023


 37%|███▋      | 11/30 [01:16<02:12,  6.95s/it]

1.8789847956763372


 40%|████      | 12/30 [01:23<02:05,  6.95s/it]

1.7965082724889119


 43%|████▎     | 13/30 [01:30<01:57,  6.94s/it]

1.7280716631147597


 47%|████▋     | 14/30 [01:37<01:50,  6.94s/it]

1.6470760239495172


 50%|█████     | 15/30 [01:44<01:44,  6.93s/it]

1.5694865120781794


 53%|█████▎    | 16/30 [01:51<01:37,  6.95s/it]

1.4976539214452107


 57%|█████▋    | 17/30 [01:58<01:30,  6.94s/it]

1.4334564871258206


 60%|██████    | 18/30 [02:05<01:23,  6.95s/it]

1.3856255610783894


 63%|██████▎   | 19/30 [02:12<01:16,  6.97s/it]

1.3406002124150593


 67%|██████▋   | 20/30 [02:19<01:09,  6.96s/it]

1.2993375460306804


 70%|███████   | 21/30 [02:25<01:02,  6.96s/it]

1.2529123226801555


 73%|███████▎  | 22/30 [02:33<00:56,  7.00s/it]

1.2204463216993544


 77%|███████▋  | 23/30 [02:39<00:48,  6.98s/it]

1.169019533528222


 80%|████████  | 24/30 [02:46<00:41,  6.98s/it]

1.1441376739078097


 83%|████████▎ | 25/30 [02:53<00:34,  6.97s/it]

1.1135898100005255


 87%|████████▋ | 26/30 [03:00<00:27,  6.96s/it]

1.092615670628018


 90%|█████████ | 27/30 [03:07<00:20,  6.95s/it]

1.091814120610555


 93%|█████████▎| 28/30 [03:14<00:13,  6.95s/it]

1.0599300265312195


 97%|█████████▋| 29/30 [03:21<00:06,  6.94s/it]

1.0387517809867859


100%|██████████| 30/30 [03:28<00:00,  6.95s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

1.0105838643179998


  3%|▎         | 1/30 [00:06<03:21,  6.95s/it]

0.9709364970525106


  7%|▋         | 2/30 [00:13<03:14,  6.95s/it]

0.9512356453471713


 10%|█         | 3/30 [00:20<03:07,  6.94s/it]

0.9376973112424215


 13%|█▎        | 4/30 [00:27<03:00,  6.93s/it]

0.936724411116706


 17%|█▋        | 5/30 [00:34<02:53,  6.93s/it]

0.9209036893314786


 20%|██        | 6/30 [00:41<02:46,  6.93s/it]

0.9139154752095541


 23%|██▎       | 7/30 [00:48<02:39,  6.93s/it]

0.910322474108802


 27%|██▋       | 8/30 [00:55<02:32,  6.93s/it]

0.9064883059925504


 30%|███       | 9/30 [01:02<02:25,  6.94s/it]

0.9035360349549187


 33%|███▎      | 10/30 [01:09<02:18,  6.94s/it]

0.8990218705601163


 37%|███▋      | 11/30 [01:16<02:11,  6.95s/it]

0.8941388328870138


 40%|████      | 12/30 [01:23<02:05,  6.95s/it]

0.8900756107436286


 43%|████▎     | 13/30 [01:30<01:58,  6.95s/it]

0.886458138624827


 47%|████▋     | 14/30 [01:37<01:51,  6.95s/it]

0.8795042634010315


 50%|█████     | 15/30 [01:44<01:44,  6.94s/it]

0.8781264026959738


 53%|█████▎    | 16/30 [01:51<01:37,  6.95s/it]

0.8769671453369988


 57%|█████▋    | 17/30 [01:57<01:30,  6.95s/it]

0.8773770597245958


 60%|██████    | 18/30 [02:04<01:23,  6.94s/it]

0.8738349344995286


 63%|██████▎   | 19/30 [02:11<01:16,  6.93s/it]

0.8711157904730903


 67%|██████▋   | 20/30 [02:18<01:09,  6.92s/it]

0.8665726714664035


 70%|███████   | 21/30 [02:25<01:02,  6.90s/it]

0.8687751624319289


 73%|███████▎  | 22/30 [02:32<00:55,  6.90s/it]

0.8617386288113065


 77%|███████▋  | 23/30 [02:39<00:48,  6.89s/it]

0.8615568280220032


 80%|████████  | 24/30 [02:46<00:41,  6.89s/it]

0.8594565921359592


 83%|████████▎ | 25/30 [02:53<00:34,  6.95s/it]

0.8580954339769151


 87%|████████▋ | 26/30 [03:00<00:27,  6.93s/it]

0.8532353242238363


 90%|█████████ | 27/30 [03:07<00:20,  6.92s/it]

0.8526822924613953


 93%|█████████▎| 28/30 [03:13<00:13,  6.91s/it]

0.8485198219617208


 97%|█████████▋| 29/30 [03:20<00:06,  6.91s/it]

0.8443891141149733


100%|██████████| 30/30 [03:27<00:00,  6.93s/it]

0.8480482300122579
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [33]:
model_score2

{'Bleu_1': 0.5727106227105965,
 'Bleu_2': 0.4354408749041239,
 'Bleu_3': 0.35167035554283893,
 'Bleu_4': 0.2944031251715931,
 'METEOR': 0.25136053607690845,
 'ROUGE_L': 0.4667369091779589,
 'CIDEr': 1.5261033554728407,
 'SPICE': 0.3236098983222966,
 'USC_similarity': 0.5529305374878447}

In [34]:
caption_model3, model_score3 = cross_validation(cv[2][0], cv[2][1], 3)    

Split 3:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 2/8333 [00:00<07:12, 19.28it/s]

preprocessed words 2714 ==> 890
The vocabulary size is 891.
800 out of 891 words are found in the pre-trained matrix.
The size of embedding_matrix is (891, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [06:47<00:00, 20.43it/s]
  0%|          | 2/2083 [00:00<01:46, 19.47it/s]


Generating set took: 0:06:47.87


100%|██████████| 2083/2083 [01:46<00:00, 19.55it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:01:46.57
Training...


  3%|▎         | 1/30 [00:06<03:22,  6.97s/it]

10.009224361843533


  7%|▋         | 2/30 [00:13<03:14,  6.94s/it]

5.122793250613743


 10%|█         | 3/30 [00:20<03:06,  6.92s/it]

4.137204302681817


 13%|█▎        | 4/30 [00:27<02:59,  6.91s/it]

3.338873121473524


 17%|█▋        | 5/30 [00:34<02:52,  6.91s/it]

2.864495833714803


 20%|██        | 6/30 [00:41<02:45,  6.90s/it]

2.578987068600125


 23%|██▎       | 7/30 [00:48<02:38,  6.89s/it]

2.3770664003160267


 27%|██▋       | 8/30 [00:55<02:31,  6.89s/it]

2.208201011021932


 30%|███       | 9/30 [01:02<02:24,  6.89s/it]

2.0665295256508722


 33%|███▎      | 10/30 [01:08<02:17,  6.88s/it]

1.9416842593087091


 37%|███▋      | 11/30 [01:15<02:10,  6.89s/it]

1.8371803628073797


 40%|████      | 12/30 [01:22<02:03,  6.89s/it]

1.7597700754801433


 43%|████▎     | 13/30 [01:29<01:57,  6.88s/it]

1.6790269480811224


 47%|████▋     | 14/30 [01:36<01:50,  6.90s/it]

1.6162681579589844


 50%|█████     | 15/30 [01:43<01:43,  6.91s/it]

1.5433639155493841


 53%|█████▎    | 16/30 [01:50<01:36,  6.90s/it]

1.4865483575397067


 57%|█████▋    | 17/30 [01:57<01:29,  6.90s/it]

1.4204379982418485


 60%|██████    | 18/30 [02:04<01:22,  6.89s/it]

1.386177036497328


 63%|██████▎   | 19/30 [02:10<01:15,  6.90s/it]

1.3580048481623332


 67%|██████▋   | 20/30 [02:17<01:08,  6.90s/it]

1.3062493138843112


 70%|███████   | 21/30 [02:24<01:02,  6.90s/it]

1.2653244601355658


 73%|███████▎  | 22/30 [02:31<00:55,  6.90s/it]

1.2194546063741047


 77%|███████▋  | 23/30 [02:38<00:48,  6.91s/it]

1.1823198000590007


 80%|████████  | 24/30 [02:45<00:41,  6.90s/it]

1.1426656113730536


 83%|████████▎ | 25/30 [02:52<00:34,  6.91s/it]

1.117363366815779


 87%|████████▋ | 26/30 [02:59<00:27,  6.90s/it]

1.0933848818143208


 90%|█████████ | 27/30 [03:06<00:20,  6.89s/it]

1.1033528513378568


 93%|█████████▎| 28/30 [03:13<00:13,  6.90s/it]

1.0818491379419963


 97%|█████████▋| 29/30 [03:19<00:06,  6.90s/it]

1.071883002916972


100%|██████████| 30/30 [03:26<00:00,  6.90s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

1.062307960457272


  3%|▎         | 1/30 [00:06<03:21,  6.96s/it]

1.0603861278957791


  7%|▋         | 2/30 [00:13<03:14,  6.96s/it]

1.0287372337447271


 10%|█         | 3/30 [00:20<03:07,  6.96s/it]

0.9927251206503974


 13%|█▎        | 4/30 [00:27<03:00,  6.96s/it]

0.9746949937608507


 17%|█▋        | 5/30 [00:34<02:53,  6.95s/it]

0.9572621583938599


 20%|██        | 6/30 [00:41<02:46,  6.94s/it]

0.947038299507565


 23%|██▎       | 7/30 [00:48<02:39,  6.95s/it]

0.9339333838886685


 27%|██▋       | 8/30 [00:55<02:33,  6.96s/it]

0.9336269564098783


 30%|███       | 9/30 [01:02<02:26,  6.95s/it]

0.9255524476369222


 33%|███▎      | 10/30 [01:09<02:19,  6.95s/it]

0.9241031077173021


 37%|███▋      | 11/30 [01:16<02:12,  6.95s/it]

0.9178730381859673


 40%|████      | 12/30 [01:23<02:05,  6.95s/it]

0.9137288199530708


 43%|████▎     | 13/30 [01:30<01:58,  6.94s/it]

0.9091359641816881


 47%|████▋     | 14/30 [01:37<01:50,  6.93s/it]

0.9099119702974955


 50%|█████     | 15/30 [01:44<01:43,  6.93s/it]

0.9044943650563558


 53%|█████▎    | 16/30 [01:51<01:36,  6.92s/it]

0.8995612793498569


 57%|█████▋    | 17/30 [01:57<01:29,  6.92s/it]

0.8977842330932617


 60%|██████    | 18/30 [02:04<01:23,  6.92s/it]

0.8927154739697775


 63%|██████▎   | 19/30 [02:11<01:16,  6.91s/it]

0.8921501504050361


 67%|██████▋   | 20/30 [02:18<01:09,  6.91s/it]

0.8907440702120463


 70%|███████   | 21/30 [02:25<01:02,  6.91s/it]

0.8888357414139642


 73%|███████▎  | 22/30 [02:32<00:55,  6.91s/it]

0.8843492335743375


 77%|███████▋  | 23/30 [02:39<00:48,  6.91s/it]

0.8843187756008573


 80%|████████  | 24/30 [02:46<00:41,  6.91s/it]

0.8812879787551032


 83%|████████▎ | 25/30 [02:53<00:34,  6.92s/it]

0.8774661488003201


 87%|████████▋ | 26/30 [03:00<00:27,  6.93s/it]

0.8727009031507704


 90%|█████████ | 27/30 [03:07<00:20,  6.94s/it]

0.8732887903849283


 93%|█████████▎| 28/30 [03:14<00:13,  6.93s/it]

0.8704471124543084


 97%|█████████▋| 29/30 [03:21<00:06,  6.94s/it]

0.8697449035114713


100%|██████████| 30/30 [03:28<00:00,  6.94s/it]

0.8686975704299079
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [35]:
model_score3

{'Bleu_1': 0.568877901387146,
 'Bleu_2': 0.43487625247907197,
 'Bleu_3': 0.35246948390339866,
 'Bleu_4': 0.2954909729194779,
 'METEOR': 0.25135037117289494,
 'ROUGE_L': 0.4674069991967222,
 'CIDEr': 1.5233682488010207,
 'SPICE': 0.3182274068445491,
 'USC_similarity': 0.5458094136321768}

In [36]:
caption_model4, model_score4 = cross_validation(cv[3][0], cv[3][1], 4)    

Split 4:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 2/8333 [00:00<07:32, 18.41it/s]

preprocessed words 2680 ==> 894
The vocabulary size is 895.
806 out of 895 words are found in the pre-trained matrix.
The size of embedding_matrix is (895, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [06:46<00:00, 20.52it/s]
  0%|          | 3/2083 [00:00<01:40, 20.67it/s]


Generating set took: 0:06:46.16


100%|██████████| 2083/2083 [01:39<00:00, 21.03it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:01:39.03
Training...


  3%|▎         | 1/30 [00:06<03:21,  6.95s/it]

8.974142816331652


  7%|▋         | 2/30 [00:13<03:14,  6.93s/it]

5.231632656521267


 10%|█         | 3/30 [00:20<03:06,  6.91s/it]

4.570508744981554


 13%|█▎        | 4/30 [00:27<02:59,  6.91s/it]

3.879414611392551


 17%|█▋        | 5/30 [00:34<02:52,  6.91s/it]

3.3782754209306507


 20%|██        | 6/30 [00:41<02:45,  6.90s/it]

2.950385888417562


 23%|██▎       | 7/30 [00:48<02:38,  6.89s/it]

2.62725838025411


 27%|██▋       | 8/30 [00:55<02:31,  6.89s/it]

2.40332735909356


 30%|███       | 9/30 [01:02<02:24,  6.88s/it]

2.2220963372124567


 33%|███▎      | 10/30 [01:08<02:17,  6.88s/it]

2.08322450849745


 37%|███▋      | 11/30 [01:15<02:10,  6.87s/it]

1.9821547004911635


 40%|████      | 12/30 [01:22<02:03,  6.88s/it]

1.8645620743433635


 43%|████▎     | 13/30 [01:29<01:57,  6.89s/it]

1.7751616107092962


 47%|████▋     | 14/30 [01:36<01:50,  6.91s/it]

1.692998515235053


 50%|█████     | 15/30 [01:43<01:43,  6.91s/it]

1.6251164807213678


 53%|█████▎    | 16/30 [01:50<01:37,  6.99s/it]

1.5582092470592923


 57%|█████▋    | 17/30 [01:57<01:30,  6.97s/it]

1.5060304800669353


 60%|██████    | 18/30 [02:04<01:23,  6.94s/it]

1.4621033271153767


 63%|██████▎   | 19/30 [02:11<01:16,  6.93s/it]

1.4104154242409601


 67%|██████▋   | 20/30 [02:18<01:09,  6.93s/it]

1.3665718370013766


 70%|███████   | 21/30 [02:25<01:02,  6.92s/it]

1.3257186147901747


 73%|███████▎  | 22/30 [02:32<00:55,  6.90s/it]

1.2963824272155762


 77%|███████▋  | 23/30 [02:38<00:48,  6.90s/it]

1.2600712180137634


 80%|████████  | 24/30 [02:45<00:41,  6.89s/it]

1.2243206765916612


 83%|████████▎ | 25/30 [02:52<00:34,  6.89s/it]

1.1884605884552002


 87%|████████▋ | 26/30 [02:59<00:27,  6.88s/it]

1.1561332146326702


 90%|█████████ | 27/30 [03:06<00:20,  6.89s/it]

1.130963881810506


 93%|█████████▎| 28/30 [03:13<00:13,  6.89s/it]

1.0941949751642015


 97%|█████████▋| 29/30 [03:20<00:06,  6.88s/it]

1.0778364605373807


100%|██████████| 30/30 [03:27<00:00,  6.90s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

1.0517794953452215


  3%|▎         | 1/30 [00:06<03:20,  6.93s/it]

1.0200720230738323


  7%|▋         | 2/30 [00:13<03:13,  6.93s/it]

1.0033240980572171


 10%|█         | 3/30 [00:20<03:06,  6.92s/it]

0.9868520432048373


 13%|█▎        | 4/30 [00:27<03:00,  6.93s/it]

0.9736303819550408


 17%|█▋        | 5/30 [00:34<02:53,  6.94s/it]

0.9705318146281772


 20%|██        | 6/30 [00:41<02:46,  6.93s/it]

0.9597172869576348


 23%|██▎       | 7/30 [00:48<02:38,  6.91s/it]

0.9558127787378099


 27%|██▋       | 8/30 [00:55<02:31,  6.90s/it]

0.9504404134220548


 30%|███       | 9/30 [01:02<02:24,  6.89s/it]

0.9447444942262437


 33%|███▎      | 10/30 [01:09<02:17,  6.90s/it]

0.9454297025998434


 37%|███▋      | 11/30 [01:15<02:10,  6.89s/it]

0.9403669370545281


 40%|████      | 12/30 [01:22<02:03,  6.88s/it]

0.9395126634173923


 43%|████▎     | 13/30 [01:29<01:57,  6.88s/it]

0.9357870022455851


 47%|████▋     | 14/30 [01:36<01:49,  6.87s/it]

0.9318214257558187


 50%|█████     | 15/30 [01:43<01:43,  6.88s/it]

0.9284549752871195


 53%|█████▎    | 16/30 [01:50<01:36,  6.87s/it]

0.924016104804145


 57%|█████▋    | 17/30 [01:57<01:29,  6.87s/it]

0.9225292603174845


 60%|██████    | 18/30 [02:04<01:22,  6.87s/it]

0.9239960047933791


 63%|██████▎   | 19/30 [02:10<01:15,  6.87s/it]

0.920799003707038


 67%|██████▋   | 20/30 [02:17<01:08,  6.87s/it]

0.917277197043101


 70%|███████   | 21/30 [02:24<01:01,  6.87s/it]

0.9158801237742106


 73%|███████▎  | 22/30 [02:31<00:54,  6.86s/it]

0.9101793567339579


 77%|███████▋  | 23/30 [02:38<00:48,  6.86s/it]

0.9094036751323276


 80%|████████  | 24/30 [02:45<00:41,  6.86s/it]

0.908803231186337


 83%|████████▎ | 25/30 [02:52<00:34,  6.88s/it]

0.90596385134591


 87%|████████▋ | 26/30 [02:59<00:27,  6.88s/it]

0.9049005309740702


 90%|█████████ | 27/30 [03:05<00:20,  6.89s/it]

0.9027675257788764


 93%|█████████▎| 28/30 [03:12<00:13,  6.89s/it]

0.9002742634879218


 97%|█████████▋| 29/30 [03:19<00:06,  6.88s/it]

0.8994890848795573


100%|██████████| 30/30 [03:26<00:00,  6.89s/it]

0.8972934219572279
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [37]:
model_score4

{'Bleu_1': 0.561765368160299,
 'Bleu_2': 0.42872225525349267,
 'Bleu_3': 0.34845855173243523,
 'Bleu_4': 0.29255474607304743,
 'METEOR': 0.2513353697582052,
 'ROUGE_L': 0.47201045642924544,
 'CIDEr': 1.5258784502251954,
 'SPICE': 0.3148959241857352,
 'USC_similarity': 0.5413947349371308}

In [38]:
caption_model5, model_score5 = cross_validation(cv[4][0], cv[4][1], 5)    

Split 5:
Splitting data...
8333 images for training and 2083 images for testing.
There are 41665 captions


  0%|          | 2/8333 [00:00<07:25, 18.71it/s]

preprocessed words 2657 ==> 905
The vocabulary size is 906.
815 out of 906 words are found in the pre-trained matrix.
The size of embedding_matrix is (906, 500)
Preparing dataloader...


100%|██████████| 8333/8333 [06:40<00:00, 20.81it/s]
  0%|          | 3/2083 [00:00<01:36, 21.50it/s]


Generating set took: 0:06:40.52


100%|██████████| 2083/2083 [01:40<00:00, 20.63it/s]
  0%|          | 0/30 [00:00<?, ?it/s]


Generating set took: 0:01:40.96
Training...


  3%|▎         | 1/30 [00:06<03:21,  6.93s/it]

9.892503102620443


  7%|▋         | 2/30 [00:13<03:14,  6.93s/it]

5.4908783170912


 10%|█         | 3/30 [00:20<03:07,  6.94s/it]

5.022943019866943


 13%|█▎        | 4/30 [00:27<02:59,  6.92s/it]

4.67819356918335


 17%|█▋        | 5/30 [00:34<02:52,  6.91s/it]

4.3150853051079645


 20%|██        | 6/30 [00:41<02:45,  6.91s/it]

3.9263397322760687


 23%|██▎       | 7/30 [00:48<02:39,  6.91s/it]

3.468002133899265


 27%|██▋       | 8/30 [00:55<02:31,  6.91s/it]

3.0108596748775907


 30%|███       | 9/30 [01:02<02:25,  6.90s/it]

2.67328323258294


 33%|███▎      | 10/30 [01:09<02:17,  6.90s/it]

2.4531209998660617


 37%|███▋      | 11/30 [01:15<02:11,  6.90s/it]

2.280022064844767


 40%|████      | 12/30 [01:22<02:04,  6.90s/it]

2.127020650439792


 43%|████▎     | 13/30 [01:29<01:57,  6.90s/it]

2.006263322300381


 47%|████▋     | 14/30 [01:36<01:50,  6.89s/it]

1.8996819522645738


 50%|█████     | 15/30 [01:43<01:43,  6.89s/it]

1.8081707954406738


 53%|█████▎    | 16/30 [01:50<01:36,  6.90s/it]

1.7313230832417805


 57%|█████▋    | 17/30 [01:57<01:29,  6.89s/it]

1.6486842234929402


 60%|██████    | 18/30 [02:04<01:22,  6.90s/it]

1.5865011745029025


 63%|██████▎   | 19/30 [02:11<01:16,  6.92s/it]

1.538724766837226


 67%|██████▋   | 20/30 [02:18<01:09,  6.91s/it]

1.4882257382074993


 70%|███████   | 21/30 [02:25<01:02,  6.91s/it]

1.4519944190979004


 73%|███████▎  | 22/30 [02:31<00:55,  6.90s/it]

1.416047559844123


 77%|███████▋  | 23/30 [02:38<00:48,  6.91s/it]

1.3829259607526991


 80%|████████  | 24/30 [02:45<00:41,  6.91s/it]

1.3435089588165283


 83%|████████▎ | 25/30 [02:52<00:34,  6.93s/it]

1.2891622516844008


 87%|████████▋ | 26/30 [02:59<00:27,  6.93s/it]

1.2586100763744779


 90%|█████████ | 27/30 [03:06<00:20,  6.93s/it]

1.2217828565173678


 93%|█████████▎| 28/30 [03:13<00:13,  6.92s/it]

1.206194069650438


 97%|█████████▋| 29/30 [03:20<00:06,  6.93s/it]

1.1972069011794195


100%|██████████| 30/30 [03:27<00:00,  6.91s/it]
  0%|          | 0/30 [00:00<?, ?it/s]

1.1808440486590068


  3%|▎         | 1/30 [00:06<03:19,  6.88s/it]

1.1811088191138372


  7%|▋         | 2/30 [00:13<03:12,  6.88s/it]

1.1575854751798842


 10%|█         | 3/30 [00:20<03:08,  6.98s/it]

1.1227729188071356


 13%|█▎        | 4/30 [00:27<03:00,  6.96s/it]

1.1054353713989258


 17%|█▋        | 5/30 [00:34<02:53,  6.94s/it]

1.0823719435267978


 20%|██        | 6/30 [00:41<02:45,  6.92s/it]

1.0725906425052218


 23%|██▎       | 7/30 [00:48<02:38,  6.91s/it]

1.0611874792310927


 27%|██▋       | 8/30 [00:55<02:31,  6.91s/it]

1.0536882678667705


 30%|███       | 9/30 [01:02<02:24,  6.90s/it]

1.046147346496582


 33%|███▎      | 10/30 [01:09<02:18,  6.90s/it]

1.0471873415840998


 37%|███▋      | 11/30 [01:16<02:11,  6.90s/it]

1.038063155280219


 40%|████      | 12/30 [01:23<02:04,  6.90s/it]

1.0351325538423326


 43%|████▎     | 13/30 [01:29<01:57,  6.90s/it]

1.0336465305752225


 47%|████▋     | 14/30 [01:36<01:50,  6.89s/it]

1.0305094917615254


 50%|█████     | 15/30 [01:43<01:43,  6.89s/it]

1.0235816968811884


 53%|█████▎    | 16/30 [01:50<01:36,  6.88s/it]

1.020192735724979


 57%|█████▋    | 17/30 [01:57<01:29,  6.89s/it]

1.018860896428426


 60%|██████    | 18/30 [02:04<01:22,  6.90s/it]

1.0151818725797865


 63%|██████▎   | 19/30 [02:11<01:15,  6.90s/it]

1.0100090503692627


 67%|██████▋   | 20/30 [02:18<01:08,  6.90s/it]

1.0056979589992099


 70%|███████   | 21/30 [02:25<01:02,  6.91s/it]

1.0071354177263048


 73%|███████▎  | 22/30 [02:31<00:55,  6.90s/it]

1.0043531854947407


 77%|███████▋  | 23/30 [02:38<00:48,  6.91s/it]

1.003518237007989


 80%|████████  | 24/30 [02:45<00:41,  6.92s/it]

1.0016614529821608


 83%|████████▎ | 25/30 [02:52<00:34,  6.92s/it]

0.999110758304596


 87%|████████▋ | 26/30 [02:59<00:27,  6.93s/it]

0.9949103196461996


 90%|█████████ | 27/30 [03:06<00:20,  6.92s/it]

0.9911640948719449


 93%|█████████▎| 28/30 [03:13<00:13,  6.93s/it]

0.9881393975681729


 97%|█████████▋| 29/30 [03:20<00:06,  6.92s/it]

0.9881929357846578


100%|██████████| 30/30 [03:27<00:00,  6.91s/it]

0.9848316974110074
Generating captions...





tokenization...
computing Bleu score...
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...
computing Universal_Sentence_Encoder_Similarity score...


In [39]:
model_score5

{'Bleu_1': 0.5530517288568308,
 'Bleu_2': 0.419674710207904,
 'Bleu_3': 0.3380617028296787,
 'Bleu_4': 0.2810167137078612,
 'METEOR': 0.2414466573300259,
 'ROUGE_L': 0.4574034941925339,
 'CIDEr': 1.4262193915182797,
 'SPICE': 0.3022258218454784,
 'USC_similarity': 0.5345895438772044}

In [40]:
model_scores = defaultdict(list)
for scores in [model_score1, model_score2, model_score3, model_score4, model_score5]:
    for key, value in scores.items():
        model_scores[key].append(value)

In [41]:
model_scores

defaultdict(list,
            {'Bleu_1': [0.5534123965092614,
              0.5727106227105965,
              0.568877901387146,
              0.561765368160299,
              0.5530517288568308],
             'Bleu_2': [0.4179403265571349,
              0.4354408749041239,
              0.43487625247907197,
              0.42872225525349267,
              0.419674710207904],
             'Bleu_3': [0.3352368897321505,
              0.35167035554283893,
              0.35246948390339866,
              0.34845855173243523,
              0.3380617028296787],
             'Bleu_4': [0.2783509536825586,
              0.2944031251715931,
              0.2954909729194779,
              0.29255474607304743,
              0.2810167137078612],
             'METEOR': [0.2455818221305221,
              0.25136053607690845,
              0.25135037117289494,
              0.2513353697582052,
              0.2414466573300259],
             'ROUGE_L': [0.45918757045295083,
              0.4667369091

In [42]:
tag = '16.1'
with open(f'{root_captioning}/fz_notebooks/cv_n{tag}.json', 'w') as fp:
    json.dump(model_scores, fp)