In [1]:
import os
import json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate import bleu_score
from sklearn.metrics.pairwise import cosine_similarity

START = "startseq"
STOP = "endseq"
EPOCHS = 10
AWS = True

In [2]:
torch.manual_seed(123)
np.random.seed(123)
# torch.manual_seed(23964)
# np.random.seed(6457)

In [3]:
# torch.cuda.empty_cache()
# import gc 
# gc.collect()

The following function is used to nicely format elapsed times.

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"

### Google CoLab or AWS

In [5]:
if AWS:
    root_captioning = "../../data"
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        root_captioning = "/content/drive/My Drive/data"
        COLAB = True
        print("Note: using Google CoLab")
    except:
        print("Note: not using Google CoLab")
        COLAB = False

### Clean/Build Dataset

- Read captions
- Preprocess captions


In [6]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [7]:
# get img path and caption list
# only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
test_paths, test_descriptions, max_length_test = get_img_info('valid')
max_length = max(max_length_train, max_length_test)

      
lex = set()
for sen in train_descriptions:
    [lex.update(d.split()) for d in sen]

for sen in test_descriptions:
    [lex.update(d.split()) for d in sen]


Stats on what was collected.

In [8]:
print(len(train_descriptions)) # How many images? 
print(len(test_descriptions)) # How many images? 
print(len(lex)) # How many unique words (vocab)
print(max_length) # Maximum length of a caption (in words)


8332
2084
2912
34


Display the size of the train and test sets.

In [9]:
print(len(train_paths))
print(len(test_paths))

8332
2084


In [10]:
train_paths[0]

'../../data/train/ucm_1080.jpg'

Build the sequences.  We include a **start** and **stop** token at the beginning/end.  We will later use the **start** token to begin the process of generating a caption.  Encountering the **stop** token in the generated text will let us know we are done.

In [11]:
for v in train_descriptions: 
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'

See how many discriptions were extracted.

In [12]:
train_descriptions[0]

['startseq Lots of boats docked at the harbor and the boats are closed to each other . endseq',
 'startseq Lots of boats docked neatly at the harbor . endseq',
 'startseq Many boats docked neatly at the harbor and the water is deep blue . endseq',
 'startseq Many boats docked neatly at the harbor and some positions are free . endseq',
 'startseq Lots of boats docked neatly at the harbor and the boats are closed to each other . endseq']

In [13]:
all_train_captions = []
for val in train_descriptions:
    for cap in val:
        all_train_captions.append(cap)
len(all_train_captions)

41660

In [14]:
all_train_captions[0]

'startseq Lots of boats docked at the harbor and the boats are closed to each other . endseq'

Words that do not occur very often can be misleading to neural network training.  It is better to simply remove such words.  Here we remove any words that occur less than 10 times.  We display what the total vocabulary shrunk to.

In [15]:
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1

vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))

preprocessed words 2704 ==> 901


Next we build two lookup tables for this vocabulary. One idxtoword convers index numbers to actual words to index values.  The wordtoidx lookup table performs the opposit.

In [16]:
idxtoword = {}
wordtoidx = {}

ix = 1
for w in vocab:
    wordtoidx[w] = ix
    idxtoword[ix] = w
    ix += 1
    
vocab_size = len(idxtoword) + 1 
vocab_size

902

Previously we added a start and stop token to all sentences.  We must account for this in the maximum length of captions.

In [17]:
max_length +=2
print(max_length)

36


### Loading Wikipedia2vec Embeddings

In [18]:
# # read the embedding matrix 
# with open(f'{root_captioning}/enwiki_20180420_2338_words_500d.json', 'r', encoding='utf-8') as file:
#     embeddings_index = json.load(file)

In [19]:
# embedding_dim = 500

# # Get 200-dim dense vector for each of the 10000 words in out vocabulary
# embedding_matrix = np.zeros((vocab_size, embedding_dim))
# count =0
# for word, i in wordtoidx.items():
#     #if i < max_words:
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         count += 1
#         # Words not found in the embedding index will be all zeros
#         embedding_matrix[i] = np.array(embedding_vector)

# print(f'{count} out of {vocab_size} words are found in the pre-trained matrix.')

In [20]:
# embedding_matrix.shape

### Loading Glove Embeddings

In [21]:
embeddings_index = {} 
path = os.path.join(root_captioning, 'glove.6B.200d.txt') if AWS\
else os.path.join(root_captioning, 'raw', 'glove.6B.200d.txt')

f = open(
    path, 
    encoding="utf-8"
)

for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print(f'Found {len(embeddings_index)} word vectors.')

400000it [00:22, 18129.87it/s]

Found 400000 word vectors.





In [22]:
embedding_dim = 200

# Get 200-dim dense vector for each of the 10000 words in out vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoidx.items():
    #if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector

In [23]:
embedding_matrix.shape

(902, 200)

### Building the Neural Network

An embedding matrix is built from Glove.  This will be directly copied to the weight matrix of the neural network.

In [24]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [25]:
class CNNModel(nn.Module):

    def __init__(self, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        # inception v3 expects (299, 299) sized images
        self.model = models.inception_v3(pretrained=pretrained, aux_logits=False)
        # remove the classification layer
        self.model =\
        nn.Sequential(
            *(list(self.model.children())[: 3]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[3: 5]),
            nn.MaxPool2d(kernel_size=3, stride=2),
            *(list(self.model.children())[5: -1])
        )

        self.input_size = 299

    def forward(self, img_input, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        img_input: torch.Tensor
            the image matrix
        train: bool (default: False)
            use the model only for feature extraction if False

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        if not train:
          # set the model to evaluation model
          self.model.eval()

        # N x 3 x 299 x 299
        features = self.model(img_input)
        # N x 2048 x 8 x 8

        return features

In [26]:
class RNNModel(nn.Module):

    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):
      
        """
        Initializes a RNNModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """

        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
                'weight': torch.FloatTensor(embedding_matrix)
            })
            self.embedding.weight.requires_grad = embedding_train

        self.dropout = nn.Dropout(p=0.5)

        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
 

    def forward(self, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        # embed the captions
        embedding = self.dropout(self.embedding(captions))

        outputs, (h, c) = self.lstm(embedding)

        return outputs, (h, c)



In [27]:
class CaptionModel(nn.Module):

    def __init__(
        self, 
        cnn_type, 
        vocab_size, 
        embedding_dim, 
        hidden_size=256,
        embedding_matrix=None, 
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        feature_size: int
            the number of features in the image matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """    
        super(CaptionModel, self).__init__() 

        # set feature_size based on cnn_type
        if cnn_type == 'vgg16':
            self.feature_size = 4096
        elif cnn_type == 'inception_v3':
            self.feature_size = 2048
        else:
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")  

        self.decoder = RNNModel(
            vocab_size, 
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )
        
        self.dropout = nn.Dropout(p=0.5)
        self.dense1 = nn.Linear(self.feature_size, hidden_size) 
        self.relu1 = nn.ReLU()
          
        self.dense2 = nn.Linear(hidden_size, hidden_size) 
        self.relu2 = nn.ReLU()
        self.dense3 = nn.Linear(hidden_size, vocab_size) 

    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor
            the image feature matrix
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            word probabilities for each position
        """

        img_features =\
        self.relu1(
            self.dense1(
                self.dropout(
                    img_features
                )
            )
        )

        decoder_out, _ = self.decoder(captions)

        # add up decoder outputs and image features
        outputs =\
        self.dense3(
            self.relu2(
                self.dense2(
                    decoder_out.add(
                        (img_features.view(img_features.size(0), 1, -1))\
                        .repeat(1, decoder_out.size(1), 1)
                    )
                )
            )
        )

        return outputs

### Train the Neural Network

In [28]:
def train(model, iterator, optimizer, criterion, clip):
    """
    train the CaptionModel

    Parameters:
    -----------
    model: CaptionModel
        a CaptionModel instance
    iterator: torch.utils.data.dataloader
        a PyTorch dataloader
    optimizer: torch.optim
        a PyTorch optimizer 
    criterion: nn.CrossEntropyLoss
        a PyTorch criterion 

    Return:
    --------
    float
        average loss
    """
    model.train()    
    epoch_loss = 0
    
    for img_features, captions in iterator:
        
        optimizer.zero_grad()

        # for each caption, the end word is not passed for training
        outputs = model(
            img_features.to(device),
            captions[:, :-1].to(device)
        )

        loss = criterion(
            outputs.view(-1, vocab_size), 
            captions[:, 1:].flatten().to(device)
        )
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
    return epoch_loss / len(iterator)

In [29]:
class SampleDataset(Dataset):
    def __init__(
        self,
        descriptions,
        imgs,
        wordtoidx,
        max_length
    ):
        """
        Initializes a SampleDataset

        Parameters:
        -----------
        descriptions: list
            a list of captions
        imgs: numpy.ndarray
            the image features
        wordtoidx: dict
            the dict to get word index
        max_length: int
            all captions will be padded to this size
        """        
        self.imgs = imgs
        self.descriptions = descriptions
        self.wordtoidx = wordtoidx
        self.max_length = max_length

    def __len__(self):
        """
        Returns the batch size

        Return:
        --------
        int
            the batch size
        """
        # return len(self.descriptions)
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Prepare data for each image

        Parameters:
        -----------
        idx: int
          the index of the image to process

        Return:
        --------
        list, list, list
            [5 x image feature matrix],
            [five padded captions for this image]
            [the length of each caption]
        """

        img = self.imgs[idx // 5]
        # convert each word into a list of sequences.
        seq = [self.wordtoidx[word] for word 
               in self.descriptions[idx // 5][idx % 5].split(' ')
               if word in self.wordtoidx]
        # pad the sequence with 0 on the right side
        in_seq = np.pad(
            seq, 
            (0, max_length - len(seq)),
            mode='constant',
            constant_values=(0, 0)
            )

        return img, in_seq


In [30]:
def init_weights(model, embedding_pretrained=True):
    """
    Initialize weights and bias in the model

    Parameters:
    -----------
    model: CaptionModel
      a CaptionModel instance
    embedding_pretrained: bool (default: True)
        not initialize the embedding matrix if True
    """  
  
    for name, param in model.named_parameters():
        if embedding_pretrained and 'embedding' in name:
            continue
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


In [31]:
def encode_image(model, img_path):
    """
    Process the images to extract features

    Parameters:
    -----------
    model: CNNModel
      a CNNModel instance
    img_path: str
        the path of the image
 
    Return:
    --------
    torch.Tensor
        the extracted feature matrix from CNNModel
    """  

    img = Image.open(img_path)

    # Perform preprocessing needed by pre-trained models
    preprocessor = transforms.Compose([
        transforms.Resize(model.input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    img = preprocessor(img)
    # Expand to 2D array
    img = img.view(1, *img.shape)
    # Call model to extract the smaller feature set for the image.
    x = model(img.to(device), False) 
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.squeeze(x)
    return x

In [32]:
def extract_img_features(img_paths, file_path, model=None):
    """
    Extracts, stores and returns image features

    Parameters:
    -----------
    img_paths: list
        the paths of images
    file_path: str
        the path to store the results
    model: CNNModel (default: None)
      a CNNModel instance

    Return:
    --------
    numpy.ndarray
        the extracted image feature matrix from CNNModel
    """ 

    if not os.path.exists(file_path):
        if model is None:
            raise Exception("Please pass a CNNModel instance.")
        start = time()
        img_features = []
        
        for image_path in tqdm(img_paths):
            
            img_features.append(
                F.adaptive_avg_pool2d(
                    (encode_image(model, image_path).cpu()), 
                    (1, 1)
                ).squeeze().data.numpy()
            )
      
        with open(file_path, "wb") as fp:
            pickle.dump(img_features, fp)

        print(f"\nGenerating set took: {hms_string(time()-start)}")

    else:
        with open(file_path, "rb") as fp:
            img_features = pickle.load(fp)

    return img_features

In [33]:
# encoder = CNNModel(pretrained=True)
# encoder.to(device)

In [34]:
train_img_features = extract_img_features(
    train_paths,
    f'{root_captioning}/train_full_9.3.pkl',
#     encoder
)

In [35]:
test_img_features = extract_img_features(
    test_paths,
    f'{root_captioning}/test_full_9.3.pkl',
#     encoder
)

In [36]:
cnn_type = 'inception_v3'
# cnn_type = 'vgg16'

caption_model_1 = CaptionModel(
    cnn_type, 
    vocab_size, 
    embedding_dim=200, 
    hidden_size=256,
    embedding_matrix=None, 
    embedding_train=True
)


caption_model_1.to(device)

CaptionModel(
  (decoder): RNNModel(
    (embedding): Embedding(902, 200, padding_idx=0)
    (dropout): Dropout(p=0.5, inplace=False)
    (lstm): LSTM(200, 256, batch_first=True)
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (dense1): Linear(in_features=2048, out_features=256, bias=True)
  (relu1): ReLU()
  (dense2): Linear(in_features=256, out_features=256, bias=True)
  (relu2): ReLU()
  (dense3): Linear(in_features=256, out_features=902, bias=True)
)

In [37]:
caption_model_2 = CaptionModel(
    cnn_type, 
    vocab_size, 
    embedding_dim=embedding_dim, 
    hidden_size=256,
    embedding_matrix=embedding_matrix, 
    embedding_train=True
)

caption_model_2.to(device)

CaptionModel(
  (decoder): RNNModel(
    (embedding): Embedding(902, 200, padding_idx=0)
    (dropout): Dropout(p=0.5, inplace=False)
    (lstm): LSTM(200, 256, batch_first=True)
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (dense1): Linear(in_features=2048, out_features=256, bias=True)
  (relu1): ReLU()
  (dense2): Linear(in_features=256, out_features=256, bias=True)
  (relu2): ReLU()
  (dense3): Linear(in_features=256, out_features=902, bias=True)
)

In [38]:
init_weights(
    caption_model_1,
    embedding_pretrained=False
)

## we will ignore the pad token in true target set
criterion = nn.CrossEntropyLoss(ignore_index = 0)

optimizer = torch.optim.Adam(
    caption_model_1.parameters(), 
    lr=0.01
)

train_dataset = SampleDataset(
    train_descriptions,
    train_img_features,
    wordtoidx,
    max_length
)

train_loader = DataLoader(
    train_dataset,
    1000
)

In [39]:
clip = 1

# model_path = f'{root_captioning}/caption-model_pytorch_inc_v3_9.3.hdf5'

# if not os.path.exists(model_path):
if True:
    start = time()

    for i in tqdm(range(EPOCHS * 3)):
        
        loss = train(caption_model_1, train_loader, optimizer, criterion, clip)
        print(loss)
        
    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 3)):

        loss = train(caption_model_1, train_loader, optimizer, criterion, clip)
        print(loss)
        
#     torch.save(caption_model, model_path)
#     print(f"\Training took: {hms_string(time()-start)}")

# else:
#     caption_model = torch.load(model_path)


  3%|▎         | 1/30 [00:00<00:19,  1.52it/s]

6.151164531707764


  7%|▋         | 2/30 [00:01<00:18,  1.54it/s]

4.9246374236212835


 10%|█         | 3/30 [00:01<00:17,  1.56it/s]

4.37503883573744


 13%|█▎        | 4/30 [00:02<00:16,  1.57it/s]

3.90399874581231


 17%|█▋        | 5/30 [00:03<00:15,  1.58it/s]

3.519996404647827


 20%|██        | 6/30 [00:03<00:15,  1.58it/s]

3.154875569873386


 23%|██▎       | 7/30 [00:04<00:14,  1.59it/s]

2.8082154062059193


 27%|██▋       | 8/30 [00:05<00:13,  1.59it/s]

2.537147707409329


 30%|███       | 9/30 [00:05<00:13,  1.60it/s]

2.35441878106859


 33%|███▎      | 10/30 [00:06<00:12,  1.56it/s]

2.210924890306261


 37%|███▋      | 11/30 [00:06<00:12,  1.57it/s]

2.056483268737793


 40%|████      | 12/30 [00:07<00:11,  1.59it/s]

1.9270703925026789


 43%|████▎     | 13/30 [00:08<00:10,  1.60it/s]

1.8229396608140733


 47%|████▋     | 14/30 [00:08<00:09,  1.61it/s]

1.7350531419118245


 50%|█████     | 15/30 [00:09<00:09,  1.62it/s]

1.623209555943807


 53%|█████▎    | 16/30 [00:10<00:08,  1.61it/s]

1.5320977634853787


 57%|█████▋    | 17/30 [00:10<00:08,  1.60it/s]

1.45037399397956


 60%|██████    | 18/30 [00:11<00:07,  1.60it/s]

1.3856452571021185


 63%|██████▎   | 19/30 [00:12<00:07,  1.50it/s]

1.3341154257456462


 67%|██████▋   | 20/30 [00:12<00:07,  1.42it/s]

1.2879682646857367


 70%|███████   | 21/30 [00:13<00:06,  1.39it/s]

1.258163485262129


 73%|███████▎  | 22/30 [00:14<00:05,  1.38it/s]

1.2057121528519525


 77%|███████▋  | 23/30 [00:15<00:05,  1.35it/s]

1.1571252478493586


 80%|████████  | 24/30 [00:15<00:04,  1.29it/s]

1.1040045155419245


 83%|████████▎ | 25/30 [00:16<00:03,  1.33it/s]

1.051502737734053


 87%|████████▋ | 26/30 [00:17<00:02,  1.40it/s]

1.0152224567201402


 90%|█████████ | 27/30 [00:17<00:02,  1.46it/s]

0.9884816077020433


 93%|█████████▎| 28/30 [00:18<00:01,  1.50it/s]

0.9693609144952562


 97%|█████████▋| 29/30 [00:19<00:00,  1.53it/s]

0.9389197892612882


100%|██████████| 30/30 [00:19<00:00,  1.52it/s]
  0%|          | 0/30 [00:00<?, ?it/s]

0.892349534564548


  3%|▎         | 1/30 [00:00<00:17,  1.65it/s]

0.8643178277545505


  7%|▋         | 2/30 [00:01<00:17,  1.63it/s]

0.839093440108829


 10%|█         | 3/30 [00:01<00:16,  1.63it/s]

0.8160745104153951


 13%|█▎        | 4/30 [00:02<00:16,  1.62it/s]

0.803276903099484


 17%|█▋        | 5/30 [00:03<00:15,  1.62it/s]

0.7954957485198975


 20%|██        | 6/30 [00:03<00:14,  1.62it/s]

0.7900225851270888


 23%|██▎       | 7/30 [00:04<00:14,  1.58it/s]

0.7864181796709696


 27%|██▋       | 8/30 [00:05<00:14,  1.50it/s]

0.7854490809970431


 30%|███       | 9/30 [00:05<00:14,  1.49it/s]

0.7795210944281684


 33%|███▎      | 10/30 [00:06<00:13,  1.48it/s]

0.7803593443499671


 37%|███▋      | 11/30 [00:07<00:12,  1.52it/s]

0.7771336634953817


 40%|████      | 12/30 [00:07<00:11,  1.56it/s]

0.7787542045116425


 43%|████▎     | 13/30 [00:08<00:10,  1.58it/s]

0.7744428416093191


 47%|████▋     | 14/30 [00:08<00:10,  1.60it/s]

0.773181762960222


 50%|█████     | 15/30 [00:09<00:09,  1.61it/s]

0.7707406116856469


 53%|█████▎    | 16/30 [00:10<00:08,  1.60it/s]

0.7695128354761336


 57%|█████▋    | 17/30 [00:10<00:08,  1.60it/s]

0.7695378197564019


 60%|██████    | 18/30 [00:11<00:07,  1.56it/s]

0.7665311727258894


 63%|██████▎   | 19/30 [00:12<00:06,  1.59it/s]

0.767546190155877


 67%|██████▋   | 20/30 [00:12<00:06,  1.57it/s]

0.7646115256680382


 70%|███████   | 21/30 [00:13<00:05,  1.59it/s]

0.7651550239986844


 73%|███████▎  | 22/30 [00:13<00:05,  1.59it/s]

0.7630699541833665


 77%|███████▋  | 23/30 [00:14<00:04,  1.59it/s]

0.7631990909576416


 80%|████████  | 24/30 [00:15<00:03,  1.61it/s]

0.7623770468764834


 83%|████████▎ | 25/30 [00:15<00:03,  1.57it/s]

0.7613599863317277


 87%|████████▋ | 26/30 [00:16<00:02,  1.58it/s]

0.760589599609375


 90%|█████████ | 27/30 [00:17<00:01,  1.55it/s]

0.7567144996590085


 93%|█████████▎| 28/30 [00:17<00:01,  1.57it/s]

0.7571767800384097


 97%|█████████▋| 29/30 [00:18<00:00,  1.57it/s]

0.757081194056405


100%|██████████| 30/30 [00:19<00:00,  1.57it/s]

0.7569080822997623





In [40]:
init_weights(
    caption_model_2,
    embedding_pretrained=True
)

## we will ignore the pad token in true target set
criterion = nn.CrossEntropyLoss(ignore_index = 0)

optimizer = torch.optim.Adam(
    caption_model_2.parameters(), 
    lr=0.01
)

train_dataset = SampleDataset(
    train_descriptions,
    train_img_features,
    wordtoidx,
    max_length
)

train_loader = DataLoader(
    train_dataset,
    1000
)

In [41]:
clip = 1

# model_path = f'{root_captioning}/caption-model_pytorch_inc_v3_9.3.hdf5'

# if not os.path.exists(model_path):
if True:
    start = time()

    for i in tqdm(range(EPOCHS * 3)):
        
        loss = train(caption_model_2, train_loader, optimizer, criterion, clip)
        print(loss)
        
    # reduce the learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1e-4

    for i in tqdm(range(EPOCHS * 3)):

        loss = train(caption_model_2, train_loader, optimizer, criterion, clip)
        print(loss)
        
#     torch.save(caption_model, model_path)
#     print(f"\Training took: {hms_string(time()-start)}")

# else:
#     caption_model = torch.load(model_path)


  3%|▎         | 1/30 [00:00<00:18,  1.58it/s]

7.026349226633708


  7%|▋         | 2/30 [00:01<00:17,  1.59it/s]

4.9601197772555885


 10%|█         | 3/30 [00:01<00:16,  1.59it/s]

4.5973747041490345


 13%|█▎        | 4/30 [00:02<00:16,  1.60it/s]

4.373509777916802


 17%|█▋        | 5/30 [00:03<00:15,  1.60it/s]

3.957859913508097


 20%|██        | 6/30 [00:03<00:14,  1.60it/s]

3.4489263693491616


 23%|██▎       | 7/30 [00:04<00:14,  1.60it/s]

3.0525982644822864


 27%|██▋       | 8/30 [00:04<00:13,  1.61it/s]

2.7514285246531167


 30%|███       | 9/30 [00:05<00:13,  1.61it/s]

2.506771935356988


 33%|███▎      | 10/30 [00:06<00:12,  1.61it/s]

2.3021128707461886


 37%|███▋      | 11/30 [00:06<00:11,  1.60it/s]

2.1413431432512073


 40%|████      | 12/30 [00:07<00:11,  1.57it/s]

2.022365027003818


 43%|████▎     | 13/30 [00:08<00:11,  1.49it/s]

1.9160572025511


 47%|████▋     | 14/30 [00:08<00:10,  1.52it/s]

1.8122175931930542


 50%|█████     | 15/30 [00:09<00:09,  1.56it/s]

1.7173647085825603


 53%|█████▎    | 16/30 [00:10<00:08,  1.58it/s]

1.6399002605014377


 57%|█████▋    | 17/30 [00:10<00:08,  1.51it/s]

1.56422135565016


 60%|██████    | 18/30 [00:11<00:08,  1.44it/s]

1.4979095194074843


 63%|██████▎   | 19/30 [00:12<00:07,  1.40it/s]

1.4437693225012884


 67%|██████▋   | 20/30 [00:13<00:07,  1.39it/s]

1.4049056768417358


 70%|███████   | 21/30 [00:13<00:06,  1.38it/s]

1.364296509159936


 73%|███████▎  | 22/30 [00:14<00:05,  1.37it/s]

1.32752439710829


 77%|███████▋  | 23/30 [00:15<00:05,  1.35it/s]

1.2900399896833632


 80%|████████  | 24/30 [00:16<00:04,  1.34it/s]

1.256981333096822


 83%|████████▎ | 25/30 [00:16<00:03,  1.30it/s]

1.2290719085269504


 87%|████████▋ | 26/30 [00:17<00:02,  1.34it/s]

1.2057688170009189


 90%|█████████ | 27/30 [00:18<00:02,  1.38it/s]

1.1816046370400324


 93%|█████████▎| 28/30 [00:18<00:01,  1.44it/s]

1.141560243235694


 97%|█████████▋| 29/30 [00:19<00:00,  1.50it/s]

1.1081683900621202


100%|██████████| 30/30 [00:20<00:00,  1.49it/s]
  0%|          | 0/30 [00:00<?, ?it/s]

1.0810852977964613


  3%|▎         | 1/30 [00:00<00:18,  1.60it/s]

1.0536504652765062


  7%|▋         | 2/30 [00:01<00:17,  1.60it/s]

1.029432521926032


 10%|█         | 3/30 [00:01<00:16,  1.61it/s]

1.0138022303581238


 13%|█▎        | 4/30 [00:02<00:16,  1.62it/s]

1.0014275776015387


 17%|█▋        | 5/30 [00:03<00:15,  1.62it/s]

0.9926450252532959


 20%|██        | 6/30 [00:03<00:14,  1.63it/s]

0.9901778830422295


 23%|██▎       | 7/30 [00:04<00:14,  1.63it/s]

0.988030195236206


 27%|██▋       | 8/30 [00:04<00:13,  1.62it/s]

0.9834972553782992


 30%|███       | 9/30 [00:05<00:13,  1.58it/s]

0.9811401698324416


 33%|███▎      | 10/30 [00:06<00:12,  1.60it/s]

0.9793292350239224


 37%|███▋      | 11/30 [00:06<00:11,  1.61it/s]

0.978055112891727


 40%|████      | 12/30 [00:07<00:11,  1.62it/s]

0.9795799718962775


 43%|████▎     | 13/30 [00:08<00:10,  1.62it/s]

0.976159950097402


 47%|████▋     | 14/30 [00:08<00:10,  1.54it/s]

0.9729796118206449


 50%|█████     | 15/30 [00:09<00:09,  1.57it/s]

0.9748335414462619


 53%|█████▎    | 16/30 [00:09<00:08,  1.59it/s]

0.9729436702198453


 57%|█████▋    | 17/30 [00:10<00:08,  1.57it/s]

0.9708554281128777


 60%|██████    | 18/30 [00:11<00:07,  1.58it/s]

0.9702145324812995


 63%|██████▎   | 19/30 [00:11<00:06,  1.60it/s]

0.9694207774268256


 67%|██████▋   | 20/30 [00:12<00:06,  1.61it/s]

0.9681843386756049


 70%|███████   | 21/30 [00:13<00:05,  1.62it/s]

0.9665178457895914


 73%|███████▎  | 22/30 [00:13<00:04,  1.63it/s]

0.9665931132104661


 77%|███████▋  | 23/30 [00:14<00:04,  1.63it/s]

0.9662931031650968


 80%|████████  | 24/30 [00:14<00:03,  1.63it/s]

0.9656590951813592


 83%|████████▎ | 25/30 [00:15<00:03,  1.60it/s]

0.9650584657986959


 87%|████████▋ | 26/30 [00:16<00:02,  1.61it/s]

0.9627931978967454


 90%|█████████ | 27/30 [00:16<00:01,  1.62it/s]

0.9626719156901041


 93%|█████████▎| 28/30 [00:17<00:01,  1.58it/s]

0.9593049089113871


 97%|█████████▋| 29/30 [00:18<00:00,  1.59it/s]

0.9597794413566589


100%|██████████| 30/30 [00:18<00:00,  1.60it/s]

0.9598200784789191





#### Compare randomly initialized embeddings learned from scratch and pre-trained embeddings trained on captions

In [42]:
word_pairs = [('buildings','pools'), 
              ('ocean', 'beach'), 
              ('trees', 'green'), 
              ('several', 'road'), 
              ('planted', 'mountain'),
              ('bridge', 'river'),
              ('pond', 'desert')
             ]

In [43]:
results = {'word1': [],
           'word2': [],
           'cosine similarity (learned from scratch)': [],
           'cosine similarity (pre-trained)': []}
for word in word_pairs:

    results['word1'].append(word[0])
    results['word2'].append(word[1])

    results['cosine similarity (learned from scratch)']\
    .append(
        np.squeeze(
        cosine_similarity(
            caption_model_1.decoder.embedding.weight.cpu().data.numpy()[wordtoidx[word[0]]].reshape(1, -1),
            caption_model_1.decoder.embedding.weight.cpu().data.numpy()[wordtoidx[word[1]]].reshape(1, -1)
        ))) 
    
    results['cosine similarity (pre-trained)']\
    .append(
        np.squeeze(
        cosine_similarity(
            caption_model_2.decoder.embedding.weight.cpu().data.numpy()[wordtoidx[word[0]]].reshape(1, -1),
            caption_model_2.decoder.embedding.weight.cpu().data.numpy()[wordtoidx[word[1]]].reshape(1, -1)
        )))
   


In [44]:
pd.DataFrame(results)

Unnamed: 0,word1,word2,cosine similarity (learned from scratch),cosine similarity (pre-trained)
0,buildings,pools,0.5774896,0.290734
1,ocean,beach,0.23822954,0.4060871
2,trees,green,-0.15888813,0.087963894
3,several,road,-0.051148847,-0.14882685
4,planted,mountain,-0.11317037,0.11385666
5,bridge,river,0.33074567,0.45866707
6,pond,desert,0.073872365,0.2261277
