## Generate captions for the validation set

This notebook is built on the PyTorch v3 version of our baseline model.

In [1]:
import os
import json
from collections import defaultdict
from tqdm import tqdm
import pickle
from time import time
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from itertools import chain

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms, datasets
from torchsummary import summary
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.translate.bleu_score import sentence_bleu

# Sentinel tokens: every training caption is wrapped as "START ... STOP", and
# caption generation begins from START and halts when STOP is produced.
START = "startseq"
STOP = "endseq"
# NOTE(review): EPOCHS is not referenced by any cell visible in this notebook;
# presumably carried over from the training notebook — confirm before removing.
EPOCHS = 10


## 1. Load dataset

- Read captions
- Preprocess captions

> The caption-generating function needs the image paths, the word lookup tables, and `max_length`.

In [2]:
# Root directory that holds the caption JSON files, the image folders and the
# training outputs. It is a relative path, so it is resolved against the
# notebook's current working directory.
root_captioning = "../../s3"

In [4]:
def get_img_info(name, num=np.inf):
    """
    Reads one caption JSON file and collects image paths and raw captions

    Parameters:
    -----------
    name: str
        the json file name (without extension); also used as the
        image folder name under `root_captioning`
    num: int (default: np.inf)
        the maximum number of images to read; np.inf or None reads all

    Return:
    --------
    list, list, int
        img paths, one list of raw captions per image (same order as
        the paths), max number of tokens found in any caption read
    """
    with open(f'{root_captioning}/json/{name}.json', 'r') as json_file:
        annotations = json.load(json_file)

    paths, captions, longest = [], [], 0
    for filename, record in annotations.items():
        # stop as soon as the requested number of images has been collected
        if num is not None and len(captions) == num:
            break

        paths.append(f'{root_captioning}/{name}/{filename}')

        sentences = record['sentences']
        # the length is measured on the tokenized form, the caption kept is raw
        longest = max([longest] + [len(s['tokens']) for s in sentences])
        captions.append([s['raw'] for s in sentences])

    return paths, captions, longest


In [9]:
# get img path and caption list
# only test 800 train samples and 200 valid samples
# No `num` limit is passed, so every image of each split is loaded.
# The 'valid' split is referred to as "test" in the rest of this notebook.
train_paths, train_descriptions, max_length_train = get_img_info(name='train')
test_paths, test_descriptions, max_length_test = get_img_info(name='valid')
# longest caption (in tokens) over both splits
max_length = max([max_length_train, max_length_test])


In [10]:
print(len(train_descriptions))  # number of training images (one caption list per image)
print(len(test_descriptions))  # number of validation images (one caption list per image)
print(max_length)  # longest caption in tokens over both splits, before START/STOP are counted

8332
2084
34


In [11]:
# Paths and caption lists are appended together per image, so these counts
# should equal the description counts printed above.
print(len(train_paths))
print(len(test_paths))

8332
2084


In [12]:
# Peek at one path to check the '<root>/<split>/<filename>' layout.
train_paths[0]

'../../s3/train/ucm_1080.jpg'

In [13]:
# add start and stop token
# The captions are modified in place, so the guard keeps this cell idempotent:
# re-running it will not wrap an already wrapped caption a second time.
for captions in train_descriptions:
    for i, caption in enumerate(captions):
        if not caption.startswith(f'{START} '):
            captions[i] = f'{START} {caption} {STOP}'

In [14]:
# Captions of the first training image, now wrapped with the START/STOP tokens.
train_descriptions[0]

['startseq Lots of boats docked at the harbor and the boats are closed to each other . endseq',
 'startseq Lots of boats docked neatly at the harbor . endseq',
 'startseq Many boats docked neatly at the harbor and the water is deep blue . endseq',
 'startseq Many boats docked neatly at the harbor and some positions are free . endseq',
 'startseq Lots of boats docked neatly at the harbor and the boats are closed to each other . endseq']

In [15]:
# flatten the per-image caption lists into one list of training captions
all_train_captions = [cap for val in train_descriptions for cap in val]
len(all_train_captions)

41660

In [16]:
# First flattened caption; it should match the first caption of the first image.
all_train_captions[0]

'startseq Lots of boats docked at the harbor and the boats are closed to each other . endseq'

In [17]:
# drop words that occur fewer than 10 times in the training captions
word_count_threshold = 10
nsents = len(all_train_captions)  # number of caption sentences counted

# count every space-separated token (this includes the start/stop tokens)
word_counts = {}
for sentence in all_train_captions:
    for token in sentence.split(' '):
        if token in word_counts:
            word_counts[token] += 1
        else:
            word_counts[token] = 1

# keep first-seen order so the word indices built from `vocab` are reproducible
vocab = [
    token for token, count in word_counts.items()
    if count >= word_count_threshold
]
print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))

preprocessed words 2704 ==> 901


In [18]:
# build lookup tables
# indices start at 1; index 0 is left free for padding
idxtoword = dict(enumerate(vocab, start=1))
wordtoidx = {word: index for index, word in idxtoword.items()}

# +1 accounts for the reserved index 0
vocab_size = len(idxtoword) + 1
vocab_size

902

In [19]:
# adjust max_length
# +2 makes room for the START and STOP tokens wrapped around each caption.
# It is recomputed from the per-split maxima instead of using `+=`, so
# re-running this cell does not keep growing the value.
max_length = max(max_length_train, max_length_test) + 2
print(max_length)

36


In [20]:

# pick the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

## 2. Load functions (they are required for loading trained model)

In [23]:
class CNNModel(nn.Module):
    """
    Image feature extractor made of the convolutional layers of
    torchvision's Inception v3, without the classification layer
    """

    # Backbone layers reused by this model, in forward order. They are looked
    # up by name, so extra children of the torchvision model (pooling modules,
    # auxiliary classifier, fc) cannot shift the selection.
    _BACKBONE_LAYERS = (
        'Conv2d_1a_3x3',
        'Conv2d_2a_3x3',
        'Conv2d_2b_3x3',
        'Conv2d_3b_1x1',
        'Conv2d_4a_3x3',
        'Mixed_5b',
        'Mixed_5c',
        'Mixed_5d',
        'Mixed_6a',
        'Mixed_6b',
        'Mixed_6c',
        'Mixed_6d',
        'Mixed_6e',
        'Mixed_7a',
        'Mixed_7b',
        'Mixed_7c',
    )

    def __init__(self, pretrained=True):
        """
        Initializes a CNNModel

        Parameters:
        -----------
        pretrained: bool (default: True)
            use pretrained model if True

        """

        super(CNNModel, self).__init__()

        # Build the backbone here rather than reading a global `model`, so the
        # class works on a fresh kernel and `pretrained` is actually honoured.
        backbone = models.inception_v3(pretrained=pretrained)

        # remove the classification layer: only the named feature layers are kept
        for layer_name in self._BACKBONE_LAYERS:
            setattr(self, layer_name, getattr(backbone, layer_name))

        self.input_size = 299

    def forward(self, x, train=False):
        """
        forward of the CNNModel

        Parameters:
        -----------
        x: torch.Tensor
            the image matrix
        train: bool (default: False)
            apply dropout to the pooled features if True; when False the
            dropout is a no-op (feature extraction only)

        Return:
        --------
        torch.Tensor
            image feature matrix
        """
        # The shape comments below assume 299 x 299 inputs.
        # N x 3 x 299 x 299
        x = self.Conv2d_1a_3x3(x)
        # N x 32 x 149 x 149
        x = self.Conv2d_2a_3x3(x)
        # N x 32 x 147 x 147
        x = self.Conv2d_2b_3x3(x)
        # N x 64 x 147 x 147
        x = F.max_pool2d(x, kernel_size=3, stride=2)
        # N x 64 x 73 x 73
        x = self.Conv2d_3b_1x1(x)
        # N x 80 x 73 x 73
        x = self.Conv2d_4a_3x3(x)
        # N x 192 x 71 x 71
        x = F.max_pool2d(x, kernel_size=3, stride=2)
        # N x 192 x 35 x 35
        x = self.Mixed_5b(x)
        # N x 256 x 35 x 35
        x = self.Mixed_5c(x)
        # N x 288 x 35 x 35
        x = self.Mixed_5d(x)
        # N x 288 x 35 x 35
        x = self.Mixed_6a(x)
        # N x 768 x 17 x 17
        x = self.Mixed_6b(x)
        # N x 768 x 17 x 17
        x = self.Mixed_6c(x)
        # N x 768 x 17 x 17
        x = self.Mixed_6d(x)
        # N x 768 x 17 x 17
        x = self.Mixed_6e(x)
        # N x 768 x 17 x 17
        x = self.Mixed_7a(x)
        # N x 1280 x 8 x 8
        x = self.Mixed_7b(x)
        # N x 2048 x 8 x 8
        x = self.Mixed_7c(x)
        # N x 2048 x 8 x 8
        # Adaptive average pooling
        x = F.adaptive_avg_pool2d(x, (1, 1))
        # N x 2048 x 1 x 1
        x = F.dropout(x, training=train)
        # N x 2048 x 1 x 1
        x = torch.flatten(x, 1)
        # N x 2048

        return x

In [24]:
class RNNModel(nn.Module):
    """
    Caption decoder: embedding -> dropout -> single-layer LSTM
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_size=256,
        embedding_matrix=None,
        embedding_train=False
    ):

        """
        Initializes a RNNModel

        Parameters:
        -----------
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False; only takes effect
            when embedding_matrix is given
        """

        super(RNNModel, self).__init__()
        # index 0 is the padding index
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        if embedding_matrix is not None:

            self.embedding.load_state_dict({
              'weight': torch.FloatTensor(embedding_matrix)
            })
            # requires_grad must be set on the weight parameter; setting it on
            # the nn.Embedding module only creates an unused attribute and
            # leaves the weights trainable.
            self.embedding.weight.requires_grad = embedding_train

        self.dropout = nn.Dropout(p=0.5)

        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)


    def forward(self, captions):
        """
        forward of the RNNModel

        Parameters:
        -----------
        captions: torch.Tensor
            the padded caption matrix (word indices, batch first)

        Return:
        --------
        torch.Tensor, (torch.Tensor, torch.Tensor)
            LSTM outputs for each position, and the final hidden and
            cell states (h, c)
        """

        # embed the captions
        embedding = self.dropout(self.embedding(captions))

        outputs, (h, c) = self.lstm(embedding)

        return outputs, (h, c)



In [25]:
class CaptionModel(nn.Module):
    """
    Merges projected image features with the RNNModel decoder outputs
    and maps every caption position to vocabulary scores
    """

    def __init__(
        self,
        cnn_type,
        vocab_size,
        embedding_dim,
        hidden_size=256,
        embedding_matrix=None,
        embedding_train=False
    ):

        """
        Initializes a CaptionModel

        Parameters:
        -----------
        cnn_type: str
            the CNN type, either 'vgg16' or 'inception_v3'; it determines
            the expected image feature size (4096 or 2048)
        vocab_size: int
            the size of the vocabulary
        embedding_dim: int
            the number of features in the embedding matrix
        hidden_size: int (default: 256)
            the size of the hidden state in LSTM
        embedding_matrix: torch.Tensor (default: None)
            if not None, use this matrix as the embedding matrix
        embedding_train: bool (default: False)
            not train the embedding matrix if False
        """
        super(CaptionModel, self).__init__()

        # reject unsupported CNN types before building any layer
        if cnn_type not in ('vgg16', 'inception_v3'):
            raise Exception("Please choose between 'vgg16' and 'inception_v3'.")

        # set feature_size based on cnn_type
        self.feature_size = 4096 if cnn_type == 'vgg16' else 2048

        self.decoder = RNNModel(
            vocab_size,
            embedding_dim,
            hidden_size,
            embedding_matrix,
            embedding_train
        )

        # image branch: dropout -> dense1 -> relu1
        self.dropout = nn.Dropout(p=0.5)
        self.dense1 = nn.Linear(self.feature_size, hidden_size)
        self.relu1 = nn.ReLU()

        # merged branch: dense2 -> relu2 -> dense3
        self.dense2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.dense3 = nn.Linear(hidden_size, vocab_size)

    def forward(self, img_features, captions):
        """
        forward of the CaptionModel

        Parameters:
        -----------
        img_features: torch.Tensor
            the image feature matrix
        captions: torch.Tensor
            the padded caption matrix

        Return:
        --------
        torch.Tensor
            unnormalized vocabulary scores for each position
            (no softmax is applied here)
        """

        # project the image features to the hidden size
        projected = self.dropout(img_features)
        projected = self.dense1(projected)
        projected = self.relu1(projected)

        decoder_out, _ = self.decoder(captions)

        # copy the image vector to every caption position
        batch_size = projected.size(0)
        steps = decoder_out.size(1)
        tiled = projected.view(batch_size, 1, -1).repeat(1, steps, 1)

        # add up decoder outputs and image features
        merged = decoder_out.add(tiled)

        hidden = self.relu2(self.dense2(merged))
        outputs = self.dense3(hidden)

        return outputs

## 3. Load pre-extracted image features and pre-trained model

In [27]:
# load model
# NOTE: the file holds a whole pickled model, so the CNNModel / RNNModel /
# CaptionModel classes must already be defined, and the file must be trusted.
model_path = f'{root_captioning}/training_outputs/caption-model_pytorch_inc_v3_full.hdf5'
# map_location places the weights on the selected device, so a model saved
# on a GPU can also be loaded on a CPU-only machine
caption_model = torch.load(model_path, map_location=device)

In [29]:
# load the pre-extracted image features
# NOTE: pickle can execute code while loading; only open files produced by
# our own training pipeline.
test_features_path = f'{root_captioning}/training_outputs/test_pytorch_inc_v3_full.pkl'
with open(test_features_path, 'rb') as f:
    test_img_features = pickle.load(f)

train_features_path = f'{root_captioning}/training_outputs/train_pytorch_inc_v3_full.pkl'
with open(train_features_path, 'rb') as f2:
    train_img_features = pickle.load(f2)

## 4. Generate captions on the test data and store them as JSON

In [30]:
# load generating function
def generateCaption(img_features):
    """
    Generates a caption for one image by greedy decoding

    Starting from START, the model is run on the padded partial caption
    and the highest-scoring word at the current position is appended,
    until STOP is produced or max_length words have been generated.

    Parameters:
    -----------
    img_features: array-like
        pre-extracted features of one image; reshaped to
        (-1, caption_model.feature_size)

    Return:
    --------
    str
        the generated caption without the START and STOP tokens
    """
    # inference only: switch to eval mode once instead of on every step
    caption_model.eval()

    # the image features do not change between steps
    features = torch.FloatTensor(img_features)\
        .view(-1, caption_model.feature_size).to(device)

    in_text = START

    # no gradients are needed for generation
    with torch.no_grad():
        for i in range(max_length):

            # encode the partial caption and right-pad it with 0 to max_length
            sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
            sequence = np.pad(sequence, (0, max_length - len(sequence)),
                              mode='constant', constant_values=(0, 0))

            yhat = caption_model(
                features,
                torch.LongTensor(sequence).view(-1, max_length).to(device)
            )

            # best word index predicted at position i
            next_idx = int(yhat.view(-1, vocab_size).argmax(1)[i])

            # index 0 is padding and has no word: stop instead of raising KeyError
            word = idxtoword.get(next_idx)
            if word is None:
                break

            in_text += ' ' + word
            if word == STOP:
                break

    # drop START, and drop STOP only when it was actually generated; slicing
    # the last token unconditionally would lose a real word whenever
    # generation stopped for another reason
    final = in_text.split()[1:]
    if final and final[-1] == STOP:
        final = final[:-1]
    return ' '.join(final)

In [42]:
# generate results
# maps image file name -> generated caption
results = {}
for n, pic in enumerate(test_paths):
    # take the file name with basename so the key does not depend on how
    # deep root_captioning is
    filename = os.path.basename(pic)
    # features are stored in the same order as test_paths
    img_features = test_img_features[n]
    results[filename] = generateCaption(img_features)

In [49]:
# preview the first ten results
{name: results[name] for name in list(results)[:10]}

{'rsicd_park_3.jpg': 'many green trees and some buildings are in a park .',
 'rsicd_mountain_176.jpg': 'it is a piece of irregular khaki mountain .',
 'rsicd_denseresidential_210.jpg': 'many buildings and green trees are in a dense residential area .',
 'rsicd_school_163.jpg': 'a baseball field is surrounded by some green trees and several buildings .',
 'ucm_997.jpg': 'A house with verdant lawn surrounded and a road beside in the sparse residential area .',
 'rsicd_industrial_240.jpg': 'many buildings are in an industrial area .',
 'rsicd_00613.jpg': 'a playground is surrounded by many buildings and some green trees .',
 'ucm_637.jpg': 'An intersection with two roads vertical to each other .',
 'rsicd_commercial_190.jpg': 'many buildings are in a commercial area .',
 'rsicd_meadow_94.jpg': 'it is a piece of khaki bare land.'}

In [51]:
# save the results in the github repo
save_path = '../../591_capstone_2020-mda-mds/models/'
repo_results_file = save_path + 'test_results.json'
with open(repo_results_file, 'w') as fp:
    fp.write(json.dumps(results))

In [52]:
# save the results in the s3 bucket
s3_results_file = f'{root_captioning}/training_outputs/test_results.json'
with open(s3_results_file, 'w') as fp:
    fp.write(json.dumps(results))