# Image Captioning Using Deep Learning With Attention Mechanism

In [3]:
import os
import json
import random
import numpy as np
from scipy.misc import imread, imresize
from collections import Counter
from tqdm import tqdm_notebook

In [4]:
import torch
import torch.nn as nn
import torchvision

## Set Configs

In [5]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

## Set Utils

In [None]:
def _build_word_vocab(word_freq, min_word_freq):
    # Map every word seen more than `min_word_freq` times to a positive id; id 0 is reserved for <pad>.
    frequent_words = [word for word, count in word_freq.items() if count > min_word_freq]
    word_vocab = {word: index for index, word in enumerate(frequent_words, start=1)}
    for special_token in ('<unk>', '<start>', '<end>'):
        word_vocab[special_token] = len(word_vocab) + 1
    word_vocab['<pad>'] = 0
    return word_vocab


def _encode_caption(caption, word_vocab, max_length):
    # Encode as <start> + word ids + <end>, right-padded with <pad> to max_length + 2 entries.
    unknown_id = word_vocab['<unk>']
    return ([word_vocab['<start>']]
            + [word_vocab.get(word, unknown_id) for word in caption]
            + [word_vocab['<end>']]
            + [word_vocab['<pad>']] * (max_length - len(caption)))


def _read_image_chw(path):
    # Load an image, replicate grayscale to 3 channels, resize to 256x256 and return it channel-first.
    # NOTE(review): imread/imresize come from scipy.misc, which recent SciPy releases no longer
    # provide - confirm the pinned SciPy version (and that Pillow is installed).
    image = imread(path)
    if len(image.shape) == 2:
        image = np.stack([image, image, image], axis=2)
    image = imresize(image, (256, 256))
    image = image.transpose(2, 0, 1)

    # sanity check
    assert image.shape == (3, 256, 256)
    assert np.max(image) <= 255
    return image


def create_input_files(datasets, karpathy_json_path, image_dir, output_dir, captions_per_image, min_word_freq, max_length=100):
    """Preprocess a captioning dataset into HDF5 image files and JSON caption files.

    For each of the TRAIN / VALID / TEST splits this writes, under ``<output_dir>/data/``:

    * ``<SPLIT>_IMAGES_<base>.hdf5``  - dataset ``images`` of shape (N, 3, 256, 256), uint8,
      plus the file attribute ``captions_per_image``;
    * ``<SPLIT>_CAPTIONS_<base>.json`` - encoded captions, ``captions_per_image`` per image;
    * ``<SPLIT>_CAPLENS_<base>.json``  - caption lengths (tokens + <start> + <end>);

    and a single ``WORD_VOCAB_<base>.json`` with the word -> id mapping.

    Args:
        datasets: one of 'coco', 'flickr8k', 'flickr30k'.
        karpathy_json_path: path to the JSON file with the splits and tokenized captions
            (read keys: 'images', 'sentences', 'tokens', 'split', 'filename', and
            'filepath' for 'coco').
        image_dir: folder that contains the images.
        output_dir: folder under which the ``data/`` output directory is created.
        captions_per_image: number of captions sampled for every image.
        min_word_freq: words occurring this many times or fewer are encoded as <unk>.
        max_length: captions with more tokens than this are dropped.

    Raises:
        AssertionError: if ``datasets`` is not a supported name or a sanity check fails.
    """
    # h5py is used below but is not imported at the top of the file; importing it here keeps
    # the function self-contained.
    import h5py

    assert datasets in {'coco', 'flickr8k', 'flickr30k'}

    # read Karpathy's json
    with open(karpathy_json_path, 'r') as json_file:
        data = json.load(json_file)

    # image paths and captions, grouped by the output split they belong to
    split_names = ('TRAIN', 'VALID', 'TEST')
    split_of = {'train': 'TRAIN', 'restval': 'TRAIN', 'val': 'VALID', 'test': 'TEST'}
    paths_by_split = {name: [] for name in split_names}
    captions_by_split = {name: [] for name in split_names}

    word_freq = Counter()

    for image in tqdm_notebook(data['images']):
        captions = []
        for sentence in image['sentences']:
            # word frequencies are counted over every caption, including the over-long ones
            word_freq.update(sentence['tokens'])
            if len(sentence['tokens']) <= max_length:
                captions.append(sentence['tokens'])

        if not captions:
            continue

        if datasets == 'coco':
            path = os.path.join(image_dir, image['filepath'], image['filename'])
        else:
            path = os.path.join(image_dir, image['filename'])

        target_split = split_of.get(image['split'])
        if target_split is None:
            # images from any other split are ignored
            continue
        paths_by_split[target_split].append(path)
        captions_by_split[target_split].append(captions)

    # sanity check
    for name in split_names:
        assert len(paths_by_split[name]) == len(captions_by_split[name])

    # create vocabulary
    word_vocab = _build_word_vocab(word_freq, min_word_freq)

    # create a base/ root name for all output files
    base_filename = datasets + '_' + str(captions_per_image) + '_cap_per_img_' + str(min_word_freq) + '_min_word_freq'

    # every output goes to <output_dir>/data/; make sure that folder exists
    data_dir = os.path.join(output_dir, 'data')
    os.makedirs(data_dir, exist_ok=True)

    # save word vocabulary to a JSON
    with open(os.path.join(data_dir, 'WORD_VOCAB_' + base_filename + '.json'), 'w') as vocab_file:
        json.dump(word_vocab, vocab_file)

    # sample captions for each image, save images to HDF5 file and captions and their lengths to JSON files
    random.seed(9)
    for split in split_names:
        image_paths = paths_by_split[split]
        image_captions = captions_by_split[split]

        encoded_captions = []
        captions_length = []

        # 'w' (not 'a'): a re-run must replace the file, otherwise create_dataset fails
        # because the dataset already exists.
        hdf5_path = os.path.join(data_dir, split + '_IMAGES_' + base_filename + '.hdf5')
        with h5py.File(hdf5_path, 'w') as hdf5_file:

            # make a note of the number of captions we are sampling per image
            hdf5_file.attrs['captions_per_image'] = captions_per_image

            # create dataset inside HDF5 file to store images; the name must be 'images'
            # because that is the key CaptionDataset reads back
            images = hdf5_file.create_dataset('images', (len(image_paths), 3, 256, 256), dtype='uint8')

            print(f'\nReading {split} images and captions, storing to file...\n')

            for i, path in enumerate(image_paths):

                # sample captions: pad with random repeats when there are too few,
                # otherwise draw without replacement
                available = image_captions[i]
                if len(available) < captions_per_image:
                    captions = available + [random.choice(available)
                                            for _ in range(captions_per_image - len(available))]
                else:
                    captions = random.sample(available, k=captions_per_image)

                # sanity check
                assert len(captions) == captions_per_image

                # read the image and save it to the HDF5 file
                images[i] = _read_image_chw(path)

                for caption in captions:
                    encoded_captions.append(_encode_caption(caption, word_vocab, max_length))
                    # caption length includes <start> and <end>
                    captions_length.append(len(caption) + 2)

            # sanity check
            assert images.shape[0] * captions_per_image == len(encoded_captions) == len(captions_length)

        # save encoded captions and their lengths to JSON files
        with open(os.path.join(data_dir, split + '_CAPTIONS_' + base_filename + '.json'), 'w') as captions_file:
            json.dump(encoded_captions, captions_file)

        with open(os.path.join(data_dir, split + '_CAPLENS_' + base_filename + '.json'), 'w') as caplens_file:
            json.dump(captions_length, caplens_file)

In [None]:
# Preprocess the COCO data: 5 captions per image, captions capped at 50 tokens,
# and a minimum word frequency threshold of 5.
create_input_files(
    datasets='coco',
    karpathy_json_path='./datasets/karpathy_captions/datasets_coco.json',
    image_dir='./datasets/',
    output_dir='./datasets/',
    captions_per_image=5,
    min_word_freq=5,
    max_length=50,
)

## Set Data Loader

In [None]:
class CaptionDataset(torch.utils.data.Dataset):
    """Dataset of (image, caption) pairs backed by an HDF5 image file and JSON caption files.

    Every image has ``captions_per_image`` captions, stored consecutively, so the dataset
    has one item per caption and item ``i`` uses image ``i // captions_per_image``.

    Args:
        data_folder: folder that contains the ``data/`` directory with the preprocessed files.
        data_name: base name shared by the preprocessed files.
        split: one of 'TRAIN', 'VALID', 'TEST'.
        transform: optional callable applied to the image tensor.
    """

    def __init__(self, data_folder, data_name, split, transform=None):
        # h5py is not imported at the top of the file; import it where it is needed
        import h5py

        super(CaptionDataset, self).__init__()

        self.split = split
        assert self.split in {'TRAIN', 'VALID', 'TEST'}

        # open hdf5 file where images are stored (kept open for the lifetime of the dataset);
        # the relative 'data/' prefix matters: an absolute fragment would make
        # os.path.join discard data_folder
        self.hdf5 = h5py.File(os.path.join(data_folder, 'data/' + self.split + '_IMAGES_' + data_name + '.hdf5'), 'r')
        self.images = self.hdf5['images']

        # captions per image
        self.cpi = self.hdf5.attrs['captions_per_image']

        # load encoded captions (completely into memory)
        with open(os.path.join(data_folder, 'data/' + self.split + '_CAPTIONS_' + data_name + '.json'), 'r') as file:
            self.captions = json.load(file)

        # load captions lengths (completely into memory)
        with open(os.path.join(data_folder, 'data/' + self.split + '_CAPLENS_' + data_name + '.json'), 'r') as file:
            self.caplens = json.load(file)

        # pytorch transformation pipeline for the image (normalizing, etc.)
        self.transform = transform

        # total number of data points (one per caption, not per image)
        self.dataset_size = len(self.captions)

    def __getitem__(self, i):
        """Return the i-th sample.

        Returns:
            ``(image, caption, caplen)`` for the TRAIN split; for the other splits
            ``(image, caption, caplen, all_captions)`` where ``all_captions`` holds every
            caption of the same image (used to compute BLEU-4).
        """
        # remember, the Nth caption corresponds to the (N // captions_per_image)th image
        image_index = i // self.cpi
        # pixel values are scaled to [0, 1] as float32
        image = torch.as_tensor(self.images[image_index] / 255., dtype=torch.float32)
        if self.transform is not None:
            image = self.transform(image)

        caption = torch.LongTensor(self.captions[i])
        caplen = torch.LongTensor([self.caplens[i]])

        if self.split == 'TRAIN':
            return image, caption, caplen

        # for validation or testing, also return all 'captions_per_image' captions to find BLEU-4 score
        first_caption = image_index * self.cpi
        all_captions = torch.LongTensor(self.captions[first_caption:first_caption + self.cpi])
        return image, caption, caplen, all_captions

    def __len__(self):
        """Return the number of captions in the split."""
        return self.dataset_size

## Build [Image Captioning](https://arxiv.org/pdf/1411.4555.pdf) Network with [Attention](https://arxiv.org/pdf/1502.03044.pdf)

In [None]:
class EncoderCNN(nn.Module):
    """Image encoder: pre-trained ResNet-101 trunk followed by adaptive average pooling.

    Args:
        image_size: side length of the square feature map produced by the adaptive pool.
    """

    def __init__(self, image_size=14):
        super().__init__()

        self.image_size = image_size

        # ImageNet pre-trained ResNet-101 without its last two children (pool and linear layers)
        backbone = torchvision.models.resnet101(pretrained=True)
        self.resnet_layer = nn.Sequential(*list(backbone.children())[:-2])

        # pooling to a fixed grid lets the encoder accept images of variable size
        self.adaptive_pool_layer = nn.AdaptiveAvgPool2d((image_size, image_size))

        # set which encoder parameters receive gradients
        self.fine_tune()

    def fine_tune(self, is_fine_tune=True):
        """Freeze the whole trunk, then let its children from index 5 onwards follow ``is_fine_tune``."""
        for weight in self.resnet_layer.parameters():
            weight.requires_grad = False

        # only the later blocks (children 5 and up) are ever fine-tuned
        tunable_blocks = list(self.resnet_layer.children())[5:]
        for block in tunable_blocks:
            for weight in block.parameters():
                weight.requires_grad = is_fine_tune

    def forward(self, images):
        """Encode a batch of images into a channel-last feature grid.

        Returns:
            Tensor of shape (batch_size, image_size, image_size, channels), where
            ``image_size`` is the pooled grid size given to the constructor.
        """
        trunk_features = self.resnet_layer(images)  # (batch_size, channels, h, w)
        pooled_features = self.adaptive_pool_layer(trunk_features)  # (batch_size, channels, image_size, image_size)
        return pooled_features.permute(0, 2, 3, 1)  # (batch_size, image_size, image_size, channels)

In [138]:
class Attention(nn.Module):
    """Soft attention over the encoded image locations.

    Args:
        encoder_dim: feature size of each encoded image location.
        decoder_dim: size of the decoder's hidden state.
        attention_dim: size of the shared space both inputs are projected into.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):

        super(Attention, self).__init__()

        self.encoder_attention_layer = nn.Linear(encoder_dim, attention_dim) # linear layer to transform encoded image
        self.decoder_attention_layer = nn.Linear(decoder_dim, attention_dim) # linear layer to transform decoder's output
        self.total_attention_layer = nn.Linear(attention_dim, 1) # linear layer to calculate values to be softmax-ed
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1) # softmax layer to calculate weights

    def forward(self, encoder_output, decoder_hidden):
        """Compute the attention-weighted image encoding for the current decoder state.

        Args:
            encoder_output: encoded image, (batch_size, num_pixels, encoder_dim).
            decoder_hidden: previous decoder hidden state, (batch_size, decoder_dim).

        Returns:
            attention_weighted_encoding: (batch_size, encoder_dim).
            alpha: attention weights that sum to 1 over the pixels, (batch_size, num_pixels).
        """
        encoder_attention = self.encoder_attention_layer(encoder_output) # (batch_size, num_pixels, attention_dim)
        decoder_attention = self.decoder_attention_layer(decoder_hidden) # (batch_size, attention_dim)
        # the decoder projection is broadcast over the pixel axis before scoring each location
        total_attention = self.total_attention_layer(
            self.relu(encoder_attention + decoder_attention.unsqueeze(1))).squeeze(2) # (batch_size, num_pixels)
        alpha = self.softmax(total_attention) # (batch_size, num_pixels)
        # weight the encoder features themselves (not their attention projection) so the
        # context vector keeps encoder_dim features
        attention_weighted_encoding = (encoder_output * alpha.unsqueeze(2)).sum(dim=1) # (batch_size, encoder_dim)

        return attention_weighted_encoding, alpha

In [140]:
class AttentionDecoderRNN(nn.Module):
    """LSTM caption decoder that attends over the encoded image at every time-step.

    Args:
        attention_dim: size of the attention network's internal space.
        embedding_dim: size of the word embeddings.
        decoder_dim: size of the LSTM hidden and cell states.
        vocab_size: number of words in the vocabulary.
        encoder_dim: feature size of the encoded image locations.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, attention_dim, embedding_dim, decoder_dim, vocab_size, encoder_dim=2048, dropout=0.5):

        super(AttentionDecoderRNN, self).__init__()

        self.attention_dim = attention_dim
        self.embedding_dim = embedding_dim
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size
        self.encoder_dim = encoder_dim

        # init attention network
        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)

        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.decode_step = nn.LSTMCell(embedding_dim + encoder_dim, decoder_dim, bias=True)
        self.init_hidden = nn.Linear(encoder_dim, decoder_dim) # linear layer to find initial hidden state of LSTM
        self.init_cell = nn.Linear(encoder_dim, decoder_dim) # linear layer to find initial cell state of LSTM
        self.beta = nn.Linear(decoder_dim, encoder_dim) # linear layer to create a sigmoid-activated gate
        self.sigmoid = nn.Sigmoid()
        self.fc_layer = nn.Linear(decoder_dim, vocab_size) # linear layer to find scores over vocabulary

        self.init_weights() # initialize some layers with the uniform distribution

    def init_weights(self):
        """Initialize the embedding and output layers uniformly in [-0.1, 0.1], with a zero output bias."""
        self.embedding_layer.weight.data.uniform_(-0.1, 0.1)
        self.fc_layer.weight.data.uniform_(-0.1, 0.1)
        self.fc_layer.bias.data.fill_(0)

    def init_state(self, encoder_output):
        """Derive the initial LSTM hidden and cell states from the mean encoded image.

        Args:
            encoder_output: (batch_size, num_pixels, encoder_dim).

        Returns:
            hidden, cell: each (batch_size, decoder_dim).
        """
        mean_encoder_output = encoder_output.mean(dim=1)
        hidden = self.init_hidden(mean_encoder_output)
        cell = self.init_cell(mean_encoder_output)

        return hidden, cell

    def load_pretrained_embeddings(self, embeddings):
        """Replace the embedding weights with the given tensor."""
        self.embedding_layer.weight = nn.Parameter(embeddings)

    def fine_tune_embeddings(self, is_fine_tune=True):
        """Enable or disable gradient computation for the embedding layer."""
        for param in self.embedding_layer.parameters():
            # set the flag itself; assigning to `requires_grad_` would only shadow the method
            param.requires_grad = is_fine_tune

    def forward(self, encoder_output, encoded_captions, caption_lengths):
        """Decode captions with teacher forcing.

        Args:
            encoder_output: encoded images, (batch_size, ..., encoder_dim); all middle
                dimensions are flattened into num_pixels.
            encoded_captions: (batch_size, max_caption_length) word ids.
            caption_lengths: (batch_size, 1) caption lengths.

        Returns:
            predictions: vocabulary scores, (batch_size, max(decode_lengths), vocab_size).
            encoded_captions: the captions, sorted by decreasing length.
            decode_lengths: list with caption length - 1 for every sorted sample.
            alphas: attention weights, (batch_size, max(decode_lengths), num_pixels).
            sort_id: indices that sort the batch by decreasing caption length.
        """
        batch_size = encoder_output.size(0)
        encoder_dim = encoder_output.size(-1)
        vocab_size = self.vocab_size

        # flatten image
        encoder_output = encoder_output.reshape(batch_size, -1, encoder_dim) # (batch_size, num_pixels, encoder_dim)
        num_pixels = encoder_output.size(1)

        # sort input data by decreasing lengths, so that at each time-step the samples
        # still being decoded form a prefix of the batch
        caption_lengths, sort_id = caption_lengths.squeeze(1).sort(dim=0, descending=True)
        encoder_output = encoder_output[sort_id]
        encoded_captions = encoded_captions[sort_id]

        # embedding
        embeddings = self.embedding_layer(encoded_captions) # (batch_size, max_caption_length, embedding_dim)

        # init LSTM state
        decoder_hidden, decoder_cell = self.init_state(encoder_output) # (batch_size, decoder_dim)

        # since generation process finished as soon as model generate <end> so decoding lengths are actual lengths - 1
        decode_lengths = (caption_lengths - 1).tolist()
        max_decode_length = max(decode_lengths)

        # create tensors to hold word prediction scores and alphas, on the same device as the input
        predictions = torch.zeros(batch_size, max_decode_length, vocab_size, device=encoder_output.device)
        alphas = torch.zeros(batch_size, max_decode_length, num_pixels, device=encoder_output.device)

        # at each time-step, decode by attention weights the encoder's output based on the decoder's previous hidden state
        # then generate a new word in the decoder with the previous word and the attention-weighted encoding
        for d_time in range(max_decode_length):
            # number of samples whose caption has not ended yet at this time-step
            batch_size_time = sum(length > d_time for length in decode_lengths)
            attention_weighted_encoding, alpha = self.attention(encoder_output[:batch_size_time],
                                                                decoder_hidden[:batch_size_time])

            gate = self.sigmoid(self.beta(decoder_hidden[:batch_size_time])) # (batch_size_time, encoder_dim)
            attention_weighted_encoding = gate * attention_weighted_encoding

            decoder_hidden, decoder_cell = self.decode_step(
                torch.cat([embeddings[:batch_size_time, d_time, :], attention_weighted_encoding], dim=1),
                (decoder_hidden[:batch_size_time], decoder_cell[:batch_size_time])) # (batch_size_time, decoder_dim)

            prediction = self.fc_layer(self.dropout(decoder_hidden)) # (batch_size_time, vocab_size)
            predictions[:batch_size_time, d_time, :] = prediction
            alphas[:batch_size_time, d_time, :] = alpha

        return predictions, encoded_captions, decode_lengths, alphas, sort_id

---