# Image Captioning Using Deep Learning

In [1]:
import os
import argparse
import numpy as np
import matplotlib.pyplot as plt

import nltk
import pickle
from PIL import Image
from collections import Counter
from pycocotools.coco import COCO

In [2]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable

## Set Configs

In [3]:
# image configs
IMAGE_SIZE = 256  # side length (pixels) handed to resize_images() for the square resized copies
IMAGE_DIR = './datasets/train2014/'  # directory of source images read by resize_images()
RESIZED_IMAGE_DIR = './datasets/resized2014/'  # directory the resized copies are written to

# model configs
CROP_SIZE = 224  # NOTE(review): not referenced in the visible code; presumably the crop size fed to the CNN - confirm
EMBEDDING_SIZE = 256  # embedding size passed to both EncoderCNN and DecoderRNN
HIDDEN_SIZE = 512  # hidden size passed to DecoderRNN
N_LAYERS = 1  # number of layers passed to DecoderRNN
N_EPOCHS = 5  # NOTE(review): not referenced in the visible code; presumably the number of training epochs - confirm
BATCH_SIZE = 128  # NOTE(review): not referenced in the visible code; presumably the data-loader batch size - confirm
LR = 0.001  # learning rate passed to the Adam optimizer

In [4]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()

device = torch.device('cuda' if is_cuda else 'cpu')

## Prepare Images

In [5]:
def load_image(image_path, transform=None, size=(224, 224)):
    """Load an image from disk as RGB, resize it, and optionally transform it.

    Args:
        image_path: Path of the image file to open.
        transform: Optional callable applied to the resized PIL image. Its
            result must support ``unsqueeze(0)`` (e.g. a tensor), which is
            used to add a leading batch dimension.
        size: ``(width, height)`` the image is resized to. Defaults to
            ``(224, 224)``.

    Returns:
        The resized RGB PIL image when ``transform`` is ``None``; otherwise
        the transformed image with an extra leading dimension.
    """
    # The context manager closes the underlying file once the pixel data has
    # been read; convert('RGB') forces three channels so grayscale, palette
    # and RGBA files do not break a model that expects 3-channel input.
    with Image.open(image_path) as source:
        image = source.convert('RGB').resize(size, Image.LANCZOS)

    if transform is not None:
        image = transform(image).unsqueeze(0)

    return image

In [6]:
def resize_image(image, size):
    """Return a resized copy of a PIL image.

    Args:
        image: The PIL image to resize.
        size: Target size as ``(width, height)`` (a list or tuple).

    Returns:
        A new PIL image of the requested size.
    """
    # Image.ANTIALIAS was an alias of Image.LANCZOS and was removed in
    # Pillow 10; LANCZOS is the same filter and exists in all versions.
    return image.resize(size, Image.LANCZOS)

In [7]:
def resize_images(image_dir, output_dir, image_size):
    """Resize every image in ``image_dir`` and save it under ``output_dir``.

    Each file is resized to a square of ``image_size`` x ``image_size``
    pixels and written to ``output_dir`` under its original file name.
    ``output_dir`` is created when it does not exist. Progress is printed
    every 100 images.

    Args:
        image_dir: Directory containing the source images.
        output_dir: Directory the resized images are written to.
        image_size: Side length (pixels) of the square output images.
    """
    size = [image_size, image_size]

    os.makedirs(output_dir, exist_ok=True)

    file_names = os.listdir(image_dir)
    num_images = len(file_names)

    for index, file_name in enumerate(file_names):
        # Read-only access is enough; the source file is never modified.
        with open(os.path.join(image_dir, file_name), 'rb') as f:
            with Image.open(f) as source:
                # Capture the format before resizing: resize() returns a new
                # image whose `format` attribute is None.
                image_format = source.format
                resized = resize_image(source, size)
                resized.save(os.path.join(output_dir, file_name), image_format)

        if (index + 1) % 100 == 0:
            print(f'[{index+1}/{num_images}] Resized the images and saved into {output_dir}')

In [None]:
# Resize every image in IMAGE_DIR to IMAGE_SIZE x IMAGE_SIZE and write the copies to RESIZED_IMAGE_DIR.
resize_images(IMAGE_DIR, RESIZED_IMAGE_DIR, IMAGE_SIZE)

## Prepare Captions

In [9]:
class Vocabulary:
    """Two-way mapping between words and consecutive integer indices.

    Words receive indices in insertion order, starting at 0. Calling the
    instance with a word returns its index; words that were never added map
    to the index of the ``'<unknown>'`` token.
    """

    def __init__(self):
        super().__init__()

        self.word2index = {}  # word -> index
        self.index2word = {}  # index -> word
        self.num_words = 0    # next free index

    def add_word(self, word):
        """Register ``word`` under the next free index; no-op if already known."""
        if word in self.word2index:
            return

        new_index = self.num_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.num_words = new_index + 1

    def __call__(self, word):
        """Return the index of ``word``, or of ``'<unknown>'`` if it is absent."""
        key = word if word in self.word2index else '<unknown>'
        return self.word2index[key]

    def __len__(self):
        """Return the number of distinct words stored."""
        return len(self.word2index)

In [10]:
def build_vocabulary(caption_path, min_word_count, vocabulary_path=None):
    """Build a Vocabulary from a COCO caption annotation file.

    Every caption is lower-cased and tokenized with nltk; words occurring
    fewer than ``min_word_count`` times are discarded. The special tokens
    ``<pad>``, ``<start>``, ``<end>`` and ``<unknown>`` are added first, so
    they take indices 0-3.

    Args:
        caption_path: Path of the COCO caption annotation JSON file.
        min_word_count: Minimum number of occurrences for a word to be kept.
        vocabulary_path: Optional path the vocabulary is pickled to. When
            ``None`` the vocabulary is only returned, not saved.

    Returns:
        The populated ``Vocabulary`` instance.
    """
    coco = COCO(caption_path)
    counter = Counter()

    annotation_ids = coco.anns.keys()
    num_annotations = len(annotation_ids)
    for i, annotation_id in enumerate(annotation_ids):
        caption = str(coco.anns[annotation_id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (i + 1) % 1000 == 0:
            print(f'[{i+1}/{num_annotations}] Tokenized the captions.')

    # if the word frequency is less than 'min_word_count', then the word is discarded
    words = [word for word, count in counter.items() if count >= min_word_count]

    # create a vocab wrapper and add the special tokens first so that they
    # receive the lowest indices ('<pad>' must be 0 to match zero padding)
    vocab = Vocabulary()
    for special_token in ('<pad>', '<start>', '<end>', '<unknown>'):
        vocab.add_word(special_token)

    # add the remaining words to the vocabulary
    for word in words:
        vocab.add_word(word)

    # save vocabulary into pickle format
    if vocabulary_path is not None:
        with open(vocabulary_path, 'wb') as f:
            pickle.dump(vocab, f)

    return vocab

In [None]:
# Build the vocabulary from the training captions and pickle it to disk.
# The path is kept in a variable because the final print statement reports it.
vocabulary_path = './datasets/vocabulary.pkl'
vocabulary = build_vocabulary(caption_path='datasets/annotations/captions_train2014.json',
                              vocabulary_path=vocabulary_path, min_word_count=4)

print(f'Total Vocabulary Size: {len(vocabulary)}')
print(f'Saved the vocabulary wrapper to {vocabulary_path}')

## Set Data Loader

In [11]:
class COCODataset(torch.utils.data.Dataset):
    """COCO caption dataset yielding ``(image, caption_indices)`` pairs.

    There is one sample per caption annotation, so an image with several
    captions appears several times.

    Args:
        image_dir: Directory containing the image files.
        coco_path: Path of the COCO caption annotation JSON file.
        vocab: Callable mapping a token to its integer index; must know the
            ``'<start>'`` and ``'<end>'`` tokens.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, image_dir, coco_path, vocab, transform=None):

        super(COCODataset, self).__init__()

        self.image_dir = image_dir
        self.coco = COCO(coco_path)
        self.ids = list(self.coco.anns.keys())  # one entry per caption annotation
        self.vocab = vocab
        self.transform = transform

    def __getitem__(self, index):
        """Return the image and the encoded caption of annotation ``index``.

        Returns:
            A tuple ``(image, target)``: ``image`` is the RGB image (after
            ``transform`` when one was given) and ``target`` is a 1-D float
            tensor holding ``<start>``, the caption token indices and
            ``<end>``.
        """
        vocab = self.vocab
        annotation = self.coco.anns[self.ids[index]]
        image_id = annotation['image_id']
        caption = annotation['caption']
        file_name = self.coco.loadImgs(image_id)[0]['file_name']

        image = Image.open(os.path.join(self.image_dir, file_name)).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # convert caption (string) to word indices wrapped in <start> ... <end>
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        encoded = [vocab('<start>')]
        encoded.extend(vocab(token) for token in tokens)
        encoded.append(vocab('<end>'))
        target = torch.Tensor(encoded)

        return image, target

    def __len__(self):
        """Return the number of caption annotations."""
        return len(self.ids)

In [12]:
def collate_fn(data):
    """Merge a list of ``(image, caption)`` samples into one padded mini-batch.

    The input list is sorted in place by caption length, longest first.
    Captions shorter than the longest one are right-padded with zeros.

    Args:
        data: List of ``(image, caption)`` tuples, where each image is a
            tensor of identical shape and each caption is a 1-D tensor.

    Returns:
        A tuple ``(images, targets, lengths)``: the stacked images, a long
        tensor of shape ``(batch, max_length)`` with the padded captions,
        and the list of original caption lengths.
    """
    # longest caption first
    data.sort(key=lambda sample: len(sample[1]), reverse=True)
    image_tuple, caption_tuple = zip(*data)

    # tuple of image tensors -> one tensor with a leading batch dimension
    images = torch.stack(image_tuple, 0)

    # tuple of 1-D captions -> one zero-padded 2-D long tensor
    lengths = list(map(len, caption_tuple))
    targets = torch.zeros(len(caption_tuple), max(lengths)).long()

    for row, (caption, length) in enumerate(zip(caption_tuple, lengths)):
        targets[row, :length] = caption[:length]

    return images, targets, lengths

In [13]:
def get_data_loader(root, json, vocab, transform, batch_size, shuffle, num_workers):
    """Create a DataLoader over the COCO caption dataset.

    Args:
        root: Directory containing the image files.
        json: Path of the COCO caption annotation JSON file.
        vocab: Vocabulary used to turn caption tokens into indices.
        transform: Callable applied to each loaded image (or ``None``).
        batch_size: Number of samples per mini-batch.
        shuffle: Whether the DataLoader shuffles the samples.
        num_workers: Number of worker processes used for loading.

    Returns:
        A ``torch.utils.data.DataLoader`` that batches with ``collate_fn``.
    """
    # COCODataset names its arguments `image_dir` and `coco_path`.
    coco = COCODataset(image_dir=root, coco_path=json, vocab=vocab, transform=transform)

    # this data loader will return (images, captions, lengths) for each iteration
    # images: the stacked image tensors of the batch
    # captions: a tensor of shape (batch_size, padded_length)
    # lengths: a list indicating valid length for each caption
    data_loader = torch.utils.data.DataLoader(dataset=coco,
                                              batch_size=batch_size,
                                              shuffle=shuffle,
                                              num_workers=num_workers,
                                              collate_fn=collate_fn)

    return data_loader

## Build [Image Captioning](https://arxiv.org/pdf/1411.4555.pdf) Network

In [14]:
class EncoderCNN(nn.Module):
    """Image encoder: a frozen pre-trained ResNet-152 plus a trainable projection.

    The ResNet's final fully-connected layer is removed; its pooled features
    are projected to ``embedding_size`` by a linear layer followed by batch
    normalization. No gradient flows into the ResNet.

    Args:
        embedding_size: Size of the output feature vector per image.
    """

    def __init__(self, embedding_size):

        super(EncoderCNN, self).__init__()

        resnet = models.resnet152(pretrained=True) # use pre-trained resnet model
        modules = list(resnet.children())[:-1] # remove the last fully-connected layer

        self.resnet_layer = nn.Sequential(*modules)
        self.fc_layer = nn.Linear(resnet.fc.in_features, embedding_size)
        self.norm = nn.BatchNorm1d(embedding_size, momentum=0.01)

        self.init_weights()

    def init_weights(self):
        """Initialize the projection layer: N(0, 0.02) weights and zero bias."""
        self.fc_layer.weight.data.normal_(0.0, 0.02)
        self.fc_layer.bias.data.fill_(0)

    def forward(self, images):
        """Encode a batch of images into feature vectors.

        Args:
            images: Batch of image tensors accepted by the ResNet.

        Returns:
            Tensor with one ``embedding_size`` feature vector per image.
        """
        # The ResNet is used as a fixed feature extractor. Running it under
        # no_grad yields the same detached features as the former
        # `Variable(x.data)` idiom, without building an autograd graph
        # through the whole network first.
        with torch.no_grad():
            feature_vectors = self.resnet_layer(images)

        # flatten everything but the batch dimension
        feature_vectors = feature_vectors.reshape(feature_vectors.size(0), -1)
        feature_vectors = self.norm(self.fc_layer(feature_vectors))

        return feature_vectors

In [15]:
class DecoderRNN(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is fed to the LSTM as the first input step, followed by
    the embedded caption tokens; a linear layer maps each hidden state to
    vocabulary scores.

    Args:
        embedding_size: Size of the word embeddings (and of the image
            feature vector, since both are fed to the same LSTM).
        hidden_size: Size of the LSTM hidden state.
        vocab_size: Number of words in the vocabulary.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_size, hidden_size, vocab_size, n_layers):

        super(DecoderRNN, self).__init__()

        self.embedding_layer = nn.Embedding(vocab_size, embedding_size)
        self.lstm_layer = nn.LSTM(embedding_size, hidden_size, n_layers, batch_first=True)
        self.fc_layer = nn.Linear(hidden_size, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Initialize embedding and output weights uniformly in [-0.1, 0.1], bias to 0."""
        self.embedding_layer.weight.data.uniform_(-0.1, 0.1)
        self.fc_layer.weight.data.uniform_(-0.1, 0.1)
        self.fc_layer.bias.data.fill_(0)

    def sample(self, features, states=None, max_sampling_length=20):
        """Greedily generate word indices for the given image features.

        Args:
            features: Image feature vectors of shape
                ``(batch_size, embedding_size)``.
            states: Optional initial LSTM state ``(h_0, c_0)``.
            max_sampling_length: Number of words generated per image.

        Returns:
            Long tensor of word indices with shape
            ``(batch_size, max_sampling_length)``, with size-1 dimensions
            squeezed out (so a single image yields a 1-D tensor).
        """
        sample_ids = []
        inputs = features.unsqueeze(1)

        for _ in range(max_sampling_length):
            # hiddens shape: (batch_size, 1, hidden_size)
            # states: tuple (h, c), each of shape (n_layers, batch_size, hidden_size)
            hiddens, states = self.lstm_layer(inputs, states)
            outputs = self.fc_layer(hiddens.squeeze(1))
            prediction = outputs.max(1)[1]  # arg-max word index, shape (batch_size,)
            sample_ids.append(prediction)
            # the predicted word becomes the next input step
            inputs = self.embedding_layer(prediction).unsqueeze(1)

        # Each prediction is 1-D, so the steps must be stacked along a new
        # dimension; torch.cat along dim 1 fails on 1-D tensors.
        sample_ids = torch.stack(sample_ids, 1)
        return sample_ids.squeeze()

    def forward(self, feature_vectors, source_captions, lengths):
        """Compute vocabulary scores for every valid (non-padded) time step.

        Args:
            feature_vectors: Image features, ``(batch_size, embedding_size)``.
            source_captions: Padded caption indices,
                ``(batch_size, padded_length)``.
            lengths: Valid length of each caption, in descending order (as
                required by ``pack_padded_sequence``).

        Returns:
            Tensor of shape ``(sum(lengths), vocab_size)`` holding the scores
            of the packed time steps.
        """
        embeds = self.embedding_layer(source_captions)
        # prepend the image feature as the first input step of each sequence
        embeds = torch.cat((feature_vectors.unsqueeze(1), embeds), 1)
        packed = pack_padded_sequence(embeds, lengths, batch_first=True)

        hiddens, _ = self.lstm_layer(packed)
        # hiddens is a PackedSequence; element 0 is its flat data tensor
        outputs = self.fc_layer(hiddens[0])

        return outputs

#### Initialize Image Captioning Network

In [16]:
# Instantiate the image encoder and move its parameters to the selected device.
encoder = EncoderCNN(EMBEDDING_SIZE)
encoder.to(device)

EncoderCNN(
  (resnet_layer): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Con

In [None]:
# Instantiate the caption decoder (vocabulary size taken from `vocabulary`) and move it to the selected device.
decoder = DecoderRNN(EMBEDDING_SIZE, HIDDEN_SIZE, len(vocabulary), N_LAYERS)
decoder.to(device)

## Set Loss Function

In [18]:
# Cross-entropy loss, created with its default arguments.
ce_loss = nn.CrossEntropyLoss()

## Set Optimizer

In [None]:
# Optimize the decoder plus the encoder's projection and batch-norm layers;
# the encoder's ResNet parameters are deliberately left out.
params = [
    parameter
    for module in (decoder, encoder.fc_layer, encoder.norm)
    for parameter in module.parameters()
]
optimizer = torch.optim.Adam(params, lr=LR)

---