In [1]:
import torch
import torch.nn as nn
from torchvision import transforms 
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence
import pickle 
import matplotlib.pyplot as plt
import numpy as np 
from PIL import Image
import os
import pandas as pd

# Building Image Captioning in PyTorch

The goal of image captioning is to describe a given image using natural language. Using neural networks, the problem can be partitioned into two separate challenges. First, we need to extract meaningful features from the image that will help us describe it. Second, we need to generate a sequence of words that best fits those features. The flexibility of neural networks allows us to take a CNN architecture and connect it directly to an LSTM network. We only need to provide proper labels to train the new network we created.
We will use pretrained networks for both feature extraction and sentence generation, and we will connect the different components needed to make image captioning work.


First, we define the encoder and decoder networks separately. The encoder takes an image and produces a latent vector of features. As those features hold information about the image, we will use that vector as the input for our decoder. The RNN decoder will produce the image caption using an LSTM architecture.

Before running the code, download the pretrained [resnet152](https://www.dropbox.com/s/ne0ixz5d58ccbbz/pretrained_model.zip?dl=0) and [vocabulary](https://www.dropbox.com/s/26adb7y9m98uisa/vocap.zip?dl=0) and place them so that they match the paths defined in the next cell.

In [2]:
# Locations of the slogans file, the pretrained checkpoints and the pickled vocabulary.
slogans_path = 'SlogansAndTypes.csv'
encoder_path = 'pretrained_model/encoder-5-3000.pkl'
decoder_path = 'pretrained_model/decoder-5-3000.pkl'
vocab_path   = 'vocab.pkl'

# The CUDA probe used to be overwritten by a hard-coded 'cpu' on the very next
# line, so the notebook always ran on the CPU. That default is kept, but as an
# explicit switch: set FORCE_CPU = False to use a GPU when one is available.
FORCE_CPU = True
use_cuda = (not FORCE_CPU) and torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# check if all the files are in place
for label, path in (("slogans_path ", slogans_path),
                    ("encoder_path ", encoder_path),
                    ("decoder_path ", decoder_path),
                    ("vocab_path ", vocab_path)):
    # os.path.isfile already returns a bool; no comparison with True needed.
    print(label, '✓' if os.path.isfile(path) else '✗')

slogans_path  ✗
encoder_path  ✗
decoder_path  ✗
vocab_path  ✗


In [3]:
# Dimension of the word-embedding vectors.
embed_size = 256
# Dimension of the LSTM hidden states.
hidden_size = 512
# Number of layers in the LSTM.
num_layers = 1

# Define the architectures

When loading pretrained networks in PyTorch, a common practice is to download the weights. In this case, you are required to define the same architecture in order to use a simple function that places the parameters in the correct place in the network. However, PyTorch has built-in functions to load the **architectures** of the networks we will be using. Later on, we will use the files we downloaded to load the initialized architectures with pretrained weights.

In [4]:
class EncoderCNN(nn.Module):
    """CNN encoder that turns a batch of images into embedding vectors.

    A ResNet-152 with its final fully connected (classification) layer removed
    produces the image features. A linear layer followed by batch
    normalisation projects those features to ``embed_size`` dimensions.
    """

    def __init__(self, embed_size):
        """Create the backbone and the projection layers.

        Args:
            embed_size: Size of the feature vector produced for each image.
        """
        super(EncoderCNN, self).__init__()
        # Only the architecture is built here; the notebook fills in the
        # trained parameters afterwards through load_state_dict.
        resnet = models.resnet152()
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        # Project the backbone features (the width the removed fc layer used
        # to receive) down to the embedding size.
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Extract one feature vector per input image.

        Args:
            images: Batch of image tensors accepted by the ResNet backbone.

        Returns:
            Tensor with one row of ``embed_size`` values per image.
        """
        # The backbone is a fixed feature extractor: no gradients flow into it.
        with torch.no_grad():
            features = self.resnet(images)
        # Flatten everything after the batch dimension so the result is 2-D,
        # which is what the linear and BatchNorm1d layers operate on.
        features = features.reshape(features.size(0), -1)
        # These two layers stay outside no_grad so they can still be trained.
        features = self.bn(self.linear(features))
        return features

class DecoderRNN(nn.Module):
    """LSTM decoder that generates caption word ids from image features."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Create the embedding, LSTM and output projection layers.

        Args:
            embed_size: Size of the word embeddings (and of the image features
                fed in as the first LSTM input).
            hidden_size: Size of the LSTM hidden state.
            vocab_size: Number of words in the vocabulary.
            num_layers: Number of stacked LSTM layers.
            max_seq_length: Number of words generated by ``sample``.
        """
        super(DecoderRNN, self).__init__()
        # NOTE: the attribute is spelled "seg" (not "seq"); the name is kept
        # so existing code that reads it keeps working.
        self.max_seg_length = max_seq_length
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions, lengths):
        """Compute vocabulary scores for every caption position.

        Args:
            features: Image feature tensor of shape (batch, embed_size).
            captions: Tensor of word ids of shape (batch, seq_len).
            lengths: Valid length of each sequence after the image feature has
                been prepended, as expected by ``pack_padded_sequence``
                (sorted in decreasing order with its default settings).

        Returns:
            Tensor of shape (sum(lengths), vocab_size) with the scores for the
            packed (non-padded) time steps.
        """
        embeddings = self.embed(captions)
        # The image feature acts as the first "word" of every sequence.
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        # hiddens is a PackedSequence; element 0 is its flat data tensor.
        outputs = self.linear(hiddens[0])
        return outputs

    def sample(self, features, states=None):
        """Greedily generate caption word ids for the given image features.

        Args:
            features: Image feature tensor of shape (batch, embed_size).
            states: Optional initial LSTM state; ``None`` starts from zeros.

        Returns:
            Tensor of shape (batch, max_seg_length) holding the predicted word
            id for each time step.
        """
        sampled_ids = []
        # Add the time dimension: (batch, embed_size) -> (batch, 1, embed_size).
        inputs = features.unsqueeze(1)
        for i in range(self.max_seg_length):
            hiddens, states = self.lstm(inputs, states)         # hiddens: (batch, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))           # outputs: (batch, vocab_size)
            _, predicted = outputs.max(1)                       # most probable word per item
            sampled_ids.append(predicted)
            # Feed the word just predicted back in as the next input.
            inputs = self.embed(predicted)
            inputs = inputs.unsqueeze(1)
        sampled_ids = torch.stack(sampled_ids, 1)

        return sampled_ids

In [5]:
def load_text(slogan_path, transform=None):
    """Read the CSV file at ``slogan_path`` and return it as a DataFrame.

    ``transform`` is accepted but not used.
    """
    return pd.read_csv(slogan_path)

In [6]:
def load_image(image_path, transform=None):
    """Open an image file, convert it to RGB and resize it to 224x224.

    Args:
        image_path: Path of the image file to open.
        transform: Optional callable applied to the resized PIL image. Its
            result must support ``unsqueeze(0)``, which prepends a batch
            dimension.

    Returns:
        The resized RGB PIL image when ``transform`` is None, otherwise the
        transformed image with a leading batch dimension.
    """
    # Force three channels: a grayscale or RGBA file would otherwise not
    # match a three-channel normalisation applied by the transform.
    # convert() returns a new image, so the file can be closed right away.
    with Image.open(image_path) as source:
        image = source.convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)

    if transform is not None:
        image = transform(image).unsqueeze(0)

    return image

class Vocabulary(object):
    """Two-way mapping between words and consecutive integer ids."""

    def __init__(self):
        # Next id to hand out; equals the number of words added so far.
        self.idx = 0
        self.idx2word = {}
        self.word2idx = {}

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of ``word``, or the id of '<unk>' if it is unknown."""
        if word in self.word2idx:
            return self.word2idx[word]
        return self.word2idx['<unk>']

    def __len__(self):
        """Return the number of distinct words stored."""
        return len(self.word2idx)

In [7]:
# Per-channel mean and standard deviation used to normalise the image tensor.
channel_mean = (0.485, 0.456, 0.406)
channel_std = (0.229, 0.224, 0.225)

# Pipeline: PIL image -> tensor -> channel-wise normalisation.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

In [8]:
# Load vocabulary wrapper
# NOTE(review): pickle.load can run arbitrary code while unpickling; only use a
# vocabulary file that comes from a source you trust.
with open(vocab_path, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

FileNotFoundError: [Errno 2] No such file or directory: 'vocab.pkl'

In [9]:
# Build models
encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
encoder = encoder.to(device)
decoder = decoder.to(device)

# Load the trained model parameters
# map_location remaps the stored tensors onto `device`, so checkpoints that
# were saved on a GPU can still be loaded on a CPU-only machine.
encoder.load_state_dict(torch.load(encoder_path, map_location=device))
decoder.load_state_dict(torch.load(decoder_path, map_location=device))

# Prepare an image
# The encoder needs an image tensor, so the image loader is used here; the
# slogans DataFrame returned by load_text has no .to(device).
# TODO(review): `image_path` was not defined anywhere in the notebook; the value
# below is a placeholder, point it at the image you want to caption.
image_path = 'example.png'
image = load_image(image_path, transform)
image_tensor = image.to(device)

NameError: name 'vocab' is not defined

In [None]:
# Generate a caption from the image
feature = encoder(image_tensor)
# Keep the first item of the batch and move its word ids to a NumPy array.
sampled_ids = decoder.sample(feature)[0].cpu().numpy()

# Convert word_ids to words, stopping right after the end-of-sentence token
sampled_caption = []
for word_id in sampled_ids:
    sampled_caption.append(vocab.idx2word[word_id])
    if sampled_caption[-1] == '<end>':
        break
sentence = ' '.join(sampled_caption)

# Print out the image and the generated caption
print(sentence)
image = Image.open(image_path)
plt.imshow(np.asarray(image))
