# Caption Generator

To generate the caption we can use some of the functions from the 'Deep Learning Modelling' notebook. 

I will create a new tonkenizer just for the image that we want to create a caption for. 

In [1]:
from pickle import load, dump
from numpy import argmax
from keras.preprocessing.sequence import pad_sequences
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.models import load_model

Using TensorFlow backend.


Functions for loading the .txt index files with the training and test file names.

- load_doc() loads, reads, and saves the text to the memory.
- load_set() loads the photo file names and saves them to the memory.

In [2]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)

Loading cleaned descriptors and file names, and returns a list of descriptors to file names. 

load_clean_descriptions() loads up the index that contains the list of image filenames and corresponding descriptors. Splits the filename from the description, and then saves them to a dictionary with the descriptions as the keys and filenames as the values. Each description is wrapped with 'startseq' at the start and 'endseq' at the end. 

In [3]:
def load_clean_descriptions(filename, dataset):
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        captions = line.split()
        image_id, image_desc = captions[0], captions[1:]
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = list()
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions

to_lines() takes the descriptions dictionary, splits the filename from the descriptions and saves the descriptions it to a list.

In [4]:
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

create_tokenizer() takes the list of descriptions and fits it to the Keras Tokenizer(). After fitting the tokenizer we have a vocabulary size of 7579.

In [5]:
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer



We can load the tokenizer adhoc, without loading the full training set. 

In [6]:
filename = './instabang/instabang_text/instabang_train.txt'
train = load_set(filename)
print('Dataset:', len(train))

train_descriptions = load_clean_descriptions('insta_descriptions.txt', train)
print('Descriptions:', len(train_descriptions))

tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer_insta.pkl', 'wb'))

Dataset: 8667
Descriptions: 8667


### Loading the image to the model

First we load the tokenizer and then the model we have chosen from the 20 epochs define_model function. 

In [7]:
tokenizer = load(open('tokenizer_insta.pkl', 'rb'))
max_length = 437

In [8]:
model = load_model('./models/model-ep015-loss1.649-val_loss1.067.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Using the VGG16 model, we can extract the features of the input image and use the output from the VGG16 as input for our existing model. 

In [9]:
def extract_features(filename):
    model = VGG16()
    model.layers.pop()
    model = Model(input=model.inputs, output=model.layers[-1].output)
    image = load_img(filename, target_size=(224,224))
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = model.predict(image, verbose=0)
    return feature

# photo = extract_features('./watchluke/watchluke_images/2020-01-13_10-16-53_UTC.jpg')

word_for_id() function maps integers to the words in the tokenizer.

In [10]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

First, we need to be able to generate a description for a photo using a trained model.

This involves passing in the start description token ‘startseq‘, generating one word, then calling the model recursively with generated words as input until the end of sequence token is reached ‘endseq‘ or the maximum description length is reached.

In [11]:
def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = argmax(yhat)
        word = word_for_id(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

In [12]:
photo = extract_features('./instabang/instabang_images/2019-07-28_18-08-50_UTC.jpg')

description = generate_desc(model, tokenizer, photo, max_length)
print(description)

  after removing the cwd from sys.path.


ValueError: Error when checking input: expected input_4 to have shape (286,) but got array with shape (437,)