## Image caption generator 

Some explanation of project and the idea behind it

The training image dataset will be the Flickr8k dataset with 8092 JPEG images that are each paired with five different captions. Captions are located in another folder named Flickr8k_text with multiple textual documents regarding image captions. The dataset is available for free. The dataset is divided in 3 different groups -- training dataset (6000 images), development/validation dataset (1000 images) and test dataset (1000 images).

The pretrained model will be used for extracting features from images. The one used in this project is pretrained on ImageNet dataset. There are many options for this model but the ones tested here are going to be VGG (Oxford Visual Geometry Group that won the ImageNet competition in 2014) and Xception which can both be accessed from Keras directly.

The metric for model valiation will be BLEU score.

### 1. Import necessary packages

In [36]:
import pandas as pd
import string
import numpy as np
from PIL import Image
import os
import pydot
import graphviz
from pickle import dump, load
from numpy import array

from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras.utils.vis_utils import plot_model
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

# small library for seeing the progress of loops.
from tqdm.notebook import tqdm
tqdm().pandas()

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

  from pandas import Panel


In [2]:
data_text = '/home/maja/Documents/Maja/Projects/Image caption generator/Flickr8k_text'
data_images = '/home/maja/Documents/Maja/Projects/Image caption generator/Flickr8k_Dataset/Flicker8k_Dataset'

## 2. Prepare photo data and extract features from model

Using pre-trained model VGG

In [37]:
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Model

# extract features from each photo in the directory with VGG model
def extract_features(directory):
    # load the model
    in_layer = Input(shape=(224, 224, 3))
    model = VGG16(include_top=False, input_tensor=in_layer, pooling='avg')
    print(model.summary())
    #model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    features = dict()
    for file in listdir(directory):
        filename = directory + '/' + file
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = model.predict(image, verbose=False)
        image_id = file.split('.')[0]
        features[image_id] = feature
        print('>%s' % file)
    return features

In [None]:
features = extract_features(data_images)
#print('Extracted features: %d' % len(features))

In [39]:
dump(features, open('features.pkl', 'wb'))

### 2. Loading and cleaning text data - identifiers and captions

In [5]:
# loading text file
def load_file(filename):
    file = open(filename, 'r')
    text=file.read()
    file.close()
    return text

In [6]:
#load_file(data_text + '/' + 'Flickr8k.token.txt').split('\n')

In [44]:
# get all captions on images from text file
def get_all_captions(filename):
    file = load_file(filename)
    new_line_split = file.split('\n')
    captions = {}
    for new_line in new_line_split[:-1]:
        img, caption = new_line.split('\t')
        img = img.split('.')[0]
        if img not in captions:
            captions[img] = [caption]
        else:
            captions[img].append(caption)
    return captions

In [55]:
def load_captions(doc):
    doc = load_file(doc)
    mapping = dict()
	# process lines
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        if len(line) < 2:
            continue
		# take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
		# remove filename from image id
        image_id = image_id.split('.')[0]
		# convert description tokens back to string
        image_desc = ' '.join(image_desc)
		# store the first description for each image
        if image_id not in mapping:
            mapping[image_id] = image_desc
    return mapping

In [68]:
# get the captions and count the number of loaded captions
captions = load_captions(data_text + '/' + 'Flickr8k.token.txt')
print('The number of loaded image captions: %d' % len(captions))

The number of loaded image captions: 8092


In [41]:
# data cleaning of captions (lowercase, punctuation, numbers, signs)
def clean_captions(captions):
    table = str.maketrans('', '', string.punctuation)
    for k, v in captions.items():
        for i in range(len(v)):
            val = v[i]
            val = val.translate(table).split()
            val = [word.lower() for word in val]
            val = [word for word in val if len(word)>1]
            val = [word for word in val if word.isalpha()]
            v[i] = ' '.join(val)
        captions[k] = v    
    return captions

In [42]:
clean_captions = clean_captions(captions)

### 3. Creating vocabulary and saving clean captions into a file

In [11]:
# create vocabulary/set from the dictionary of image captions
def create_caption_vocabulary(captions):
    vocab = set()
    for k in captions.keys():
        [vocab.update(caption.split()) for caption in captions[k]]
    return vocab 

In [73]:
# get the number of unique word in the vocabulary of captions
all_captions = ' '.join(captions.values()).split()
vocab = set(all_captions)
print('The length of vocabulary is: %s' % len(vocab))

The length of vocabulary is: 4657


In [13]:
# save the cleaned captions to a file
def save_clean_captions(captions, filename):
    cap = list()
    for k, v in captions.items():
        for caption in v:
            cap.append(k + ' ' + caption)
    data = '\n'.join(cap)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [14]:
save_clean_captions(clean_captions, 'clean_captions.txt')
#open('clean_captions.txt', 'r').read()

### Develop deep learning model

1. Loading data
2. Defining the model
3. Fitting the model

#### Loading data

In [15]:
# ?? Extract feature vector?? only for Xception or all ?

In [85]:
# load clean descriptions into memory
def load_clean_captions(filename):
    doc = load_file(filename)
    captions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_caption = tokens[0], tokens[1:]
        # store
        captions[image_id] = ' '.join(image_caption)
    return captions

In [86]:
captions = load_clean_captions('clean_captions.txt')

In [16]:
# load training image data
def load_photo_identifiers(filename):
    doc = load_file(filename)
    dataset = list()
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return dataset

In [17]:
filename = data_text + '/' + 'Flickr_8k.trainImages.txt'
train_identifiers = load_photo_identifiers(filename)
print('Train image identifiers: %d' % len(train_identifiers))

Train image identifiers: 6000


In [18]:
# load train image captions 
def load_captions(filename, dataset):
    doc = load_file(filename)
    captions = dict()
    for line in doc.split('\n'):
        identifier, *caption = line.split()
        if identifier in dataset:
            if identifier not in captions:
                captions[identifier] = list()
            tokenized_caption = 'startseq ' + ' '.join(caption) + ' endseq'
            captions[identifier].append(tokenized_caption)
    return captions

In [19]:
train_captions = load_captions('clean_captions.txt', train_identifiers)
print('Train image captions: %d' % len(train_captions))

Train image captions: 6000


In [20]:
# load photo features from pickle
def load_photo_features(filename, dataset):
	all_features = load(open(filename, 'rb'))
	features = {k: all_features[k] for k in dataset}
	return features

In [21]:
train_features = load_photo_features('features.pkl', train_identifiers)
print('Photos (train): %d' % len(train_features))

Photos (train): 6000


In [22]:
# create a list of captions from dictionary of clean captions
# def to_list(captions):
#     all_captions = list()
#     for k in captions.keys():
#         [all_captions.append(cap) for cap in captions[k]]
#     return all_captions

In [81]:
# data generator to fit on 8GB RAM machines; to be used in a call to model.fit_generator()
def data_generator(captions, tokenizer, max_length, data_images):
    while 1:
        for name in listdir(data_images):
            filename = data_images + '/' + name
            image, image_id = load_photo(filename)
            caption = captions[image_id]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, caption, image)
            yield [[in_img, in_seq], out_word]

In [88]:
# fit a tokenizer given caption description
def create_tokenizer(captions):
    caption_list = list(captions.values())
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(caption_list)
    return tokenizer

In [89]:
tokenizer = create_tokenizer(captions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

Vocabulary size: 4469


In [80]:
def load_photo(filename):
	image = load_img(filename, target_size=(224, 224))
	# convert the image pixels to a numpy array
	image = img_to_array(image)
	# reshape data for the model
	image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
	# prepare the image for the VGG model
	image = preprocess_input(image)[0]
	# get image id
	image_id = filename.split('/')[-1].split('.')[0]
	return image, image_id

In [100]:
# calculate the length of the description with the most words
def max_length(captions):
    return max(len(c.split()) for c in list(captions.values()))

In [101]:
max_length = max_length(captions)
print('Max length: %d' % max_length)

Max length: 32


In [27]:
# transform data into input-output pairs for training the model
# create sequences of images, input sequences and output words for an image
# def create_sequences(tokenizer, max_length, captions, feature, vocab_size):
#     X1, X2, y = list(), list(), list()
#     for caption in captions:
#         seq = tokenizer.texts_to_sequences([caption])[0]
#         for i in range(1, len(seq)):
#             in_seq, out_seq = seq[:i], seq[i]
#             in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
#             out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
#             X1.append(feature)
#             X2.append(in_seq)
#             y.append(out_seq)
#     return array(X1), array(X2), array(y)

In [114]:
def create_sequences(tokenizer, max_length, captions, image):
    Ximages, XSeq, y = list(), list(),list()
    vocab_size = len(tokenizer.word_index) + 1
    # integer encode the description
    seq = tokenizer.texts_to_sequences([captions])[0]
    # split one sequence into multiple X,y pairs
    for i in range(1, len(seq)):
        # select
        in_seq, out_seq = seq[:i], seq[i]
        # pad input sequence
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        # encode output sequence
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        # store
        Ximages.append(image)
        XSeq.append(in_seq)
        y.append(out_seq)
    Ximages, XSeq, y = array(Ximages), array(XSeq), array(y)
    return [Ximages, XSeq, y]

In [115]:
generator = data_generator(captions, tokenizer, max_length, data_images)
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

(11, 224, 224, 3)
(11, 32)
(11, 4469)


#### Defining the model

In [116]:
# define the captioning model
def define_model(vocab_size, max_length):
    # feature extraxtor model
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summarize model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model


In [119]:
# define the model
model = define_model(vocab_size, max_length)
# train the model, run epochs manually and save after each epoch
epochs = 10
steps = len(captions)
for i in range(epochs):
    # fit for one epoch
    model.fit_generator(data_generator(captions, tokenizer, max_length, data_images), steps_per_epoch=steps, verbose=5)
    # save model
    model.save('models/model_' + str(i) + '.h5')

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 32, 256)      1144064     input_12[0][0]                   
__________________________________________________________________________________________________
dropout_8 (Dropout)             (None, 2048)         0           input_11[0][0]                   
____________________________________________________________________________________________

ValueError: in user code:

    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:541 train_step  **
        self.trainable_variables)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1804 _minimize
        trainable_variables))
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:521 _aggregate_gradients
        filtered_grads_and_vars = _filter_grads(grads_and_vars)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1219 _filter_grads
        ([v.name for _, v in grads_and_vars],))

    ValueError: No gradients provided for any variable: ['embedding_4/embeddings:0', 'dense_12/kernel:0', 'dense_12/bias:0', 'lstm_4/lstm_cell_4/kernel:0', 'lstm_4/lstm_cell_4/recurrent_kernel:0', 'lstm_4/lstm_cell_4/bias:0', 'dense_13/kernel:0', 'dense_13/bias:0', 'dense_14/kernel:0', 'dense_14/bias:0'].


In [78]:
# test dataset

# load test set identifiers
filename = data_text + '/' + 'Flickr_8k.devImages.txt'
test_identifiers = load_photo_identifiers(filename)
print('Dataset: %d' % len(test_identifiers))
# load test set captions
test_captions = load_captions('clean_captions.txt', test_identifiers)
print('Train image captions: %d' % len(test_captions))
# load test set features
test_features = load_photo_features('features.pkl', test_identifiers)
print('Photos (train): %d' % len(test_features))
tokenizer = create_tokenizer(test_captions)
vocab_size = len(tokenizer.word_index) + 1
# prepare sequences
#X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_captions, test_features, vocab_size)

Dataset: 1000
Train image captions: 1000
Photos (train): 1000


In [80]:
# define the model
model = define_model(vocab_size, max_length)
# train the model, run epochs manually and save after each epoc
epochs = 20
steps = len(test_captions)
for i in range(epochs):
    # create the data generator
    generator = data_generator(test_captions, test_features, tokenizer, max_length)
    # fit for one epoch
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # save model
    model.save('model_' + str(i) + '.h5')

Model: "model_21"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_46 (InputLayer)           [(None, 34)]         0                                            
__________________________________________________________________________________________________
input_45 (InputLayer)           [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, 34, 256)      849152      input_46[0][0]                   
__________________________________________________________________________________________________
dropout_44 (Dropout)            (None, 4096)         0           input_45[0][0]                   
___________________________________________________________________________________________

ValueError: in user code:

    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:541 train_step  **
        self.trainable_variables)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1804 _minimize
        trainable_variables))
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:521 _aggregate_gradients
        filtered_grads_and_vars = _filter_grads(grads_and_vars)
    /home/maja/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1219 _filter_grads
        ([v.name for _, v in grads_and_vars],))

    ValueError: No gradients provided for any variable: ['embedding_22/embeddings:0', 'dense_65/kernel:0', 'dense_65/bias:0', 'lstm_22/lstm_cell_22/kernel:0', 'lstm_22/lstm_cell_22/recurrent_kernel:0', 'lstm_22/lstm_cell_22/bias:0', 'dense_66/kernel:0', 'dense_66/bias:0', 'dense_67/kernel:0', 'dense_67/bias:0'].
