# Instagram Deep Learning

Loading descriptors and image files.

In [1]:
from numpy import array, argmax
from pickle import load, dump
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.utils import to_categorical, plot_model
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint, TensorBoard

from nltk.translate.bleu_score import corpus_bleu

Using TensorFlow backend.


Functions for loading the .txt index files with the training and test file names.

- load_doc() loads, reads, and saves the text to the memory.
- load_set() loads the photo file names and saves them to the memory.

In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)

Loading cleaned descriptors and file names, and returns a list of descriptors to file names. 

load_clean_descriptions() loads up the index that contains the list of image filenames and corresponding descriptors. Splits the filename from the description, and then saves them to a dictionary with the descriptions as the values and filenames as the keys. Each description is wrapped with 'startseq' at the start and 'endseq' at the end. 

In [3]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq' + ' '.join(image_desc) + 'endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions

A function that loads the photo features, from the photo filenames that are listed in the index provided in the argument 'dataset'.

In [4]:
# load training photo features, filtered by the index 'dataset'
def load_photo_features(filename, dataset):
    # load all features
    all_features = load(open(filename, 'r+b'))
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features

Loading the above functions with the descriptions, & training filenames index. 

Also, the extracted features pickle from the pre-processing with the training filenames index

In [5]:
# load training dataset (6K)
filename = './flicker8k-dataset/Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Captions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('./features_xception.pkl', train)
print('Photos: train=%d' % len(train_features))

Dataset: 6000
Captions: train=6000
Photos: train=6000


### Creating the tonkenizer.pkl file

to_lines() takes the descriptions dictionary, splits the filename from the descriptions and saves the descriptions it to a list.

create_tokenizer() takes the list of descriptions and fits it to the Keras Tokenizer(). After fitting the tokenizer we have a vocabulary size of 7579.

In [6]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 6652


Tokenizing the descriptions from the test images. 

Calculates the maximum length of a caption within the training dataset, by splitting each caption into a seperate line and returning the maximum length of one. 

In [7]:
# calculate the length of the description with the most words
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

# determine maximum length of sequences
max_length = max_length(train_descriptions)
print('Maximum Description Length:', max_length)

Maximum Description Length: 38


create_sequences() takes the dictionary of the filenames and all captions corresponding to the image, and tokenizers each of them. 

It then appends all but one tokenized caption to an array (X2) used as the training input for the model. During this process, we use pad_sequences() to extend all the caption to the max_length by added values to them. Whilst the remaining caption is appended to the output array(y), and uses to_categorical() to convert it from a class vector to a binary vector ready to be used with 'categorical_crossentropy'.

The final array (X1), contains the training images features extracted by the VGG16 model in the pre-processing. 

In [8]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
    X1, X2, y = list(), list(), list()
    # iterate through each image identifier
    for key, desc_list in descriptions.items():
        # iterate through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # store
                X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return array(X1), array(X2), array(y)

### Defining the model, and setting up to be fit. 

The feature extractor is defined to expect a 1 dimensional 4,096 element vector, using a dropout function weighting of 0.5. This is then processed through a dense layer with the activation function 'relu'. Relu is nice because its gradient is 1, which allows us to pass the maximum error through the network during back-propagation. 

The sequence processing expects an input of the maximum caption length (34 words). It is then run through an embedding function [a dense layer for text, as the traditional approach to text processing via bag-of-words is with sparse matrices] that expects the input dimensions of the vocab size [7577], produces an output with dimension of 256 elements and maps a mask over the padded values with zeros. It also includes a drop out function with a 0.5 weighting, followed by an LSTM layer with 256 memory units. 

Both text and image features extracted produce a 256 element vector. The decoder model merges the two input vectors together to a final output layer that makes a 'softmax' prediction.

softmax - turns 'logit scores'(numeric output of final neural layer within a classification network) into probability distributions of the label candidates.

In [9]:
# define the captioning model
def define_model(vocab_size, max_length):
    # feature extractor model
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(299, activation='relu')(fe1)
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 299, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(299)(se2)
    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(299, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summarize model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [10]:
# prepare training sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features, vocab_size)

In [11]:
# load test set
filename = './flicker8k-dataset/Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features_xception.pkl', test)
print('Photos: test=%d' % len(test_features))
# prepare sequences
X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, test_features, vocab_size)

Dataset: 1000
Descriptions: test=1000
Photos: test=1000


This model learns quickly and overfits, so checkpoints are made near the end of the epochs so that the best model can be observed. 

In [12]:
# define checkpoint callback
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [13]:
# define the model
model = define_model(vocab_size, max_length)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 38)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 38, 299)      1988948     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 2048)         0           input_1[0][0]                    
____________________________________________________________________________________________

In [14]:
# fit model
model.fit([X1train, X2train], ytrain, epochs=5, verbose=2, callbacks=[checkpoint], validation_data=([X1test, X2test], ytest))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 320853 samples, validate on 53155 samples
Epoch 1/5
 - 902s - loss: 4.1480 - val_loss: 3.6906

Epoch 00001: val_loss improved from inf to 3.69057, saving model to model-ep001-loss4.148-val_loss3.691.h5
Epoch 2/5
 - 858s - loss: 3.5465 - val_loss: 3.5561

Epoch 00002: val_loss improved from 3.69057 to 3.55607, saving model to model-ep002-loss3.546-val_loss3.556.h5
Epoch 3/5
 - 858s - loss: 3.3490 - val_loss: 3.4892

Epoch 00003: val_loss improved from 3.55607 to 3.48920, saving model to model-ep003-loss3.349-val_loss3.489.h5
Epoch 4/5
 - 858s - loss: 3.2387 - val_loss: 3.4811

Epoch 00004: val_loss improved from 3.48920 to 3.48107, saving model to model-ep004-loss3.239-val_loss3.481.h5
Epoch 5/5
 - 861s - loss: 3.1683 - val_loss: 3.4912

Epoch 00005: val_loss did not improve from 3.48107


<keras.callbacks.callbacks.History at 0x7fae97ff5b10>

## Using Progressive Loading

In [16]:
# data generator, intended to be used in a call to model.fit_generator()
    def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo, vocab_size)
            yield [[in_img, in_seq], out_word]

In [17]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(y)

In [18]:
# test the data generator
generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

(47, 4096)
(47, 34)
(47, 7579)


In [None]:
# train the model, run epochs manually and save after each epoch
epochs = 20
steps = len(train_descriptions)
for i in range(epochs):
    # create the data generator
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
    # fit for one epoch
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # save model
    model.save('model_' + str(i) + '.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


## Evaluating the Model

We will evaluate a model by generating descriptions for all photos in the test dataset and evaluating those predictions with a standard cost function.

First, we need to be able to generate a description for a photo using a trained model.

This involves passing in the start description token ‘startseq‘, generating one word, then calling the model recursively with generated words as input until the end of sequence token is reached ‘endseq‘ or the maximum description length is reached.

In [15]:
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
    in_text = 'startseq'
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([photo,sequence], verbose=0)
        # convert probability to integer
        yhat = argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

The function below named evaluate_model() will evaluate a trained model against a given dataset of photo descriptions and photo features. The actual and predicted descriptions are collected and evaluated collectively using the corpus BLEU score that summarizes how close the generated text is to the expected text.

In [16]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [17]:
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

BLEU scores are used in text translation for evaluating translated text against one or more reference translations.

Here, we compare each generated description against all of the reference descriptions for the photograph. We then calculate BLEU scores for 1, 2, 3 and 4 cumulative n-grams.

In [18]:
# prepare and load the test set
filename = './flicker8k-dataset/Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))

Dataset: 1000
Descriptions: test=1000
Photos: test=1000


In [19]:
# load the model
model = load_model('./models/Xception_model-ep004-loss3.239-val_loss3.481.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [37]:
# evaluate model
print('model-ep001')
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

model-ep001


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.447312
BLEU-2: 0.134160
BLEU-3: 0.051461
BLEU-4: 0.000000


In [35]:
# evaluate model
print('model-ep004')
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

model-ep004


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.426534
BLEU-2: 0.116206
BLEU-3: 0.026294
BLEU-4: 0.000000


In [28]:
evaluate_model(load_model('model_1.h5'), test_descriptions, test_features, tokenizer, max_length)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


BLEU-1: 0.514945
BLEU-2: 0.268975
BLEU-3: 0.179286
BLEU-4: 0.080684


In [29]:
evaluate_model(load_model('model_2.h5'), test_descriptions, test_features, tokenizer, max_length)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


BLEU-1: 0.501637
BLEU-2: 0.271278
BLEU-3: 0.181653
BLEU-4: 0.080465


In [30]:
evaluate_model(load_model('model_3.h5'), test_descriptions, test_features, tokenizer, max_length)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


BLEU-1: 0.495255
BLEU-2: 0.263729
BLEU-3: 0.176101
BLEU-4: 0.075213


In [31]:
evaluate_model(load_model('model_4.h5'), test_descriptions, test_features, tokenizer, max_length)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


BLEU-1: 0.495782
BLEU-2: 0.258507
BLEU-3: 0.171344
BLEU-4: 0.071539


In [32]:
evaluate_model(load_model('model_19.h5'), test_descriptions, test_features, tokenizer, max_length)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


BLEU-1: 0.498315
BLEU-2: 0.260570
BLEU-3: 0.174958
BLEU-4: 0.074751


In [24]:
print('Model:model-ep004-loss3.815-val_loss4.299.h5')
evaluate_model(load_model('model-ep004-loss3.815-val_loss4.299.h5'), test_descriptions, test_features, tokenizer, max_length)

Model:model-ep004-loss3.815-val_loss4.299.h5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


BLEU-1: 0.161576
BLEU-2: 0.082715
BLEU-3: 0.053656
BLEU-4: 0.016147


In [21]:
print('Xception_model-ep004-loss3.239-val_loss3.481')
evaluate_model(load_model('./models/Xception_model-ep004-loss3.239-val_loss3.481.h5'),test_descriptions, test_features, tokenizer, max_length)

Xception_model-ep004-loss3.239-val_loss3.481


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


ValueError: Error when checking input: expected input_1 to have shape (2048,) but got array with shape (4096,)