In [1]:
from keras import backend as K
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, GRU
from keras.layers.merge import add
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg19 import VGG19, preprocess_input
from keras.callbacks import ModelCheckpoint

from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

import numpy as np
import glob
import h5py
import string
import pickle

from os import listdir
from os.path import join, isdir, isfile, exists

Using TensorFlow backend.


In [2]:
meta_info = {
    'input_shape': {
        'VGG16': 4096, 'VGG19': 4096, 'ResNet50': 4096,
        'InceptionV3': 2048,
        'InceptionResNetV2': 1536
    },
    'n_embeddeing': 512,
    'text_dir': 'Flickr8k_text/',
    # It is better to input the value resulting from data processing.
    # Automatic input at the stage of preparing the data, without having to input it.
    'n_vocabs': None,
    'M': None
}

In [3]:
"""
Load data from dataset.
Use squeezing to make it easier to put into model.

X: image feature
Y: caption sequence
Z: caption sequence next word
"""
def load_data(dataset):
    if dataset == 'train':
        data_file = 'Flickr8k_text/Flickr_8k.trainImages.txt'
        features_file = 'train_features.h5'
        sequences_file = 'train_sequences.h5'
        next_word_file = 'train_next_word.h5'
    elif dataset == 'dev':
        data_file = 'Flickr8k_text/Flickr_8k.devImages.txt'
        features_file = 'dev_features.h5'
        sequences_file = 'dev_sequences.h5'
        next_word_file = 'dev_next_word.h5'
    elif dataset == 'test':
        data_file = 'Flickr8k_text/Flickr_8k.testImages.txt'
        features_file = 'test_features.h5'
        sequences_file = 'test_sequences.h5'
        next_word_file = 'test_next_word.h5'

    features = h5py.File(features_file, 'r')
    sequences = h5py.File(sequences_file, 'r')
    next_word = h5py.File(next_word_file, 'r')

    X = list()
    Y = list()
    Z = list()

    with open(data_file, 'r') as f:
        contents = f.read()
        c = 0
        for line in contents.split('\n'):
            if line == '': # last line or error line
                print(c)
                continue

            file = line.split('.')[0]

            x = features[file][:]
            y = sequences[file][:]
            z = next_word[file][:]

            for i in range(len(y)):
                if c % 10000 == 0:
                    print(c)
                X.append(x)
                Y.append(y[i])
                Z.append(z[i])
                c += 1
    features.close()
    sequences.close()
    next_word.close()

    return np.array(X).squeeze(), np.array(Y).squeeze(), np.array(Z).squeeze()

In [4]:
x_dev, y_dev, z_dev = load_data('dev')

0
10000
20000
30000
40000
50000
58661


In [5]:
# Data load test
k = 0
print(x_dev.shape)
print(x_dev[k].shape)
print(y_dev.shape)
print(y_dev[k].shape)
print(z_dev.shape)
print(z_dev[k].shape)

print(y_dev[k])
print(z_dev[k][:10])

(58661, 4096)
(4096,)
(58661, 36)
(36,)
(58661, 7277)
(7277,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [6]:
# Fill in meta info
meta_info['M'] = y_dev.shape[1]
meta_info['n_vocabs'] = z_dev.shape[1]
print(meta_info)

{'n_embeddeing': 512, 'M': 36, 'n_vocabs': 7277, 'input_shape': {'VGG19': 4096, 'InceptionV3': 2048, 'InceptionResNetV2': 1536, 'VGG16': 4096, 'ResNet50': 4096}, 'text_dir': 'Flickr8k_text/'}


In [7]:
"""
I have tried various layer settings and optimizers, but the following model was the best.
The paper refers to the LSTM size is set to 512,
but in my experiment, the bleu score and learning speed are better when the LSTM size is 256 or 384.
Adam optimizer is the best performance.
"""
def make_model(n_lstm_units):
    # input1, input2 are encoder
    # Image feature
    input1 = Input(shape=(meta_info['input_shape']['VGG19'],))
    dropout1 = Dropout(0.5)(input1)
    fc1 = Dense(n_lstm_units, activation='relu')(dropout1)
    
    # Caption
    input2 = Input(shape=(meta_info['M'],))
    # In this paper, specified embedding vector size as 512.
    embedded_layer1 = Embedding(meta_info['n_vocabs'], meta_info['n_embeddeing'], mask_zero=True)(input2)
    dropout2 = Dropout(0.5)(embedded_layer1)
    lstm1 = LSTM(n_lstm_units)(dropout2)
    
    # Decoder
    fc2 = add([fc1, lstm1])
    fc3 = Dense(n_lstm_units, activation='relu')(fc2)
    outputs = Dense(meta_info['n_vocabs'], activation='softmax')(fc3)
    
    # Inputs are X, Y, and ouput is Z
    model = Model(inputs=[input1, input2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [8]:
# BLEU score test
# ref: https://stackoverflow.com/questions/40542523/nltk-corpus-level-bleu-vs-sentence-level-bleu-score

def bleu_score_test(model, test_captions, features, tokenizer):
    y_true, y_pred = list(), list()
    # step over the whole set
    c = 0
    for img_id, captions in test_captions.items():
        if c % 100 == 0:
            print(c)
        # gererate caption
        generated = generate_caption(model, tokenizer, features[img_id])
        word_true = [caption.split() for caption in captions]
        y_true.append(word_true)
        y_pred.append(generated.split())
        c += 1
    # BLEU score test
    print('BLEU-1: %f' % corpus_bleu(y_true, y_pred, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(y_true, y_pred, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(y_true, y_pred, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(y_true, y_pred, weights=(0.25, 0.25, 0.25, 0.25)))

    # sentence_bleu equal score BLEU-4
#     print('BLEU-sentence: %f' % sentence_bleu(y_true, y_pred)

### Inference by using the sampling method.(or k=1 Beam search)
#### 

In [9]:
"""
Experimented with k=5 beam search, but the BLEU score is not much better from k=1.
Also when k=1, the implementation is very simple.
"""
# Generate caption from input feature
def generate_caption(model, tokenizer, feature):
    # start sign
    generated = '[CLS]'
    # Loop for max length or end sign('[SEP]')
    for i in range(meta_info['M']):
        sequence = tokenizer.texts_to_sequences([generated])[0]
        sequence = pad_sequences([sequence], maxlen=meta_info['M'])
        # predict next word
        y_pred = model.predict([feature, sequence], verbose=0)
        y_pred = np.argmax(y_pred)

        word_pred = '[SEP]'
        for word, i in tokenizer.word_index.items():
            if i == y_pred:
                word_pred = word 
        # Generate sentence
        generated += ' ' + word_pred
        # If end sign, break
        if word_pred == '[SEP]':
            break
    return generated

In [10]:
x_train, y_train, z_train = load_data('train')
print(x_train.shape)
print(y_train.shape)
print(z_train.shape)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
352425
(352425, 4096)
(352425, 36)
(352425, 7277)


In [11]:
model = make_model(256)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 36)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 36, 512)      3725824     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)  

In [12]:
filepath = 'model.ep{epoch:02d}.val_acc{val_acc:.4f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

history = model.fit([x_train, y_train], z_train, epochs=20, batch_size=1024, verbose=1, callbacks=[checkpoint], validation_data=([x_dev, y_dev], z_dev))

Instructions for updating:
Use tf.cast instead.
Train on 352425 samples, validate on 58661 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.34120, saving model to model.ep01.val_acc0.3412.h5
Epoch 2/20

Epoch 00002: val_acc improved from 0.34120 to 0.36793, saving model to model.ep02.val_acc0.3679.h5
Epoch 3/20

Epoch 00003: val_acc improved from 0.36793 to 0.37422, saving model to model.ep03.val_acc0.3742.h5
Epoch 4/20

Epoch 00004: val_acc improved from 0.37422 to 0.38329, saving model to model.ep04.val_acc0.3833.h5
Epoch 5/20

Epoch 00005: val_acc improved from 0.38329 to 0.38404, saving model to model.ep05.val_acc0.3840.h5
Epoch 6/20

Epoch 00006: val_acc improved from 0.38404 to 0.38530, saving model to model.ep06.val_acc0.3853.h5
Epoch 7/20

Epoch 00007: val_acc did not improve from 0.38530
Epoch 8/20

Epoch 00008: val_acc did not improve from 0.38530
Epoch 9/20

Epoch 00009: val_acc improved from 0.38530 to 0.38552, saving model to model.ep09.val_acc0.3855.h5
Epo

In [13]:
# Load to tokenizer
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
print(len(tokenizer.word_index))

7276


In [14]:
captions = dict()
words = set()

with open(join(meta_info['text_dir'], 'Flickr8k.token.txt')) as f:
    contents = f.read()
    n_captions = 0
    for line in contents.split('\n'):
        if line == '':
            print(n_captions)
            continue
        if n_captions % 10000 == 0:
            print(n_captions)
        
        file, caption = line.split('\t')
        
        table = str.maketrans('', '', string.punctuation)
        
        caption2 = []
        for word in caption.split():
            if word.isalpha():
                caption2.append(word.translate(table))
        caption = ' '.join(caption2)
        
        img_id = file.split('.')[0]
        
        if img_id in captions.keys():
            captions[img_id].append(caption)
        else:
            captions[img_id] = [caption]
        n_captions += 1

        [words.add(word) for word in caption.split()]
        
print('number of images: %d' % len(captions))
print('number of catpions: %d' % n_captions)
print('number of words: %d' % len(words))

0
10000
20000
30000
40000
40460
number of images: 8092
number of catpions: 40460
number of words: 9068


In [15]:
# train set caption test
print(captions['2513260012_03d33305cf'])
# dev set caption test
print(captions['2090545563_a4e66ec76b'])
# test set caption test
print(captions['3385593926_d3e9c21170'])

['A black dog is running after a white dog in the snow', 'Black dog chasing brown dog through snow', 'Two dogs chase each other across the snowy ground', 'Two dogs play together in the snow', 'Two dogs running through a low lying body of water']
['the boy laying face down on a skateboard is being pushed along the ground by another boy', 'Two girls play on a skateboard in a courtyard', 'Two people play on a long skateboard', 'Two small children in red shirts playing on a skateboard', 'two young children on a skateboard going across a sidewalk']
['The dogs are in the snow in front of a fence', 'The dogs play on the snow', 'Two brown dogs playfully fight in the snow', 'Two brown dogs wrestle in the snow', 'Two dogs playing in the snow']


In [16]:
test_captions = dict()

with open('Flickr8k_text/Flickr_8k.testImages.txt', 'r') as f:
    contents = f.read()
    n_captions = 0
    for line in contents.split('\n'):
        if line == '':
            print(n_captions)
            continue
        if n_captions % 1000 == 0:
            print(n_captions)
            
        file = line
        img_id = file.split('.')[0]
        test_captions[img_id] = []

        for caption in captions[img_id]:
            caption = '[CLS] ' + caption + ' [SEP]'
            caption = caption.replace('\n', '')
            test_captions[img_id].append(caption)
        n_captions += len(captions[img_id])
        
print('number of images: %d' % len(test_captions))
print('number of catpions: %d' % n_captions)

0
1000
2000
3000
4000
5000
number of images: 1000
number of catpions: 5000


In [17]:
# test set caption test
print(test_captions['3385593926_d3e9c21170'])

['sssss The dogs are in the snow in front of a fence eeeee', 'sssss The dogs play on the snow eeeee', 'sssss Two brown dogs playfully fight in the snow eeeee', 'sssss Two brown dogs wrestle in the snow eeeee', 'sssss Two dogs playing in the snow eeeee']


In [18]:
# BLEU scores test for base model
test_features_file = 'test_features.h5'
model_file = 'model.ep06.val_acc0.3853.h5'
with h5py.File(test_features_file, 'r') as h5f:
    test_features = h5f
    model = load_model(model_file)
    bleu_score_test(model, test_captions, test_features, tokenizer)

0
100
200
300
400
500
600
700
800
900
BLEU-1: 0.594751
BLEU-2: 0.361690
BLEU-3: 0.258544
BLEU-4: 0.138500


In [19]:
x_test, y_test, z_test = load_data('test')
print(x_test.shape)
print(y_test.shape)
print(z_test.shape)

0
10000
20000
30000
40000
50000
58389
(58389, 4096)
(58389, 36)
(58389, 7277)


In [20]:
filepath = 'transfer.model.ep{epoch:03d}.acc{acc:.4f}.h5'
checkpoint = ModelCheckpoint(filepath, verbose=1)

model = load_model('model.ep06.val_acc0.3853.h5')
# transfer learning, train -> dev, dev -> test
history = model.fit([x_dev, y_dev], z_dev, epochs=1, batch_size=1024, verbose=1, callbacks=[checkpoint], validation_data=([x_test, y_test], z_test))

Train on 58661 samples, validate on 58389 samples
Epoch 1/1

Epoch 00001: saving model to transfer.model.ep001.acc0.3872.h5


In [21]:
test_features_file = 'test_features.h5'
model_file = 'transfer.model.ep001.acc0.3872.h5'
with h5py.File(test_features_file, 'r') as h5f:
    test_features = h5f
    model = load_model(model_file)
    bleu_score_test(model, test_captions, test_features, tokenizer)

0
100
200
300
400
500
600
700
800
900
BLEU-1: 0.611284
BLEU-2: 0.370745
BLEU-3: 0.262829
BLEU-4: 0.136760
