In [1]:
from configs.default import _C as config # this means configs/default.py and _C is inside that python file
from configs.default import update_config # likewise the function update_config

from datasets import flickr8k_parse

from keras import backend as K
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import RMSprop
from keras.backend.tensorflow_backend import set_session
from models import batch_generator, decoder
from keras.utils.vis_utils import plot_model

import json
import numpy as np
import os
import pandas as pd
import path_generation
import tensorflow as tf
import text_processing
import time

Using TensorFlow backend.


In [2]:
# Diagnostic cell: list the compute devices TensorFlow can see in this environment,
# to check whether a GPU is available before training.
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 9629155014729828496]

In [3]:
# Experiment-specific YAML file that is applied on top of the default configuration.
config_file = "./configs/attn.yaml"
update_config(config, config_file) # config is imported from default.py _C above
# most parameters are in default.py, parameters in attn.yaml overwrite these defaults

# Decoder

### Captions encoding

Before building the decoder, the captions must be encoded as one-hot vectors, which are then used by the embedding layer.

### COCO dataset

In [4]:
if config.DATASET == 'Coco':
    # The import cell only brings in flickr8k_parse, so every coco_parse call below
    # would otherwise raise NameError. The import is kept local to this branch so
    # that Flickr8k-only runs are unaffected.
    # NOTE(review): assumes coco_parse lives in the `datasets` package next to
    # flickr8k_parse -- confirm the module path.
    from datasets import coco_parse

    # Names of the .npy feature files loaded later from config.PATH.FEATURES_PATH;
    # the "_attn" variants are selected when attention is enabled.
    if config.ATTENTION:
        features_file_train = "vgg16_coco_train_attn.npy"
        features_file_val = "vgg16_coco_val_attn.npy"
    else:
        features_file_train = "vgg16_coco_train.npy"
        features_file_val = "vgg16_coco_val.npy"

    # Validation split: (filename, caption) pairs, then grouped per image.
    val_filenames_with_captions = coco_parse.get_image_filename_with_caption(config.PATH.ANNOTATIONS_PATH,
                                                                             config.PATH.IMG_PATH,
                                                                             train=False)
    val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(val_filenames_with_captions)

    # Training split, built the same way.
    train_filenames_with_captions = coco_parse.get_image_filename_with_caption(config.PATH.ANNOTATIONS_PATH,
                                                                               config.PATH.IMG_PATH,
                                                                               train=True)
    train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(train_filenames_with_captions)

    ### Extract captions
    train_captions = coco_parse.make_list_of_captions(train_filenames_with_all_captions)
    val_captions = coco_parse.make_list_of_captions(val_filenames_with_all_captions)

### Flickr8k dataset

In [4]:
if config.DATASET == 'Flickr8k':
    # Names of the .npy feature files loaded later from config.PATH.FEATURES_PATH;
    # the "_attn" variants are selected when attention is enabled.
    features_file_train = ("vgg16_flickr8k_train_attn.npy" if config.ATTENTION
                           else "vgg16_flickr8k_train.npy")
    features_file_val = ("vgg16_flickr8k_val_attn.npy" if config.ATTENTION
                         else "vgg16_flickr8k_val.npy")

    # Caption file and the three split listings all sit in the annotations directory.
    annotations_dir = config.PATH.ANNOTATIONS_PATH
    captions_file = os.path.join(annotations_dir, "Flickr8k.token.txt")
    train_txt_path = os.path.join(annotations_dir, "Flickr_8k.trainImages.txt")
    dev_txt_path = os.path.join(annotations_dir, "Flickr_8k.devImages.txt")
    test_txt_path = os.path.join(annotations_dir, "Flickr_8k.testImages.txt")

    # Parse every caption once, then carve out the train / dev / test subsets.
    filenames_with_all_captions = flickr8k_parse.generate_filenames_with_all_captions(
        captions_file, config.PATH.IMG_PATH)
    train_filenames_with_all_captions = flickr8k_parse.generate_set(
        train_txt_path, filenames_with_all_captions, config.PATH.IMG_PATH)
    val_filenames_with_all_captions = flickr8k_parse.generate_set(
        dev_txt_path, filenames_with_all_captions, config.PATH.IMG_PATH)
    test_filenames_with_all_captions = flickr8k_parse.generate_set(
        test_txt_path, filenames_with_all_captions, config.PATH.IMG_PATH)

    # train_captions / val_captions hold only the caption strings; the
    # *_filenames_with_all_captions structures also carry the jpg filenames.
    train_captions = flickr8k_parse.make_list_of_captions(train_filenames_with_all_captions)
    val_captions = flickr8k_parse.make_list_of_captions(val_filenames_with_all_captions)

In [5]:
# Sanity check of the validation captions: row count, captions per row, first five rows.
print('number of rows in val_captions', len(val_captions),
      '\n number of captions in each row', len(val_captions[0]),
      '\n val_captions[0:5]', val_captions[0:5])

number of rows in val_captions 1000 
 number of captions in each row 5 
 val_captions[0:5] [['the boy laying face down on a skateboard is being pushed along the ground by another boy .', 'Two girls play on a skateboard in a courtyard .', 'Two people play on a long skateboard .', 'Two small children in red shirts playing on a skateboard .', 'two young children on a skateboard going across a sidewalk'], ['a boy in a blue top is jumping off some rocks in the woods .', 'A boy jumps off a tan rock .', 'A boy jumps up in a field in the woods .', 'A young boy jumps off a rock in the forest', 'Child in blue and grey shirt jumping off hill in the woods'], ['A lady walking her dog through an obstacle course , while other people are in the background .', 'A small tan and white dog and trainer running an obstacle course', 'A woman is guiding a brown dog around an obstacle course', 'A woman with a hat is leading a small dog through an obstacle course .', 'The woman is leading a dog through an obsta

In [6]:
### Preprocess captions
# The return values are discarded and later cells keep using val_captions /
# train_captions, so preprocess_captions presumably rewrites the nested caption
# lists in place -- TODO confirm in text_processing. Re-running this cell would
# then process already-processed captions.
text_processing.preprocess_captions(val_captions)
text_processing.preprocess_captions(train_captions)

In [7]:
### Add markers of captions' starts and ends
# As with preprocessing, the results are not assigned, so the lists are presumably
# modified in place -- TODO confirm. Running this cell twice would likely add the
# markers twice.
text_processing.add_start_and_end_to_captions(train_captions)
text_processing.add_start_and_end_to_captions(val_captions)

In [8]:
### Create vocabulary from the training captions
# Only the training split feeds the vocabulary; each caption is split on
# whitespace and every resulting word is registered with the Vocabulary object.
train_vocab = text_processing.Vocabulary()
for caption_list in train_captions:
    for caption in caption_list:
        for word in caption.split():
            train_vocab.add_word(word)

In [9]:
# Make sure the vocabulary directory exists before saving. makedirs(exist_ok=True)
# also creates missing parent directories and does not fail if the directory
# appears between an existence check and the creation call.
os.makedirs(config.PATH.VOCABULARY_PATH, exist_ok=True)
# Persist the vocabulary to the three configured files (word->id, id->word, counts).
train_vocab.save_vocabulary(config.VOCABULARY.WORD_TO_ID, config.VOCABULARY.ID_TO_WORD, config.VOCABULARY.COUNT)

In [10]:
# Convert caption strings to token sequences. Both splits are tokenised with the
# vocabulary built from the training captions.
train_captions_tokens = text_processing.tokenise_captions(train_captions, train_vocab)
val_captions_tokens = text_processing.tokenise_captions(val_captions, train_vocab)

In [11]:
# Debug inspection: compare the token ids of the first training image with its captions.
print('train_captions_tokens[0]',train_captions_tokens[0])
print(train_captions[0])
# Number of images, captions for the first image, tokens in its first caption.
print('len(train_captions_tokens)',len(train_captions_tokens),len(train_captions_tokens[0]),len(train_captions_tokens[0][0]))
# Note: the second value is the character length of the first caption string, not a word count.
print('len(train_captions)',len(train_captions),len(train_captions[0][0]))

train_captions_tokens[0] [[1, 2, 3, 4, 5, 6, 7, 2, 8, 4, 9, 10, 11, 12], [1, 3, 4, 13, 14, 4, 15, 11, 12], [1, 16, 17, 18, 19, 20, 21, 10, 22, 23, 12], [1, 16, 17, 24, 25, 9, 10, 11, 12], [1, 16, 17, 6, 15, 2, 26, 27, 28, 29, 30, 12]]
['<sos> a black dog is running after a white dog in the snow <eos>', '<sos> black dog chasing brown dog through snow <eos>', '<sos> two dogs chase each other across the snowy ground <eos>', '<sos> two dogs play together in the snow <eos>', '<sos> two dogs running through a low lying body of water <eos>']
len(train_captions_tokens) 6000 5 14
len(train_captions) 6000 64


In [12]:
# Display the token sequences for the first training image.
train_captions_tokens[0]
# train_captions[0]

[[1, 2, 3, 4, 5, 6, 7, 2, 8, 4, 9, 10, 11, 12],
 [1, 3, 4, 13, 14, 4, 15, 11, 12],
 [1, 16, 17, 18, 19, 20, 21, 10, 22, 23, 12],
 [1, 16, 17, 24, 25, 9, 10, 11, 12],
 [1, 16, 17, 6, 15, 2, 26, 27, 28, 29, 30, 12]]

### Decoder NN

### GRU

In [13]:
# Build the output paths for this experiment from the configuration values.
# The arguments are positional, so their order must match PathGenerator's signature.
path_gen = path_generation.PathGenerator(config.DECODER.GRU, # note that _C in default.py has been imported as config.
                                         config.DATASET, 
                                         config.DECODER.NUM_RNN_LAYERS, 
                                         config.DECODER.BATCH_SIZE, 
                                         config.DECODER.BATCH_NORM, 
                                         config.DECODER.DROPOUT, 
                                         config.ATTENTION, 
                                         config.DECODER.ATTN_TYPE)

# Paths used below: checkpoint weights file, model-architecture JSON, training-history CSV.
path_checkpoint = path_gen.get_weights_path()
model_path = path_gen.get_model_path()
callbacks_path = path_gen.get_callbacks_path()

In [14]:
# Resolve the feature file names chosen in the dataset cell against the features directory.
features_file_train_path = os.path.join(config.PATH.FEATURES_PATH, features_file_train)
features_file_val_path = os.path.join(config.PATH.FEATURES_PATH, features_file_val)

# Training image features, later fed to the decoder and the batch generator.
transfer_values = np.load(features_file_train_path) # loads from .npy or .npz file
print('transfer_values.shape',transfer_values.shape)
# Debug peek at sample 121; the 0:45 slice is clipped to the size of the second axis.
print('transfer_values[121,0:45]',transfer_values[121,0:45])
# Validation image features.
val_transfer_values = np.load(features_file_val_path)
print('val_transfer_values.shape',val_transfer_values.shape)

transfer_values.shape (6000, 14, 14, 512)
transfer_values[121,0:45] [[[0.05983141 0.         0.         ... 0.         0.7467803  0.        ]
  [0.10442781 0.         0.         ... 0.         0.58881134 0.        ]
  [0.00977734 0.         0.         ... 0.         0.681288   0.        ]
  ...
  [0.         0.         0.         ... 0.         0.71781266 0.        ]
  [0.04952595 0.         0.         ... 0.         0.79048127 0.        ]
  [0.13595301 0.         0.         ... 0.         0.7541306  0.        ]]

 [[0.         0.         0.         ... 0.         0.70611817 0.        ]
  [0.         0.         0.         ... 0.         0.36244947 0.        ]
  [0.         0.         0.         ... 0.         0.52016246 0.        ]
  ...
  [0.         0.         0.         ... 0.         0.5099765  0.        ]
  [0.         0.         0.         ... 0.         0.5016797  0.        ]
  [0.         0.         0.         ... 0.         0.6678728  0.        ]]

 [[0.         0.         0. 

In [15]:
# With attention enabled, collapse the two middle axes of the feature arrays into one,
# e.g. (N, 14, 14, 512) -> (N, 196, 512). Using shape[1] ** 2 assumes axes 1 and 2 have
# the same size. Not idempotent: re-running on the reshaped arrays would fail or
# mis-shape them, so reload the features first.
if config.ATTENTION:
    print(transfer_values.shape)
    transfer_values = transfer_values.reshape(len(train_filenames_with_all_captions), transfer_values.shape[1] ** 2, -1)
    val_transfer_values = val_transfer_values.reshape(len(val_filenames_with_all_captions), val_transfer_values.shape[1] ** 2, -1)
    print(transfer_values.shape)

(6000, 14, 14, 512)
(6000, 196, 512)


In [16]:
# Configure the decoder from the config values, the training features and the vocabulary.
# The arguments are positional, so their order must match decoder.Decoder's signature.
decoder_model = decoder.Decoder(config.DECODER.INITIAL_STATE_SIZE, # these are declared in configs > default.py
                                config.DECODER.EMBEDDING_OUT_SIZE,
                                config.DECODER.NUM_RNN_LAYERS,
                                config.DECODER.GRU,
                                config.DECODER.BATCH_NORM,
                                config.DECODER.DROPOUT,
                                config.ATTENTION,
                                config.DECODER.ATTN_TYPE,
                                transfer_values,
                                train_vocab)
# The name is rebound here: from this point decoder_model is the object returned by
# build_model() (used below as a Keras model), not the Decoder builder.
decoder_model = decoder_model.build_model()
# plot_model(decoder_model, to_file='decoder_model.png')
# print('decoder_model.summary()',decoder_model.summary())





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [17]:
# Create the training and validation batch generators. The two branches differ only in
# that the non-GRU branch passes gru=config.DECODER.GRU explicitly.
# NOTE(review): the GRU branch relies on generate_batch's default for `gru`
# (presumably True) -- confirm in models/batch_generator.
if config.DECODER.GRU:
    generator = batch_generator.generate_batch(transfer_values, 
                                               train_captions_tokens, 
                                               number_of_words=train_vocab.number_of_words, 
                                               batch_size=config.DECODER.BATCH_SIZE)
    val_generator = batch_generator.generate_batch(val_transfer_values, 
                                                   val_captions_tokens, 
                                                   number_of_words=train_vocab.number_of_words, 
                                                   batch_size=config.DECODER.BATCH_SIZE)
else:
    generator = batch_generator.generate_batch(transfer_values, 
                                               train_captions_tokens, 
                                               number_of_words=train_vocab.number_of_words, 
                                               batch_size=config.DECODER.BATCH_SIZE, 
                                               gru=config.DECODER.GRU)
    
    val_generator = batch_generator.generate_batch(val_transfer_values, 
                                                   val_captions_tokens, 
                                                   number_of_words=train_vocab.number_of_words, 
                                                   batch_size=config.DECODER.BATCH_SIZE, 
                                                   gru=config.DECODER.GRU)              

In [18]:
# Build the optimizer used to compile the decoder. RMSprop is the only optimizer
# this notebook knows how to construct.
if config.DECODER.OPTIMIZER:
    optimizer = RMSprop(lr=config.DECODER.LR, decay=config.DECODER.DECAY)
else:
    # Fail here with a clear message rather than letting the compile cell hit a
    # NameError, or silently reuse an `optimizer` left over from an earlier run.
    raise ValueError("config.DECODER.OPTIMIZER is not set; no optimizer was created")

In [19]:
# Compile the decoder with the optimizer built above and the configured loss.
decoder_model.compile(optimizer=optimizer,
                      loss=config.DECODER.LOSS)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
decoder_model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 196, 512)     0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 512)          0           encoder_input[0][0]              
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 512)          262656      lambda_1[0][0]                   
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 512)          2048        dense_5[0][0]                    
__________________________________________________________________________________________________
lambda_3

In [20]:
# Serialise the model architecture (not the weights) to JSON.
model_json = decoder_model.to_json()
# Ensure the architecture directory exists. exist_ok=True tolerates an existing
# directory without a bare `except`, which previously reported every failure
# (permissions, missing parent, typos) as "folder already exists".
os.makedirs(config.PATH.MODELS_ARCHITECTURE_PATH, exist_ok=True)
# Round-trip through json.loads so the file is written pretty-printed.
with open(model_path, "w") as json_file:
    json.dump(json.loads(model_json), json_file, indent=4)

The folder already exists


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They 

### Checkpoints

During the training process, it is a good idea to save the weights periodically.

In [21]:
# Make sure the directory that will receive the checkpoint file exists.
# The previous code called os.mkdir(configs.WEIGHTS_PATH), but `configs` is never
# bound as a name in this notebook, so it always raised NameError; the bare
# `except` hid that and the directory was never created. The directory is now
# taken from the checkpoint path itself.
weights_dir = os.path.dirname(path_checkpoint)
if weights_dir:  # dirname is '' when the checkpoint is written to the working directory
    os.makedirs(weights_dir, exist_ok=True)

# Save the weights (weights only, not the full model) to path_checkpoint during training.
checkpoints = ModelCheckpoint(path_checkpoint,
                              verbose=config.DECODER.VERBOSE,
                              save_weights_only=True,
                              save_best_only=config.DECODER.SAVE_BEST)

# Lower the learning rate when the monitored metric stops improving.
reduce_lr = ReduceLROnPlateau(monitor=config.DECODER.MONITOR,
                              factor=config.DECODER.FACTOR,
                              patience=config.DECODER.PATIENCE,
                              verbose=config.DECODER.VERBOSE,
                              min_lr=config.DECODER.MIN_LR)

The folder already exists


In [22]:
# Configure the TensorFlow session so that GPU memory is allocated on demand
# (allow_growth) and register it as the Keras backend session.
tf_configuration = tf.ConfigProto()
tf_configuration.gpu_options.allow_growth = True
set_session(tf.Session(config=tf_configuration))
# Wall-clock timer around the whole training run.
start = time.time()
# Train from the generators. steps_per_epoch is the number of whole batches per pass
# over the training images (any remainder is dropped by int()).
# Despite its name, `callbacks` holds the return value of fit_generator, whose
# `.history` attribute is saved to CSV in the last cell.
callbacks = decoder_model.fit_generator(generator=generator,
                                        steps_per_epoch=int(len(train_filenames_with_all_captions) / config.DECODER.BATCH_SIZE),
                                        epochs=config.DECODER.EPOCHS,
                                        callbacks=[checkpoints, reduce_lr],
                                        validation_data=val_generator,
                                        validation_steps=config.DECODER.VAL_STEPS)
time_train = time.time() - start

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/20
transfer_values1.shape (1000, 196, 512)
len(captions_tokens) transfer_values1.shape (6000, 196, 512)
len(captions_tokens) 6000
len(captions_tokens[0]) 5
1000
len(captions_tokens[0]) 5





Epoch 00001: val_loss improved from inf to 1.85448, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_attn_bahdanau.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 1.85448 to 1.59383, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_attn_bahdanau.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 1.59383 to 1.43705, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_attn_bahdanau.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 1.43705 to 1.40360, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_attn_bahdanau.hdf5
Epoch 5/20

Epoch 00005: val_loss did not improve from 1.40360
Epoch 6/20

Epoch 00006: val_loss improved from 1

In [25]:
# Report the wall-clock training time measured in the training cell.
print("Time for training: {} seconds".format(time_train))

Time for training: 14048.330514669418 seconds


In [26]:
# Ensure the history output directory exists. makedirs(exist_ok=True) also creates
# missing parents and avoids the check-then-create race of exists() + mkdir().
os.makedirs(config.PATH.CALLBACKS_PATH, exist_ok=True)
# One row per epoch, one column per recorded metric.
callback_df = pd.DataFrame(callbacks.history)
# index=False is the documented way to omit the row index (index=None only worked
# because None is falsy).
callback_df.to_csv(callbacks_path, index=False)