# Image encoding

Currently, an image encoder is built using VGG16 architecture pre-trained on imagenet database.

The features were obtained from the "fc2" layer - last fully-connected layer before the predictions layer.

Generated features in numpy arrays for both training and validation datasets were saved to .npy files.

In [1]:
from configs.default import _C as config # SYNTAX N.B. this imports configs > default.py 
from configs.default import update_config

from datasets import coco_parse
from datasets import flickr8k_parse

from keras import Model 
from keras.applications import VGG16
from tensorflow.python.client import device_lib

from models import image_preprocessing, transfer_models

import numpy as np
import os
import time

Using TensorFlow backend.


## Load the current configuration parameters.

In [2]:
config_file = "./configs/attn.yaml"
update_config(config, config_file)

## COCO dataset is not currently supported. You can ignore this cell.

In [3]:
if config.DATASET == 'Coco':
    if config.ATTENTION:
        features_file_train = "vgg16_coco_train_attn.npy"
        features_file_val = "vgg16_coco_val_attn.npy"
        features_file_test = "vgg16_coco_test_attn.npy"
    else:
        features_file_train = "vgg16_coco_train.npy"
        features_file_val = "vgg16_coco_val.npy"
        features_file_test = "vgg16_coco_test.npy"
    
    
    val_filenames_with_captions = coco_parse.get_image_filename_with_caption(config.PATH.ANNOTATIONS_PATH, 
                                                                             config.PATH.IMG_PATH, 
                                                                             train=False)

    val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(val_filenames_with_captions)

    train_filenames_with_captions = coco_parse.get_image_filename_with_caption(config.PATH.ANNOTATIONS_PATH, 
                                                                               config.PATH.IMG_PATH,
                                                                               train=True)
    train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(train_filenames_with_captions)

### Flickr8k dataset

In [3]:
if config.DATASET == 'Flickr8k':
    print("config.PATH.ANNOTATIONS_PATH",config.PATH.ANNOTATIONS_PATH)
    captions_file = os.path.join(config.PATH.ANNOTATIONS_PATH, "Flickr8k.token.txt")
    train_txt_path = os.path.join(config.PATH.ANNOTATIONS_PATH, "Flickr_8k.trainImages.txt")
    dev_txt_path = os.path.join(config.PATH.ANNOTATIONS_PATH, "Flickr_8k.devImages.txt")
    test_txt_path = os.path.join(config.PATH.ANNOTATIONS_PATH, "Flickr_8k.testImages.txt")
    
    if config.ATTENTION: # these are output files in numpy save binary format
        features_file_train = "vgg16_flickr8k_train_attn.npy"
        features_file_val = "vgg16_flickr8k_val_attn.npy"
        features_file_test = "vgg16_flickr8k_test_attn.npy"
    else:
        features_file_train = "vgg16_flickr8k_train.npy"
        features_file_val = "vgg16_flickr8k_val.npy"
        features_file_test = "vgg16_flickr8k_test.npy"

    filenames_with_all_captions = flickr8k_parse.generate_filenames_with_all_captions(captions_file, 
                                                                                      config.PATH.IMG_PATH)

# these are three dictionaries that are subsets of filenames_with_all_captions dictionary
# they are created based on the jpg names in Flickr_8k.trainImages.txt, etc.

    train_filenames_with_all_captions = flickr8k_parse.generate_set(train_txt_path, 
                                                                    filenames_with_all_captions,
                                                                    config.PATH.IMG_PATH)
    val_filenames_with_all_captions = flickr8k_parse.generate_set(dev_txt_path, 
                                                                  filenames_with_all_captions, 
                                                                  config.PATH.IMG_PATH)
    test_filenames_with_all_captions = flickr8k_parse.generate_set(test_txt_path, 
                                                                   filenames_with_all_captions, 
                                                                   config.PATH.IMG_PATH)

config.PATH.ANNOTATIONS_PATH ./datasets/Flickr8k/annotations/


# this cell is for testing only

In [4]:
print('Number of images in validation dataset: {}'.format(len(val_filenames_with_all_captions)))
print('Number of images in training dataset: {}'.format(len(train_filenames_with_all_captions)))
n=15
n1=-1
# [print(v) for i, v in enumerate(val_filenames_with_all_captions.items()) if i < n]
for (key, values) in val_filenames_with_all_captions.items():
    for value in values:
# prints key (filename) over and over with successive captions
        n1+=1
        if n1 < n:
            print (n1,key,value)

Number of images in validation dataset: 1000
Number of images in training dataset: 6000
0 ./datasets/Flickr8k/Images/2090545563_a4e66ec76b.jpg the boy laying face down on a skateboard is being pushed along the ground by another boy .
1 ./datasets/Flickr8k/Images/2090545563_a4e66ec76b.jpg Two girls play on a skateboard in a courtyard .
2 ./datasets/Flickr8k/Images/2090545563_a4e66ec76b.jpg Two people play on a long skateboard .
3 ./datasets/Flickr8k/Images/2090545563_a4e66ec76b.jpg Two small children in red shirts playing on a skateboard .
4 ./datasets/Flickr8k/Images/2090545563_a4e66ec76b.jpg two young children on a skateboard going across a sidewalk
5 ./datasets/Flickr8k/Images/3393035454_2d2370ffd4.jpg a boy in a blue top is jumping off some rocks in the woods .
6 ./datasets/Flickr8k/Images/3393035454_2d2370ffd4.jpg A boy jumps off a tan rock .
7 ./datasets/Flickr8k/Images/3393035454_2d2370ffd4.jpg A boy jumps up in a field in the woods .
8 ./datasets/Flickr8k/Images/3393035454_2d237

In [5]:
### encode features for validation images
start = time.time()
printswitch=False
val_transfer_values = transfer_models.use_pretrained_model_for_images(val_filenames_with_all_captions,
                                                                      config.ATTENTION,printswitch, 
                                                                      batch_size=config.ENCODER.BATCH_SIZE)
time_val = time.time() - start

use_pretrained_model_for_image - filenames_with_all_captions ./datasets/Flickr8k/Images/2090545563_a4e66ec76b.jpg 
 ['the boy laying face down on a skateboard is being pushed along the ground by another boy .', 'Two girls play on a skateboard in a courtyard .', 'Two people play on a long skateboard .', 'Two small children in red shirts playing on a skateboard .', 'two young children on a skateboard going across a sidewalk']










_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None

filename ./datasets/Flickr8k/Images/3163068926_d28ed3ef53.jpg
40% of images processed
filename ./datasets/Flickr8k/Images/2598012140_832863fcb9.jpg
filename ./datasets/Flickr8k/Images/3154152744_4e93ec8a62.jpg
filename ./datasets/Flickr8k/Images/3331525712_af1dcc47f2.jpg
45% of images processed
filename ./datasets/Flickr8k/Images/1370773415_967b1ffde1.jpg
filename ./datasets/Flickr8k/Images/3623650392_7b75d4de21.jpg
filename ./datasets/Flickr8k/Images/1056249424_ef2a2e041c.jpg
50% of images processed
filename ./datasets/Flickr8k/Images/2248487956_2603f55ab9.jpg
filename ./datasets/Flickr8k/Images/2909081008_9a7bfc599a.jpg
filename ./datasets/Flickr8k/Images/3520869880_2e8b7d7842.jpg
55% of images processed
filename ./datasets/Flickr8k/Images/1346051107_9cdc14e070.jpg
filename ./datasets/Flickr8k/Images/425518464_a18b87c563.jpg
filename ./datasets/Flickr8k/Images/241346105_c1c860db0d.jpg
60% of images processed
filename ./datasets/Flickr8k/Images/3098336319_a7e5b061d0.jpg
filename ./dat

In [6]:
print('Validation dataset encoding took {:.1f} minutes'.format(time_val / 60))

Validation dataset encoding took 4.0 minutes


In [7]:
### save features for validation images
print("val_transfer_values",val_transfer_values)
transfer_models.save_features(val_transfer_values, config.PATH.FEATURES_PATH, features_file_val)

val_transfer_values [[[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    6.66217864e-01 0.00000000e+00]
   [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    3.17280471e-01 0.00000000e+00]
   [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    4.21574146e-01 0.00000000e+00]
   ...
   [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    1.48402369e+00 0.00000000e+00]
   [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    1.09586990e+00 0.00000000e+00]
   [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    7.49951601e-01 0.00000000e+00]]

  [[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    4.33700800e-01 0.00000000e+00]
   [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    0.00000000e+00 0.00000000e+00]
   [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    2.08732486e-01 0.00000000e+00]
   ...
   [0.00000000e+00 0.00000000e+00 0.0

Array was saved to ./cnn_features/vgg16_flickr8k_val_attn.npy


In [8]:
### encode features for training images
print("val_transfer_values.shape",val_transfer_values.shape)
start = time.time()
printswitch=False
train_transfer_values = transfer_models.use_pretrained_model_for_images(train_filenames_with_all_captions, 
                                                                        config.ATTENTION,printswitch, 
                                                                        batch_size=config.ENCODER.BATCH_SIZE)
time_train = time.time() - start

val_transfer_values.shape (1000, 14, 14, 512)
use_pretrained_model_for_image - filenames_with_all_captions ./datasets/Flickr8k/Images/2513260012_03d33305cf.jpg 
 ['A black dog is running after a white dog in the snow .', 'Black dog chasing brown dog through snow', 'Two dogs chase each other across the snowy ground .', 'Two dogs play together in the snow .', 'Two dogs running through a low lying body of water .']
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0

filename ./datasets/Flickr8k/Images/2075493556_b763648389.jpg
filename ./datasets/Flickr8k/Images/375384566_254c2362d4.jpg
filename ./datasets/Flickr8k/Images/253762507_9c3356c2f6.jpg
filename ./datasets/Flickr8k/Images/241345446_2e47ae8ddc.jpg
filename ./datasets/Flickr8k/Images/3389321512_b11f499dab.jpg
15% of images processed
filename ./datasets/Flickr8k/Images/162152393_52ecd33fc5.jpg
filename ./datasets/Flickr8k/Images/504385521_6e668691a3.jpg
filename ./datasets/Flickr8k/Images/3432495898_a5859f06b6.jpg
filename ./datasets/Flickr8k/Images/3210705660_2b14b7fb36.jpg
filename ./datasets/Flickr8k/Images/3332136681_9aecf101fd.jpg
filename ./datasets/Flickr8k/Images/798343627_7492fe0c12.jpg
filename ./datasets/Flickr8k/Images/2875583266_4da13ae12d.jpg
filename ./datasets/Flickr8k/Images/3098824948_23c31df031.jpg
filename ./datasets/Flickr8k/Images/3309578722_1765d7d1af.jpg
filename ./datasets/Flickr8k/Images/2856456013_335297f587.jpg
filename ./datasets/Flickr8k/Images/210839948_bbd5bf

filename ./datasets/Flickr8k/Images/3377344932_6dfce93248.jpg
50% of images processed
filename ./datasets/Flickr8k/Images/3271468462_701eb88d3b.jpg
filename ./datasets/Flickr8k/Images/3264397357_72f084cac1.jpg
filename ./datasets/Flickr8k/Images/1198194316_543cc7b945.jpg
filename ./datasets/Flickr8k/Images/99171998_7cc800ceef.jpg
filename ./datasets/Flickr8k/Images/2103361407_4ed4fc46bf.jpg
filename ./datasets/Flickr8k/Images/3401039304_424ffc7dbf.jpg
filename ./datasets/Flickr8k/Images/223299137_b0e81ac145.jpg
filename ./datasets/Flickr8k/Images/3182996527_70d9c323d5.jpg
filename ./datasets/Flickr8k/Images/2458006588_754c4aa09c.jpg
filename ./datasets/Flickr8k/Images/1311132744_5ffd03f831.jpg
filename ./datasets/Flickr8k/Images/374176648_ba4b88c221.jpg
filename ./datasets/Flickr8k/Images/997338199_7343367d7f.jpg
filename ./datasets/Flickr8k/Images/2992808092_5f677085b7.jpg
filename ./datasets/Flickr8k/Images/2943079526_e9033a6556.jpg
filename ./datasets/Flickr8k/Images/3468275336_6193

filename ./datasets/Flickr8k/Images/1413956047_c826f90c8b.jpg
filename ./datasets/Flickr8k/Images/539761097_5c6c70425b.jpg
filename ./datasets/Flickr8k/Images/3262647146_a53770a21d.jpg
filename ./datasets/Flickr8k/Images/2738255684_0324ed062d.jpg
filename ./datasets/Flickr8k/Images/3474999131_788cbf253f.jpg
filename ./datasets/Flickr8k/Images/2198964806_c57b0534d3.jpg
filename ./datasets/Flickr8k/Images/2310233145_910cb5b4c8.jpg
filename ./datasets/Flickr8k/Images/2999735171_87ca43c225.jpg
filename ./datasets/Flickr8k/Images/2064417101_3b9d817f4a.jpg
filename ./datasets/Flickr8k/Images/2100909581_b7dde5b704.jpg
filename ./datasets/Flickr8k/Images/103195344_5d2dc613a3.jpg
filename ./datasets/Flickr8k/Images/224702241_05af393148.jpg
filename ./datasets/Flickr8k/Images/3655326478_4472c5c630.jpg
filename ./datasets/Flickr8k/Images/1167662968_e466f1e80a.jpg
filename ./datasets/Flickr8k/Images/3724738804_f00748a137.jpg
90% of images processed
filename ./datasets/Flickr8k/Images/3159569570_df

In [None]:
print('Training dataset encoding took {:.1f} minutes'.format(time_train / 60))

In [None]:
transfer_models.save_features(train_transfer_values, config.PATH.FEATURES_PATH, features_file_train)

In [None]:
### encode features for training images
start = time.time()
printswitch=True
test_transfer_values = transfer_models.use_pretrained_model_for_images(test_filenames_with_all_captions, 
                                                                       config.ATTENTION,printswitch, 
                                                                       batch_size=config.ENCODER.BATCH_SIZE)
print("test_transfer_values.shape",test_transfer_values.shape)
time_train = time.time() - start

In [None]:
transfer_models.save_features(test_transfer_values, config.PATH.FEATURES_PATH, features_file_test)