## Flickr30k to Features

*   P. Young, A. Lai, M. Hodosh, and J. Hockenmaier. _From image description to visual denotations: New similarity metrics for semantic inference over event descriptions._ Transactions of the Association for Computational Linguistics (to appear).



In [None]:
import os
#import tensorflow as tf
import tensorflow.contrib.keras as keras
import numpy as np

In [None]:
# Root folder of the Flickr30k download, and the sub-folder (joined onto it)
# from which the image files are loaded further down.
data_path = './data/Flickr30k'
image_path = os.path.join(data_path, 'flickr30k-images')

In [None]:
from tensorflow.contrib.keras.api.keras.applications.inception_v3 import decode_predictions
from tensorflow.contrib.keras.api.keras.preprocessing import image as keras_preprocessing_image

In [None]:
from tensorflow.contrib.keras.api.keras.applications.inception_v3 import InceptionV3, preprocess_input

In [None]:
# Build InceptionV3 with the 'imagenet' weights, without its top layers
# (include_top=False) and with 'avg' pooling requested on the output, so the
# model is used here as a feature extractor rather than a classifier.
model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')
print("InceptionV3 loaded")

#### Plan 

*  Have a look inside the captions archive ```flickr30k.tar.gz```, which includes ```results_20130124.token```
*  Extract contents of ```flickr30k.tar.gz``` to ```dict( photo_id -> [captions] )```
*  Select the subset of those ```photo_id``` entries to convert
*  Run InceptionV3 over the list
*  Save off features to an easy-to-load filetype


In [None]:
# Map each image filename to the list of its captions, read directly from the
# token file stored inside the (still compressed) captions archive.
img_to_captions = dict()
# Initialised before the archive check so the summary print below still works
# (reporting zeros) when the archive has not been downloaded.
n_captions = 0

tarfilepath = os.path.join(data_path, 'flickr30k.tar.gz')
if os.path.isfile(tarfilepath):
    import tarfile
    # Open the archive itself in a `with` as well, so that both the archive
    # and the extracted member are closed when we are done.
    with tarfile.open(tarfilepath, 'r:gz') as captions_tar:
        with captions_tar.extractfile('results_20130124.token') as tokenized:
            for raw_line in tokenized:
                #print(raw_line)  # This is bytes
                line = raw_line.decode("utf-8").strip()
                if not line:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                # Each line is "<image>#<caption index>\t<caption>"; split only
                # on the first tab so a tab inside the caption cannot break
                # the unpacking.
                img_num, caption = line.split("\t", 1)
                img, _ = img_num.split("#")
                #print(img, caption); break
                img_to_captions.setdefault(img, []).append(caption)
                n_captions += 1

print("Found %d images, with a total of %d captions" % (len(img_to_captions),n_captions, ))

In [None]:
# Keep only the images for which at least one caption matches the pattern
import re
good_caption = re.compile( r'\b(cat|kitten)s?\b', flags=re.IGNORECASE )
good_img_to_captions = {
    img: captions   # the full caption list is kept, not just the matching ones
    for img, captions in img_to_captions.items()
    if any(good_caption.search(caption) for caption in captions)
}  # img=='3947306345.jpg'
#good_img_to_captions
len(good_img_to_captions)

In [None]:
# Sorted list of the selected image filenames (iterating a dict yields its keys)
img_arr = sorted(good_img_to_captions)

In [None]:
# Create a generator for preprocessed images
def preprocesed_image_gen():
    """Yield every image named in ``img_arr`` as an array, one image at a time.

    Each file is looked up under ``image_path`` and loaded with a fixed
    ``(299, 299, 3)`` target size before being converted with
    ``img_to_array``.  The target size and each filename are printed as
    progress output.
    """
    #target_size=model.input_shape[1:]
    target_size = (299, 299, 3)
    print("target_size", target_size)
    for filename in img_arr:
        print("img_name", filename)
        loaded = keras_preprocessing_image.load_img(
            os.path.join(image_path, filename),
            target_size=target_size,
        )
        # No batch axis is added here: a single (unbatched) image is yielded.
        yield keras.preprocessing.image.img_to_array(loaded)

def image_batch(batchsize=16):
    """Yield the images from ``preprocesed_image_gen()`` in stacked batches.

    Each yielded value is ``preprocess_input`` applied to up to ``batchsize``
    image arrays stacked along a new leading axis.  The last batch contains
    whatever images are left over; when nothing is left over no extra batch
    is produced, so ``np.stack`` is never called on an empty list.

    The trailing batch is yielded from ordinary code rather than from a
    ``finally`` clause: yielding inside ``finally`` makes closing the
    generator raise ``RuntimeError`` and hides genuine image-loading errors
    behind one more (partial) batch.

    Args:
        batchsize: Number of images per batch; must be at least 1.

    Raises:
        ValueError: If ``batchsize`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if batchsize < 1:
        raise ValueError("batchsize must be at least 1, got %r" % (batchsize,))

    arr = []
    for img in preprocesed_image_gen():
        print(img.shape)
        arr.append(img)
        if len(arr) == batchsize:
            yield preprocess_input( np.stack( arr, axis=0 ) )
            arr = []  # start a fresh list; the yielded batch is left untouched

    if arr:  # Last set: fewer than `batchsize` images remain
        print("Last batch of %d elements" % ( len(arr),))
        yield preprocess_input( np.stack( arr, axis=0 ) )


In [None]:
#preprocesed_image_generator = preprocesed_image_gen()
#next(preprocesed_image_generator)

In [None]:
# Disabled (`if False`) manual check: take one batch from image_batch() and run
# it through the model directly, then look at the shape of the result.
if False:
    image_batcher = image_batch()
    batch = next(image_batcher)
    features = model.predict_on_batch(batch)
    features.shape

In [None]:
# This should do the batch creation on the CPU and the analysis on the GPU asynchronously.
# predict_generator must be given a generator *object* (the result of calling
# image_batch), not the generator function itself, and `steps` counts batches
# rather than images - so round the batch count up to include a final partial batch.
feature_batch_size = 16
feature_steps = (len(img_arr) + feature_batch_size - 1) // feature_batch_size
features = model.predict_generator(image_batch(batchsize=feature_batch_size), steps=feature_steps)  #, verbose=1

In [None]:
# Show the shape of the feature array returned by predict_generator
# (presumably one row per image - TODO confirm against len(img_arr)).
features.shape

#    weight_count=[ float(np.sum([keras.backend.count_params(p) for p in set(w)]))/1000./1000. 
#                   for w in [m.trainable_weights, m.non_trainable_weights] ]


In [None]:
if False:
    x = image_to_input(model, preprocess_input_fn, img_path)

    batch = np.tile(x, (batch_size,1,1,1))
    
    t0 = time.time()
    for i in range(iters):
        _ = model.predict(batch,  batch_size=batch_size)
