## Flickr30k Captions to Corpus

*   P. Young, A. Lai, M. Hodosh, and J. Hockenmaier. _From image description to visual denotations: New similarity metrics for semantic inference over event descriptions._ Transactions of the Association for Computational Linguistics (to appear).

In [None]:
import os

import numpy as np

import datetime
# Wall-clock time when the notebook started; it is formatted into the
# name of the pickle written by the final cell.
t_start = datetime.datetime.now()

import pickle

In [None]:
# Directory that holds the downloaded Flickr30k files.
data_path = './data/Flickr30k'
# (disabled) image_path = os.path.join(data_path, 'flickr30k-images')

# Derived outputs are written next to the data directory, under ../cache.
cache_dir = os.path.join(
    data_path,
    '../cache/Flickr30k',
)

# (disabled) BATCHSIZE=16

#### Plan 

*  Have a look inside the captions archive ```flickr30k.tar.gz```, which includes ```results_20130124.token```
*  Extract contents of ```flickr30k.tar.gz``` to ```dict( photo_id -> [captions] )```
*  Select the subset of those ```photo_id```s that we want to convert
*  Save off image array and corpus to an easy-to-load filetype

In [None]:
# NLTK provides the English stop-word list used later in this notebook.
import nltk
# Request the 'stopwords' corpus so that `stopwords.words(...)` can be used below.
nltk.download('stopwords')

from nltk.corpus import stopwords

In [None]:
# Build  image filename -> [captions]  from the token file stored inside the
# Flickr30k captions archive.  Each line of that file is parsed as
#     <image>#<caption index> TAB <caption text>
img_to_captions = dict()
# Initialised up-front so the summary below still works (reporting 0/0)
# when the archive has not been downloaded.
n_captions = 0

tarfilepath = os.path.join(data_path, 'flickr30k.tar.gz')
if os.path.isfile(tarfilepath):
    import tarfile
    # Manage the archive itself with `with` as well: closing only the
    # extracted member would leave the underlying tar file handle open.
    with tarfile.open(tarfilepath, 'r:gz') as archive:
        with archive.extractfile('results_20130124.token') as tokenized:
            # Iterate lazily instead of loading every line with readlines()
            for l in tokenized:
                # Lines are bytes; split on the first tab only so that a
                # caption containing a tab does not break the unpacking.
                img_num, caption = l.decode("utf-8").strip().split("\t", 1)
                img, num = img_num.split("#")
                img_to_captions.setdefault(img, []).append(caption)
                n_captions += 1

print("Found %d images, with a total of %d captions" % (len(img_to_captions),n_captions, ))

In [None]:
# Default selection: keep every image, and label the selection 'all'
# (the label ends up in the output filename).
good_img_to_captions = img_to_captions
good_img_to_captions_title = 'all'
len(good_img_to_captions)

In [None]:
# Narrow the selection to images where at least one caption mentions a
# cat or kitten (singular or plural, any letter case, whole words only).
import re
good_caption = re.compile( r'\b(cat|kitten)s?\b', flags=re.IGNORECASE )

good_img_to_captions = {
    img: captions
    for img, captions in img_to_captions.items()
    if any(good_caption.search(caption) for caption in captions)
}  # debugging aid kept from the original: img=='3947306345.jpg'

good_img_to_captions_title = 'feline'
len(good_img_to_captions)

In [None]:
# Selected image filenames in sorted order (iterating a dict yields its keys).
img_arr = sorted(good_img_to_captions)

In [None]:
# Vocabulary extraction: count every lower-cased, whitespace-separated
# token across the captions of the selected images.
word_freq = {}

for img in img_arr:
    for caption in img_to_captions[img]:
        for w in caption.lower().split():
            word_freq[w] = word_freq.get(w, 0) + 1

# (count, word) pairs, most frequent first.  Every stored count is >= 1,
# so no entry needs filtering out.
freq_word = [(count, word) for word, count in word_freq.items()]
freq_word.sort(reverse=True)

# Cell output: vocabulary size and the 20 most frequent tokens
len(freq_word), freq_word[:20]

In [None]:
# Inspect the tokens that are neither purely alphabetic nor hyphenated,
# together with their counts.
[
    (w, f)
    for w, f in word_freq.items()
    if not (w.isalpha() or '-' in w) and f > 0
]

In [None]:
# English stop words from NLTK, as a set for fast membership tests.
english_stop_list = stopwords.words('english')
stop_words = set(english_stop_list)

# Punctuation marks to be treated like stop words, one per
# whitespace-separated field of the string.
punc = set("- . , : ; ' \" & $ % ( ) ! ?".split())

# Spot-check a few common function words against the stop-word set
[(w, w in stop_words) for w in "while with of at in".split()]

In [None]:
# Stop words and punctuation marks that actually occur in the vocabulary.
ignorable = stop_words | punc
stop_words_seen = set(word_freq) & ignorable

# The joined string is not the cell's last expression, so it is not shown.
', '.join(stop_words_seen)
len(stop_words_seen), len(stop_words)

In [None]:
# Save the data into a useful structure

# Fixed seed so that the random numbers stored in `train_test` are the same
# on every run, giving a consistent train/test assignment for this subset.
np.random.seed(1)
save_me = dict(
    img_arr = img_arr,                                 # sorted image filenames
    img_to_captions = good_img_to_captions,            # image filename -> [captions]
    train_test = np.random.random( (len(img_arr),) ),  # one random float per entry of img_arr
    stop_words_seen = stop_words_seen,                 # stop words / punctuation found in the vocab
)

# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
os.makedirs(cache_dir, exist_ok=True)

# The filename carries the notebook start time and the subset title.
with open( os.path.join(cache_dir, 'subset_%s_%s.pkl' % (
            t_start.strftime("%Y-%m-%d_%H-%M"), good_img_to_captions_title,)),
          'wb') as f:
    pickle.dump(save_me, f)

print("Subset saved")