# Description of the data preprocessing

The dataset (training, validation, and test sets) used in this experiment is based on the paper ["Order-Embeddings of Images and Language"](https://arxiv.org/pdf/1511.06361.pdf).

Download data before preprocessing:

* Data with VGG19 features: [here](http://www.cs.toronto.edu/~vendrov/order/coco.zip)
* Captions and annotation information from the COCO site: [here](https://storage.googleapis.com/trl_data/coco_annotations_2014.zip)

Original source: [COCO website](http://cocodataset.org/#download)

In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 100
import re
import json
import nltk
import string
from string import punctuation

# Relative directory names. DATA_PATH is joined with file names through
# os.path.join in the cells below; EMBEDDING_PATH and MODEL_PATH are not
# referenced in the visible part of this notebook.
DATA_PATH = 'data'
EMBEDDING_PATH = 'embeddings'
MODEL_PATH = 'models'

In [2]:
import _pickle as cPickle

# reading file in pickle format
def readPickle(pickleFilename):
    """Load and return the object stored in a pickle file.

    Args:
        pickleFilename: Path of the pickle file to read.

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If the file cannot be opened.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    # The context manager closes the file even when unpickling raises.
    with open(pickleFilename, 'rb') as f:
        return cPickle.load(f)

def savePickle(dataToWrite, pickleFilename):
    """Serialize an object to a pickle file, overwriting any existing file.

    Args:
        dataToWrite: Any picklable Python object.
        pickleFilename: Destination path of the pickle file.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # The context manager flushes and closes the file even when dumping raises.
    with open(pickleFilename, 'wb') as f:
        cPickle.dump(dataToWrite, f)

### 1. Preprocessed data (caption, image pairs) with VGG19 features

In [3]:
def load_dataset(path, cnn, fold=0):
    """Read one fold of captions and image features for every split.

    Captions are read from ``<path>/<name>.txt`` (one per line, kept as
    whitespace-stripped bytes) and features from
    ``<path>/images/<cnn>/<name>.npy``. The 'dev' split is read from the
    files named 'val'.

    Args:
        path: Root directory of the dataset.
        cnn: Name of the feature sub-directory under ``<path>/images``.
        fold: Index of the fold to keep. A fold spans 10000 feature rows and
            50000 captions for 'train', and 1000 feature rows and 5000
            captions for 'test' and 'dev'.

    Returns:
        A dict keyed by 'train', 'test' and 'dev'; each value is a dict with
        'caps' (list of bytes) and 'ims' (the sliced array from ``np.load``).
    """
    # (split key, feature rows per fold, captions per fold)
    fold_layout = (
        ('train', 10000, 50000),
        ('test', 1000, 5000),
        ('dev', 1000, 5000),
    )

    dataset = {}
    for split, ims_per_fold, caps_per_fold in fold_layout:
        # On disk the development split is called 'val'.
        file_stem = split if split != 'dev' else 'val'

        with open('%s/%s.txt' % (path, file_stem), 'rb') as caption_file:
            captions = [raw_line.strip() for raw_line in caption_file]

        features = np.load('%s/images/%s/%s.npy' % (path, cnn, file_stem))

        dataset[split] = {
            'caps': captions[fold * caps_per_fold:(fold + 1) * caps_per_fold],
            'ims': features[fold * ims_per_fold:(fold + 1) * ims_per_fold],
        }

    return dataset

In [4]:
# Load fold 0 of every split, using the '10crop' feature directory. The data
# root comes from DATA_PATH so it stays in sync with the other cells.
dataset = load_dataset(DATA_PATH, cnn='10crop')

In [27]:
# Training split: at most the first 10000 feature rows and 50000 captions (fold 0).
train_data = dataset['train']

In [28]:
# Validation split; load_dataset stores it under 'dev' after reading the 'val' files.
val_data = dataset['dev']

In [29]:
# Test split: at most the first 1000 feature rows and 5000 captions (fold 0).
test_data = dataset['test']

In [None]:
# uncomment to save
# savePickle(train_data, os.path.join(DATA_PATH, 'train_data'))
# savePickle(val_data, os.path.join(DATA_PATH, 'val_data'))
# savePickle(test_data, os.path.join(DATA_PATH, 'test_data'))

### 2. Raw data from original source

### Instance information of the validation set

In [9]:
# Decode the JSON explicitly as UTF-8 instead of relying on the platform's
# default text encoding, which is not UTF-8 on every system.
with open(os.path.join(DATA_PATH, 'instances_val2014.json'),
          encoding='utf-8') as json_file:
    coco_instances_val = json.load(json_file)

In [10]:
# Show the top-level sections of the instances annotation file.
coco_instances_val.keys()

dict_keys(['licenses', 'annotations', 'info', 'categories', 'images'])

In [11]:
# Number of image records in the validation instances file.
len(coco_instances_val['images'])

40504

In [12]:
# Number of category records in the validation instances file.
len(coco_instances_val['categories'])

80

In [13]:
# Peek at the first ten category records.
coco_instances_val['categories'][:10]

[{'id': 1, 'name': 'person', 'supercategory': 'person'},
 {'id': 2, 'name': 'bicycle', 'supercategory': 'vehicle'},
 {'id': 3, 'name': 'car', 'supercategory': 'vehicle'},
 {'id': 4, 'name': 'motorcycle', 'supercategory': 'vehicle'},
 {'id': 5, 'name': 'airplane', 'supercategory': 'vehicle'},
 {'id': 6, 'name': 'bus', 'supercategory': 'vehicle'},
 {'id': 7, 'name': 'train', 'supercategory': 'vehicle'},
 {'id': 8, 'name': 'truck', 'supercategory': 'vehicle'},
 {'id': 9, 'name': 'boat', 'supercategory': 'vehicle'},
 {'id': 10, 'name': 'traffic light', 'supercategory': 'outdoor'}]

### Caption information of the validation set

In [14]:
# Decode the JSON explicitly as UTF-8 instead of relying on the platform's
# default text encoding, which could garble non-ASCII caption text.
with open(os.path.join(DATA_PATH, 'captions_val2014.json'),
          encoding='utf-8') as json_file:
    coco_caption_val = json.load(json_file)

In [15]:
# Show the top-level sections of the captions annotation file.
coco_caption_val.keys()

dict_keys(['licenses', 'annotations', 'info', 'images'])

In [16]:
# Number of image records in the validation captions file.
len(coco_caption_val['images'])

40504

In [17]:
# Caption count we would get if every image had exactly five captions.
len(coco_caption_val['images']) * 5

202520

In [18]:
# Actual number of caption annotations (compare with 5 * number of images).
len(coco_caption_val['annotations'])

202654

In [19]:
# Peek at the first image record.
coco_caption_val['images'][:1]

[{'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000391895.jpg',
  'date_captured': '2013-11-14 11:18:45',
  'file_name': 'COCO_val2014_000000391895.jpg',
  'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
  'height': 360,
  'id': 391895,
  'license': 3,
  'width': 640}]

In [20]:
# Peek at the first five caption annotations.
coco_caption_val['annotations'][:5]

[{'caption': 'A bicycle replica with a clock as the front wheel.',
  'id': 37,
  'image_id': 203564},
 {'caption': 'A black Honda motorcycle parked in front of a garage.',
  'id': 38,
  'image_id': 179765},
 {'caption': 'A room with blue walls and a white sink and door.',
  'id': 49,
  'image_id': 322141},
 {'caption': 'A car that seems to be parked illegally behind a legally parked car',
  'id': 89,
  'image_id': 16977},
 {'caption': 'A large passenger airplane flying through the air.',
  'id': 98,
  'image_id': 106140}]

In [None]:
from keras.preprocessing import image
import matplotlib.pyplot as plt

In [None]:
# Path of one sample validation image, loaded and displayed in the next cells.
filepath = os.path.join(DATA_PATH, 'sampled_images_coco/COCO_val2014_000000015260.jpg')

In [None]:
# Load the sample image with a requested target size of 224x224.
# NOTE(review): presumably chosen to match the VGG19 input size — confirm.
img = image.load_img(filepath, target_size=(224,224))

In [None]:
# Display the loaded image with the axes hidden.
plt.imshow(img)
plt.axis("off")
plt.show()

### Storing information to make pairs of raw images and captions

In [21]:
from collections import OrderedDict

# image-id -> image file name, in the order the images are inserted
img_data = OrderedDict()
# caption-id -> caption text, in the order the annotations are inserted
cap_data = OrderedDict()

In [22]:
# (caption-id, image-id) pairs, one appended per caption annotation
cap_imgs = []

In [23]:
# Distinct caption ids and distinct image ids seen in the caption annotations
cap_ids = set()
img_ids = set()

In [24]:
# Map every image id to its file name. Iterating over the records directly
# avoids re-indexing the list on every access.
for img_record in coco_caption_val['images']:
    # raw image data
    img_id = img_record['id']
    img_filename = img_record['file_name']
    img_data[img_id] = img_filename

# Collect caption texts, the ids encountered, and the caption/image pairing.
for annotation in coco_caption_val['annotations']:
    # caption text data
    cap_id = annotation['id']
    cap_ids.add(cap_id)

    cap_img_id = annotation['image_id']
    img_ids.add(cap_img_id)

    cap_text = annotation['caption']
    cap_data[cap_id] = cap_text

    # Remember which image each caption belongs to.
    cap_imgs.append((cap_id, cap_img_id))
    

In [25]:
# Same pairing as cap_imgs, flipped to (image-id, caption-id).
img_caps = [(image_id, caption_id) for caption_id, image_id in cap_imgs]

In [None]:
# uncomment for storing files

# savePickle(img_data, os.path.join(DATA_PATH, 'img_val_data'))
# savePickle(cap_data, os.path.join(DATA_PATH, 'cap_val_data'))
# savePickle(cap_imgs, os.path.join(DATA_PATH, 'cap_imgs'))
# savePickle(img_caps, os.path.join(DATA_PATH, 'img_caps'))
# savePickle(cap_ids, os.path.join(DATA_PATH, 'cap_ids'))
# savePickle(img_ids, os.path.join(DATA_PATH, 'img_ids'))

### Compare raw set and preprocessed VGG19 data set

We only use the validation set for our retrieval task.

In [30]:
# Pair every validation caption with its position in the list, decoding the
# stored bytes as UTF-8 text.
val_cap_dec = [
    (position, raw_caption.decode("utf-8"))
    for position, raw_caption in enumerate(val_data['caps'])
]

In [31]:
def find_element_in_list(element, list_element):
    """Return the index of the first occurrence of ``element``, or None.

    Args:
        element: Value to look for.
        list_element: List to search.

    Returns:
        The position of the first item equal to ``element``, or None when no
        item matches.
    """
    try:
        position = list_element.index(element)
    except ValueError:
        # list.index raises when there is no match; report that as None.
        position = None
    return position

In [32]:
# Convert to an array so the caption column can be sliced out as val_cap[:, 1].
# NOTE(review): mixing int ids with str captions presumably makes NumPy store
# every cell as a string — confirm before relying on the id column's type.
val_cap = np.array(val_cap_dec)

In [33]:
# The caption column does not change while matching, so extract it once
# instead of rebuilding the list for every raw caption.
list_val_caps = val_cap[:, 1].tolist()

# caption text -> index of its first occurrence. setdefault keeps the first
# index, so lookups return exactly what list.index() would, without a
# linear scan per raw caption.
first_idx_by_caption = {}
for idx, val_caption in enumerate(list_val_caps):
    first_idx_by_caption.setdefault(val_caption, idx)

# One (caption-id, index-or-None, normalised text) triple per raw caption.
val_matched = []
for (capid, cap) in cap_data.items():
    # Normalise the raw caption: lower-case, then drop leading/trailing periods.
    txt = cap.lower().strip('.')
    # None when the normalised text is absent from the preprocessed captions.
    idx_cap = first_idx_by_caption.get(txt)
    val_matched.append((capid, idx_cap, txt))

In [None]:
# uncomment to store file
# savePickle(val_matched, os.path.join(DATA_PATH, 'val_matched'))

In [34]:
# Keep only the captions that were found in the preprocessed validation set.
# The test must be `is not None` rather than truthiness: index 0 is a valid match.
val_matched_data = [
    (id1, id2, txt)
    for (id1, id2, txt) in val_matched
    if id2 is not None
]

In [None]:
# uncomment to store file
# savePickle(val_matched_data, os.path.join(DATA_PATH, 'val_matched_data'))