In [1]:
from os import walk

# load the train images and their relative paths
coco_train_path = "../data/coco/train2014/"
coco_train = next(walk(coco_train_path), (None, None, []))[2]  # gives [] if no file
coco_train = list(map(lambda orig_string: coco_train_path + orig_string, coco_train)) # add relative path to each file name

# load the val images and their relative paths
coco_val_path = "../data/coco/val2014/"
coco_val = next(walk(coco_val_path), (None, None, []))[2]  # gives [] if no file
coco_val = list(map(lambda orig_string: coco_val_path + orig_string, coco_val)) # add relative path to each file name

In [2]:
'''For remapping the image names in visdial1.0 train json to the relative paths of the images for my own convenience'''
import json
from tqdm import tqdm
from time import sleep

file = json.load(open('../data/v1.0/visdial_1.0_train.json'))

'''Remap the images to their relative paths'''
for dialog in tqdm(file['data']['dialogs']):  
    # stores the relative of each image
    # coco_train + coco_val because some images are from train and some from val
    file['data']['dialogs'][file['data']['dialogs'].index(dialog)]['image_id'] = [image_id for image_id in coco_train+coco_val if str(dialog['image_id']) in image_id][0]

# size of the data subset
file['volume'] = 1.0 # 100 percent
file['data']['questions'] = [s + '?' for s in file['data']['questions']] # add question mark to each question for GoG format

# save the 100% data with relative image paths
with open('../data/subsets/visdial_1.0_train_100percent_subset.json', 'w') as outfile:
    json.dump(file, outfile)
# save the questions to a file
with open('../data/subsets/visdial_1.0_train_100percent_subset_questions.txt', mode='wt', encoding='utf-8') as outfile:
    outfile.write('\n'.join(file['data']['questions']))

100%|██████████| 123287/123287 [1:30:19<00:00, 22.75it/s]


In [1]:
'''For creating the data subsets'''
import json

data = json.load(open('../data/v1.0/visdial_1.0_train.json'))['data'] # dict_keys(['dialogs', 'answers', 'questions'])
dialogs = data['dialogs']
answers = data['answers']
questions = data['questions']

In [3]:
import random

random.seed(42)  # for reproducibility

'''Change the usage to whatever percentage of the data you want to work upon'''
usage = 0.1  # 10% of the data
subset = random.sample(dialogs, round(usage*len(dialogs)))  # sample 1% of the data

subset_questions = []
subset_answers = []

for dialog in subset:
    for dialog_round in dialog['dialog']:
        subset_questions.append(questions[dialog_round['question']])
        subset_answers.append(answers[dialog_round['answer']])
        subset_answers += map(answers.__getitem__,dialog_round['answer_options'])

subset_questions = list(set(subset_questions))
subset_answers = list(set(subset_answers))

In [4]:
from tqdm import tqdm
from time import sleep

captions = [d['caption'] for d in subset]  # list of captions

'''Remap the question, answer, and answer_options indices to the new subset indices
Note: Ground truth answers are not remapped, since it indicates the relative order of the ground truth answer from the answer_options for each dialog round'''
for dialog in tqdm(subset):  # dict_keys(['image_id', 'caption', 'dialog'])
    # stores the relative of each image
    # coco_train + coco_val because some images are from train and some from val
    image_id = [s for s in coco_train +
                coco_val if str(dialog['image_id']) in s][0]
    subset[subset.index(dialog)]['image_id'] = image_id

    # for each dialog round
    for dialog_round in dialog['dialog']:
        # remap the answer index to the new subset index
        old_answer_value = answers[dialog_round['answer']]
        subset[subset.index(dialog)]['dialog'][dialog['dialog'].index(
            dialog_round)]['answer'] = subset_answers.index(old_answer_value)

        # remap the question index to the new subset index
        old_question_value = questions[dialog_round['question']]
        subset[subset.index(dialog)]['dialog'][dialog['dialog'].index(
            dialog_round)]['question'] = subset_questions.index(old_question_value)

        # remap the answer_options indices to the new subset indices
        old_answer_options_values = list(
            map(answers.__getitem__, dialog_round['answer_options']))
        subset[subset.index(dialog)]['dialog'][dialog['dialog'].index(dialog_round)]['answer_options'] = [
            subset_answers.index(a) for a in old_answer_options_values]

        # nothing to do for the ground truth as indicated above

100%|██████████| 12329/12329 [28:00:25<00:00,  8.18s/it]  


In [17]:
# formatting it like the VisDial v1.0 data
subset_data = {'version': 1.0, 'split': 'train subset', 'volume': usage, 'data': {
    'dialogs': subset, 'questions': subset_questions, 'answers': subset_answers}}

# save the subset_data
with open('../data/subsets/visdial_1.0_train_10percent_subset.json', 'w') as outfile:
    json.dump(subset_data, outfile)

# save the questions in the form of the GoG Paper
questions = [question + '?' for question in subset_data['data']['questions']]
# save the subset_data
with open('../data/subsets/visdial_1.0_train_10percent_subset_questions.txt', mode='wt', encoding='utf-8') as outfile:
    outfile.write('\n'.join(questions))


In [18]:
# load the original and the two subsets
hundred_percent = json.load(open('../data/subsets/visdial_1.0_train_100percent_subset.json'))['data']
one_percent = json.load(open('../data/subsets/visdial_1.0_train_1percent_subset.json'))['data']
ten_percent = json.load(open('../data/subsets/visdial_1.0_train_10percent_subset.json'))['data']

In [31]:
import pandas as pd
print("Comparison of the whole dataset vs the new ones: ")
comparison = {'dataset size:': [len(hundred_percent['dialogs']), len(ten_percent['dialogs']), len(one_percent['dialogs'])],
              'Unique images:': [len(hundred_percent['dialogs']), len(ten_percent['dialogs']), len(one_percent['dialogs'])],
              'Unique questions': [len(hundred_percent['questions']), len(ten_percent['questions']), len(one_percent['questions'])],
              'Unique answers': [len(hundred_percent['answers']), len(ten_percent['answers']), len(one_percent['answers'])]}
# Note that all dialogs have a unique image
comparison = pd.DataFrame(data=comparison, index=['VisDial1.0 Train', '10 percent VisDial1.0 Train', '1 percent VisDial1.0 Train'])
comparison


Comparison of the whole dataset vs the new ones: 


Unnamed: 0,dataset size:,Unique images:,Unique questions,Unique answers
VisDial1.0 Train,123287,123287,376082,337527
10 percent VisDial1.0 Train,12329,12329,58069,337517
1 percent VisDial1.0 Train,1233,1233,8402,268020


In [4]:
'''Preparing the subsets for davidnvq's visdial code, i.e. making the subset exactly the same as the original VisDial v1.0 data'''
import json

# all = json.load(open('../data/v1.0/visdial_1.0_train.json'))
subset = json.load(open('../data/subsets/visdial_1.0_train_1percent_subset.json'))

# delete the last 4 characters of the image_id (.jpg extension) and then extract the last 6 characters
# subset['data']['dialogs'][0]['image_id'] = int(subset['data']['dialogs'][0]['image_id'][:-4][-6:])

# do this for all dialogs
for dialog in subset['data']['dialogs']:
    dialog['image_id'] = int(dialog['image_id'][:-4][-6:])

# save the subset_data
with open('visdial_1.0_train.json', 'w') as outfile:
    json.dump(subset, outfile)