# In [1]:
import json
from collections import Counter
import itertools

import config
import data
import utils

# In [2]:
def prepare_questions(questions_json):
    """ Lazily yield one list of word tokens per question of a VQA-format question json.

        Every question text is lowercased, its final character is dropped
        (presumably the trailing question mark -- the character is not checked),
        and the remainder is split on single spaces. Consecutive spaces
        therefore produce empty-string tokens.
    """
    # Collect all question strings up front so a malformed entry fails on the
    # first iteration rather than somewhere in the middle of the stream.
    texts = [entry['question'] for entry in questions_json['questions']]
    token_lists = (text.lower()[:-1].split(' ') for text in texts)
    for tokens in token_lists:
        yield tokens

def extract_vocab(iterable, top_k=None, start=0):
    """ Build a token -> index mapping from an iterable of token lists.

        The token lists may hold single answers or the word tokens of questions.
        When `top_k` is truthy only the `top_k` most frequent tokens are kept,
        otherwise every token is kept. Indices are consecutive integers
        beginning at `start`, handed out by descending frequency; tokens with
        equal frequency are ordered in reverse lexicographical order.
    """
    frequencies = Counter(itertools.chain.from_iterable(iterable))
    if top_k:
        kept = [token for token, _ in frequencies.most_common(top_k)]
    else:
        kept = list(frequencies)
    # A single reversed sort on (count, token): highest count first, and
    # ties resolved by the token itself in descending order.
    kept.sort(key=lambda token: (frequencies[token], token), reverse=True)
    return dict(zip(kept, range(start, start + len(kept))))


def main():
    """ Build the question vocabulary from the val2014 questions file and cache it.

        Loads the question json, tokenizes it with `prepare_questions`, assigns
        indices starting at 1 with `extract_vocab`, and dumps the resulting
        mapping as json to `data/cache/question_vocab_val`.
    """
    questions_path = "data/v2_OpenEnded_mscoco_val2014_questions.json"
    vocab_path = "data/cache/question_vocab_val"

    with open(questions_path, 'r') as source:
        questions_json = json.load(source)

    # NOTE: answer-vocabulary extraction is disabled in this variant. The
    # disabled path loaded "v2_mscoco_train2014_annotations.json", ran
    # data.prepare_answers on it, called extract_vocab with
    # top_k=config.max_answers, and stored both vocabularies in one dict
    # under the keys 'question' and 'answer'.
    token_stream = prepare_questions(questions_json)
    question_vocab = extract_vocab(token_stream, start=1)

    with open(vocab_path, 'w') as sink:
        json.dump(question_vocab, sink)

# In [3]:
# Run the vocabulary build only when executed as a script, not on import.
if "__main__" == __name__:
    main()