# This notebook shows how features are extracted from the questions, using a word2vec model to project each token into a 300-dimensional space

In [15]:
import spacy
import numpy as np
import json
import pickle
import os

## Load the word2vec model

In [2]:
# Load the spaCy 'en' model, asking for the 'en_glove_cc_300_1m_vectors' vectors.
# NOTE(review): the `vectors=` keyword looks like the spaCy 1.x loading API;
# confirm it is still accepted by the installed spaCy version.
word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')

## Function to extract the features from the question

In [3]:
def get_question_features(question, embedding, max_tokens=30, vector_dim=300):
    ''' Return the time-series tensor of token vectors for a question.

    The question is tokenized by calling `embedding(question)`; each token's
    `.vector` is written into one row of a zero-initialised tensor, so
    questions shorter than `max_tokens` are zero-padded at the end and
    longer questions are truncated to their first `max_tokens` tokens.

    Parameters:
        question: the question text (a unicode string).
        embedding: callable that maps the text to an iterable of tokens,
            each exposing a `.vector` of length `vector_dim`.
        max_tokens: number of time steps (rows) in the output. Default 30.
        vector_dim: length of each token vector. Default 300.

    Returns:
        A numpy array of shape (1, max_tokens, vector_dim).
    '''
    tokens = embedding(question)
    question_tensor = np.zeros((1, max_tokens, vector_dim))
    # zip() stops at the shorter input, so a question with more than
    # `max_tokens` tokens is truncated instead of raising IndexError.
    # range() is used instead of xrange() so this runs on Python 2 and 3.
    for j, token in zip(range(max_tokens), tokens):
        question_tensor[0, j, :] = token.vector
    return question_tensor

## Load the questions

In [4]:
# Location of the questions JSON file, relative to the notebook's working directory.
questions_path = '../../data/OpenEnded_mscoco_val2014_questions.json'

In [5]:
# Parse the whole questions file into memory; the `with` block closes the
# file handle as soon as json.load() returns.
with open(questions_path) as data_file:    
    data = json.load(data_file)

In [6]:
# Keep only the entries stored under the 'questions' key.
# NOTE(review): each entry is presumably a dict with 'question_id' and
# 'question' keys, since those are the keys read when saving the features.
questions = data['questions']

## Extract features from the questions and save them to disk

In [7]:
from tqdm import tqdm_notebook

In [8]:
# Output directory for the per-question feature files.
save_path = '../../data/preprocessed_questions/'

In [14]:
# Create the output directory up front so the first np.save() does not fail
# when the folder is missing.
if not os.path.isdir(save_path):
    os.makedirs(save_path)

# Write one .npy file per question, named after its question_id.
for question in tqdm_notebook(questions):
    out_name = str(question['question_id'])+'.npy'
    res = get_question_features(question[u'question'], word_embeddings)
    # Hand np.save the path rather than an open() handle: numpy then opens
    # and closes the file itself, so no file descriptor is leaked per question.
    np.save(os.path.join(save_path, out_name), res)


