In [1]:
import json

# set the data subset to use
subset = 1
# load data
data = json.load(open('../data/subsets/visdial_1.0_train_' +
                 str(subset) + 'percent_subset.json'))['data']
# load answers
answers = data['answers']
answers[:5]

['both surfboards are white',
 'there is some sort of awning in the foreground',
 'i think so, kind of hard to see',
 "there is 1 man and 1 woman an then another man's arm",
 'i think so 1 eye is facing the camera']

In [2]:
'''Loading Glove'''
import pandas as pd


def load_glove_model(glove_file):
    readable_name = "-".join(glove_file.rstrip('.txt').split('/')
                             [-1].split(".")).capitalize()
    print(f"Loading {readable_name} Model:")
    df = pd.read_csv(glove_file, sep=" ", quoting=3, header=None, index_col=0)
    glove_model = {key: val.values for key, val in df.T.items()}
    print(
        f"Done. {readable_name} with a vocabulary of {len(glove_model)} words was loaded!")
    return glove_model


glove_6b_300 = load_glove_model('../embeddings/glove/glove.6B.300d.txt')

Loading Glove-6b-300d Model:
Done. Glove-6b-300d with a vocabulary of 399998 words was loaded!


In [8]:
'''Tokenize a sentence using a glove model, pass it through an LSTM and return the hidden state of each word'''
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import nltk
import numpy as np


def sentence_to_hidden_state(sentence, glove_model, hidden_dim, batch_size):
    # tokenize the sentence: words, punctuations are individual tokens
    tokenized_sentence = nltk.word_tokenize(sentence)
    # get the glove vectors for each word
    glove_vectors = []
    for word in tokenized_sentence:
        if word in glove_model:
            glove_vectors.append(glove_model[word])
        else:
            glove_vectors.append(glove_model['unk'])
    # create the input tensor
    input_tensor = torch.FloatTensor(np.array(glove_vectors))
    # pad the input tensor to 20 words if it is shorter, or truncate it if it is longer
    if len(tokenized_sentence) < 20:
        input_tensor = F.pad(input_tensor, (0, 0, 0, 20-len(tokenized_sentence)))
    else:
        input_tensor = input_tensor[:20]
    # create the LSTM
    lstm = nn.LSTM(300, hidden_dim)
    # pass the input tensor through the LSTM and save the hidden state of each word
    hidden_states = []
    for i in range(len(tokenized_sentence)):
        hidden_state, _ = lstm(input_tensor[:i+1].unsqueeze(1))
        hidden_states.append(hidden_state[-1].detach().numpy().squeeze())
    hidden_states = np.array(hidden_states)
    # pass the hidden states of the sentence containing n words through a linear layer to get the final hidden state of 1*512
    hidden_dim = hidden_states.shape[1]
    linear = nn.Linear(hidden_dim, 512)
    hidden_states = linear(torch.FloatTensor(hidden_states)).detach().numpy()
    # max pool the hidden states to get the final hidden state of 1*512
    hidden_states = np.max(hidden_states, axis=0)
    return hidden_states

'''Test the function'''
sentence = "This is a test sentence?"
hidden_states = sentence_to_hidden_state(sentence, glove_6b_300, 300, 1)
hidden_states.shape

(512,)

In [9]:
from tqdm import tqdm
import pickle

# create a pickle file to store the hidden states of the answers
with open('../embeddings/answers/' + str(subset) + '/answer_embeddings.pkl', 'wb') as f:
    for answer in tqdm(answers):
        hidden_states = sentence_to_hidden_state(answer, glove_6b_300, 300, 1)
        pickle.dump(hidden_states, f)


100%|██████████| 268020/268020 [43:05<00:00, 103.67it/s]


In [10]:
'''Load the hidden states of the answers'''
import pickle

subset = 1
# load the hidden states of the answers
with open('../embeddings/answers/' + str(subset) + '/answer_embeddings.pkl', 'rb') as f:
    answer_embeddings = []
    while True:
        try:
            answer_embeddings.append(pickle.load(f))
        except EOFError:
            break

In [11]:
import numpy as np

answer_embeddings = np.array(answer_embeddings)
answer_embeddings.shape

(268020, 512)