In [1]:
'''Load the three main flavours of GloVe models'''
import pandas as pd

def load_glove_model(glove_file):
    readable_name = "-".join(glove_file.rstrip('.txt').split('/')[-1].split(".")).capitalize()
    print(f"Loading {readable_name} Model:")
    df = pd.read_csv(glove_file, sep=" ", quoting=3, header=None, index_col=0)
    glove_model = {key: val.values for key, val in df.T.items()}
    print(f"Done. {readable_name} with a vocabulary of {len(glove_model)} words was loaded!")
    return glove_model

glove_6b_300 = load_glove_model('../embeddings/glove/glove.6B.300d.txt')
glove_twitter_27b_200 = load_glove_model('../embeddings/glove/glove.twitter.27B.200d.txt')
glove_840b_300 = load_glove_model('../embeddings/glove/glove.840B.300d.txt')

Loading Glove-6b-300d Model:
Done. Glove-6b-300d with a vocabulary of 399998 words was loaded!
Loading Glove-twitter-27b-200d Model:
Done. Glove-twitter-27b-200d with a vocabulary of 1193513 words was loaded!
Loading Glove-840b-300d Model:
Done. Glove-840b-300d with a vocabulary of 2196009 words was loaded!


In [29]:
'''Checking the Out-of-Vocabulary (OOV) words for the Questions'''
import itertools
from nltk.tokenize import RegexpTokenizer

# read text file
with open('../data/visdial_1.0_train_questions.txt') as file:
    questions = [line.rstrip() for line in file]

# remove punctuations
tokenizer = RegexpTokenizer(r'\w+')  # to remove punctuations
tokenized_questions = [tokenizer.tokenize(question.lower()) for question in questions]

# checking if the words in the vocab are present in the glove model
question_vocab = set(list(itertools.chain.from_iterable(tokenized_questions)))
q_oov_glove_6b_300 = [word for word in question_vocab if word not in glove_6b_300.keys()]
q_oov_glove_twitter_27b_200 = [word for word in question_vocab if word not in glove_twitter_27b_200.keys()]
q_oov_glove_840b_300 = [word for word in question_vocab if word not in glove_840b_300.keys()]

print(f"Number of words in the vocab: {len(question_vocab)}")
print(f"Number of words in the vocab not present in glove.6B.300d.txt: {len(q_oov_glove_6b_300)}")
print(f"Number of words in the vocab not present in glove.twitter.27B.200d.txt: {len(q_oov_glove_twitter_27b_200)}")
print(f"Number of words in the vocab not present in glove.840B.300d.txt: {len(q_oov_glove_840b_300)}")

Number of words in the vocab: 15697
Number of words in the vocab not present in glove.6B.300d.txt: 487
Number of words in the vocab not present in glove.twitter.27B.200d.txt: 805
Number of words in the vocab not present in glove.840B.300d.txt: 161


In [30]:
'''Checking the Out-of-Vocabulary (OOV) words for the History'''
import json
from nltk.tokenize import RegexpTokenizer
import itertools

'''Load the history'''
with open("../embeddings/history/100/history.json", 'r') as f:
    history = json.load(f)

# tokenize the history
tokenizer = RegexpTokenizer(r'\w+')  # to remove punctuations
history_flatlist = list(itertools.chain.from_iterable(history))
tokenized_history = [tokenizer.tokenize(sentence.lower()) for sentence in history_flatlist]

# checking if the words in the vocab are present in the glove model
history_vocab = set(list(itertools.chain.from_iterable(tokenized_history)))
h_oov_glove_6b_300 = [word for word in history_vocab if word not in glove_6b_300.keys()]
h_oov_glove_twitter_27b_200 = [word for word in history_vocab if word not in glove_twitter_27b_200.keys()]
h_oov_glove_840b_300 = [word for word in history_vocab if word not in glove_840b_300.keys()]

print(f"Number of words in the vocab: {len(history_vocab)}")
print(f"Number of words in the vocab not present in glove.6B.300d.txt: {len(h_oov_glove_6b_300)}")
print(f"Number of words in the vocab not present in glove.twitter.27B.200d.txt: {len(h_oov_glove_twitter_27b_200)}")
print(f"Number of words in the vocab not present in glove.840B.300d.txt: {len(h_oov_glove_840b_300)}")


Number of words in the vocab: 28280
Number of words in the vocab not present in glove.6B.300d.txt: 3366
Number of words in the vocab not present in glove.twitter.27B.200d.txt: 4336
Number of words in the vocab not present in glove.840B.300d.txt: 1863


In [55]:
'''Tokenize a sentence using a glove model, pass it through an LSTm and return the last LSTM hidden state'''
import torch
import torch.nn as nn
import torch.nn.functional as F

def sentence_LSTM(glove_model, sentence):
    # tokenize the sentence
    tokenizer = RegexpTokenizer(r'\w+')  # to remove punctuations
    tokenized_sentence = tokenizer.tokenize(sentence.lower())
    
    # get the glove embedding for each word in the sentence
    glove_embeddings = []
    for word in tokenized_sentence:
        if word in glove_model.keys():
            glove_embeddings.append(glove_model[word])
        else:
            glove_embeddings.append(glove_model['unk'])

    # convert the glove embeddings to a tensor
    glove_embeddings = torch.tensor(glove_embeddings).float()

    # pass the glove embeddings through an LSTM
    # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
    lstm = nn.LSTM(input_size=300, hidden_size=300, num_layers=1, batch_first=True)
    lstm_out, (h_n, c_n) = lstm(glove_embeddings.view(1, len(tokenized_sentence), 300))
    
    # return the last hidden state
    return h_n.squeeze(0)

'''Testing LSTM hidden state output'''
sentence_LSTM(glove_6b_300, "What is the color of the shirt?").shape

torch.Size([1, 300])

: 

In [None]:
'''e2e Coreference Resolution from the paper Higher-order Coreference Resolution with Coarse-to-fine Inference by Li et al. (2018)'''

