In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from load_glove import Embedding, Glove, LoadPretrainedGlove
from models import SkipGram, CBOW, NN

In [2]:
# LoadPretrainedGlove loads the pre-trained GloVe embeddings and returns a Glove object.
# With no arguments it uses the default 100d model, which holds about 1.2M token vectors.
# Pass a filepath to load pre-trained embeddings of another dimension
# (GloVe publishes 25d, 50d, 100d, and 200d versions online).
# Note that the 200d file takes a long time to load: presumably ~1.2M vectors * 200 floats.
# NOTE(review): the "27 billion" figure most likely describes the size of the training
# corpus, not the number of vectors -- TODO confirm against the GloVe download page.
# EMBEDDING_DIM is also used further down to build zero vectors and padding, so it has
# to match the vector dimension of whichever model is loaded here.
EMBEDDING_DIM = 100
glove_model = LoadPretrainedGlove() # use default 100d model

In [3]:
# Display the loaded model's repr (token count and vector dimension) as a quick load check.
glove_model

<Glove_object num_tokens.1193517 vec_dim.100>

In [4]:
# Optional sanity check of the Glove helper (safe to skip); only five methods are implemented so far.
# get_vector looks up the embedding of a token. GloVe stores every token in lower case,
# so a capitalized token such as 'Michael' is reported as missing from the model.
glove_model.get_vector('Michael')

'Michael' not in the model


In [5]:
# Optional sanity check of the Glove helper (safe to skip); only five methods are implemented so far.
# similarity retrieves the embeddings of the two given tokens and computes the cosine similarity between them.
glove_model.similarity('excellent', 'great')

0.7804909766934288

In [6]:
# Optional sanity check of the Glove helper (safe to skip); only five methods are implemented so far.
# most_similar maps a word to the most similar words in the model, ranked by cosine similarity;
# the result is a list of (word, similarity) pairs.
glove_model.most_similar('excellent')

[('fantastic', 0.7891919692639917),
 ('great', 0.7804909766934288),
 ('brilliant', 0.7707372695455804),
 ('superb', 0.7450593638693023),
 ('outstanding', 0.7220487414581103),
 ('terrific', 0.7189632766496982),
 ('unique', 0.713248325184925),
 ('article', 0.7111885947728657),
 ('very', 0.7083041440709903),
 ('interesting', 0.7079968065670241)]

In [7]:
# Optional sanity check of the Glove helper (safe to skip); only five methods are implemented so far.
# similarity_embedding computes the similarity between two input embeddings:
# it is simply the cosine similarity between the two vectors, so identical
# vectors score 1 up to floating-point rounding.
glove_model.similarity_embedding([1, 0, 1], [1, 0, 1])

0.9999999999999998

In [8]:
# Optional sanity check of the Glove helper (safe to skip); only five methods are implemented so far.
# most_similar_token finds the words most similar to the input word embedding.
# By default it returns the top 10 most similar words together with their similarity to the input embedding.
# The number of results can be changed with the topn keyword (the evaluation cell passes topn=5).
glove_model.most_similar_token(glove_model.get_vector('excellent'))

[('fantastic', 0.7891919692639917),
 ('great', 0.7804909766934288),
 ('brilliant', 0.7707372695455804),
 ('superb', 0.7450593638693023),
 ('outstanding', 0.7220487414581103),
 ('terrific', 0.7189632766496982),
 ('unique', 0.713248325184925),
 ('article', 0.7111885947728657),
 ('very', 0.7083041440709903),
 ('interesting', 0.7079968065670241)]

In [86]:
# Load the test set; every token of the original data file is used.
# The original set contains many words that are not in the model -- a lot of them are misspelled.

# Maps each unique lower-cased token to its pre-trained embedding,
# or to None when the GloVe model has no vector for it.
token_to_embedding_test = {}
not_pretrained = 0

with open('tokenized.spelling.txt', 'r', encoding='utf-8') as textfile:
    for line in textfile:
        # split() with no arguments also discards the trailing newline
        for token in line.split():
            token = token.lower() # GloVe only encodes lower-case tokens
            if token in token_to_embedding_test:
                continue # each unique token is looked up (and counted) only once
            pretrained = glove_model.get_vector(token)
            # Identity test instead of `== None`: `==` on a vector object may be
            # evaluated element-wise and is not a reliable way to detect None.
            if pretrained is None:
                not_pretrained += 1
            token_to_embedding_test[token] = pretrained

vocab_size = len(token_to_embedding_test)
print('{} unknown tokens that are not in the model'.format(not_pretrained))
# vocab_size counts unique tokens (known and unknown), not sentences
print('{} unique tokens loaded'.format(vocab_size))


'forne' not in the model
'ennoying' not in the model
'sinccerly' not in the model
'pressume' not in the model
'fhing' not in the model
'sathurday' not in the model
'troubbles' not in the model
'programms' not in the model
'organying' not in the model
'concers' not in the model
'caracterize' not in the model
'reasonably-piced' not in the model
'accomodations' not in the model
'emprove' not in the model
'-lrb-' not in the model
'-rrb-' not in the model
'sandwich-man' not in the model
'well-orginised' not in the model
'wezy' not in the model
'apresented' not in the model
'stily' not in the model
'arsts' not in the model
'21-22' not in the model
'reasonaly-priced' not in the model
'recommand' not in the model
'appliaces' not in the model
'whed' not in the model
'visitores' not in the model
'intrestng' not in the model
'promontion' not in the model
'languagers' not in the model
'2' not in the model
'2000' not in the model
'fashonable' not in the model
'mentionned' not in the model
'studenst

'marvellouse' not in the model
'architerture' not in the model
'xviiith' not in the model
'shutel' not in the model
'cotages' not in the model
'romanics' not in the model
'acomodation' not in the model
'resart' not in the model
'tecnonology' not in the model
'arquitector' not in the model
'lugguage' not in the model
'1948' not in the model
'demaged' not in the model
'footh' not in the model
'eye-max' not in the model
'garelty' not in the model
'locted' not in the model
'historcall' not in the model
'picthurs' not in the model
'reseption' not in the model
'wellkown' not in the model
'expectetions' not in the model
'hystorical' not in the model
'now-a-day' not in the model
'washmachines' not in the model
'tradional' not in the model
'inculds' not in the model
'labrary' not in the model
'picadelly' not in the model
'vistit' not in the model
'city-centre' not in the model
'surposed' not in the model
'hestitate' not in the model
'estart' not in the model
'procesing' not in the model
'perfor

In [90]:
# making test input
# Limits: how many test samples to build, and the maximum number of dependents
# kept per governor word (shorter inputs are padded up to this length later on).
TEST_SIZE, max_length = 200, 15
# Accumulators filled while reading the dependency file: the [governor, dependents]
# pairs that were kept, and the number of governors that were skipped.
dependency_listing_test, not_selected = [], 0

def TokensToEmbeddings(token_list):
    """Concatenate the GloVe embeddings of the given tokens into one flat list.

    Each token is looked up with the notebook-level ``glove_model``. A token
    the model does not know (``get_vector`` returns None or raises KeyError)
    contributes ``EMBEDDING_DIM`` zeros instead.

    Args:
        token_list: iterable of token strings.

    Returns:
        A flat list holding the embedding values of all tokens, in order.
    """
    output_list = []
    for token in token_list:
        try:
            embedding = glove_model.get_vector(token)
        except KeyError:
            # tolerate a lookup that raises instead of returning None
            embedding = None
        # Identity test instead of `== None`: `==` on an array-like vector may
        # be evaluated element-wise, which makes the `if` raise or misbehave.
        if embedding is None:
            embedding = [0] * EMBEDDING_DIM
        output_list.extend(embedding)
    return output_list

def TokenToEmbedding(token, token_to_embedding):
    """Look up ``token`` in the ``token_to_embedding`` mapping.

    Returns the stored value, or None when the lookup raises KeyError.
    A stored value of None is returned as-is.
    """
    try:
        return token_to_embedding[token]
    except KeyError:
        return None

# Read the dependency parses; the file is fully parsed by json.load, so it does
# not need to stay open while the relations are grouped.
with open('dependencies.spelling.txt', 'r', encoding='utf-8') as textfile:
    metadata = json.load(textfile)

for sentence in metadata:
    # governor (lower-cased) -> list of its dependents (lower-cased) within this sentence;
    # a governor string that occurs several times in a sentence shares one entry
    sentence_dependencies = {}
    for dep_relation in sentence:
        if dep_relation['dep'] == 'ROOT':
            continue # skip the root token as it doesn't have any dependency
        # word to predict is the governor; collect all of its dependents
        governor = dep_relation['governorGloss'].lower()
        dependent = dep_relation['dependentGloss'].lower()
        sentence_dependencies.setdefault(governor, []).append(dependent)
    for key, value in sentence_dependencies.items():
        # Use max_length rather than a second hard-coded 15, so this filter
        # cannot drift away from the padding length used for the model input.
        if len(value) > max_length:
            not_selected += 1
            continue
        dependency_listing_test.append([key, value])

print('{} / {} words having more than {} dependencies not selected'.format(
    not_selected, len(dependency_listing_test) + not_selected, max_length))


11 / 16112 words having more than 15 dependencies not selected


In [94]:
# Use each unknown word as the label and its dependents as the input.
# Only TEST_SIZE samples are built, because mapping a predicted embedding back to a
# known word is expensive: each prediction needs its cosine similarity against all
# ~1.2M embeddings of the original model before the top-n most similar words can be picked.

# Each sample is [input tensor, placeholder label vector, unknown word text].
# The list is built by appending, so when fewer than TEST_SIZE unknown words exist
# it is simply shorter instead of ending in [0, 0, 0] placeholder rows.
test_input = []

for (target, data) in dependency_listing_test:
    if len(test_input) >= TEST_SIZE:
        break
    label = TokenToEmbedding(target, token_to_embedding_test)
    # Identity test instead of `== None`: `==` on a vector object may be
    # evaluated element-wise and is not a reliable way to detect None.
    if label is not None:
        continue # the governor has a pre-trained embedding, so it is not a test case

    formatted_input = TokensToEmbeddings(data)
    # pad with zeros when the word has fewer than max_length dependents
    padding_length = (max_length - len(data)) * EMBEDDING_DIM
    if padding_length > 0:
        formatted_input.extend([0] * padding_length)

    test_input.append([
        torch.FloatTensor(formatted_input),
        torch.zeros(EMBEDDING_DIM), # no true embedding exists for an unknown word
        target,
    ])

# kept for any later cell that reads the number of samples built
counter = len(test_input)

'apresented' not in the model
'21' not in the model
'22' not in the model
'22' not in the model
'forthy' not in the model
'disadvantatges' not in the model
'45' not in the model
'20:15' not in the model
'...' not in the model
'sciencetist' not in the model
'diseasters' not in the model
'19:30' not in the model
'00.15' not in the model
'3' not in the model


In [96]:
print(len(test_input)) # number of test samples
print(len(test_input[0])) # fields per sample: input, label (vector), label (text)
print(test_input[0][0].shape) # shape of one flattened, padded input (max_length * EMBEDDING_DIM values)

200
3
torch.Size([1500])


In [None]:
# test the best model (relu no softmax)
# NOTE(review): torch.load unpickles the file, so only load model files from a trusted source.
model_nn = torch.load('./nn.nosoftmax.best.model')
# presumably model_nn is an nn.Module, so this switches it to evaluation mode -- TODO confirm
model_nn.eval()
with torch.no_grad():
    # `target` is the placeholder label vector of each sample; it is not used in this loop
    for data, target, text_label in test_input:
        output = model_nn(data)
        # Map the predicted embedding back to the five most similar known words and
        # print them next to the unknown word for manual inspection.
        # TODO: the original plan was to count a prediction as correct when the actual
        # word is among the top five; no such count is computed here.
        top5_most_similar = glove_model.most_similar_token(output, topn=5)
        print('current word: {}'.format(text_label))
        print('   top five prediction: {}'.format(top5_most_similar))


current word: forne
   top five prediction: [('our', 0.8568243573257759), ('friends', 0.8440370482460278), ('family', 0.8414293473396705), ('there', 0.8398773558643973), ('other', 0.8312588289647292)]
current word: forne
   top five prediction: [('there', 0.8817443123554898), ('many', 0.8792584528454666), ('things', 0.8757930090643414), ('how', 0.86911960144686), ('any', 0.8688549018653574)]
current word: ennoying
   top five prediction: [('there', 0.9492946888835155), ('think', 0.9449170534802956), ('but', 0.9401724706773426), ("n't", 0.9376505046196539), ('it', 0.9327837944200056)]
current word: sinccerly
   top five prediction: [('first', 0.8447360751970565), ('other', 0.8408892264593248), ('about', 0.8395984650449209), ('any', 0.8379450984628121), ('thing', 0.8361936869863933)]
current word: pressume
   top five prediction: [('know', 0.9463690292789775), ('there', 0.9416180443622981), ("n't", 0.9381204236278791), ('but', 0.9356723554041914), ('how', 0.9292456471990407)]
current wor

current word: fesival
   top five prediction: [('expo', 0.7865919346107768), ('fest', 0.7774885518620084), ('showcase', 0.7601863731697974), ('event', 0.75012738717278), ('tour', 0.7452026108773686)]
current word: detailes
   top five prediction: [('people', 0.8790677061991875), ('things', 0.8763441092536352), ('many', 0.8746198773268924), ('any', 0.8539775093701334), ('there', 0.8453859817760887)]
current word: pursuading
   top five prediction: [('concerning', 0.777562884990666), ('aware', 0.7609631620178388), ('considering', 0.7587539566341045), ('consider', 0.7512486055086007), ('concerned', 0.751137153711064)]
current word: organiced
   top five prediction: [('have', 0.940886903574838), ('there', 0.9395822731280595), ('know', 0.939235694211777), ('think', 0.9367113009408339), ('that', 0.9317741311078582)]
current word: 21
   top five prediction: [('days', 0.8951540644663668), ('there', 0.8917368451891737), ('this', 0.8902383846434773), ('today', 0.8886763163721033), ('way', 0.8819

current word: mr/mss
   top five prediction: [('mr.', 0.9332634928018948), ('ms.', 0.8936527717639708), ('mrs', 0.8919668644172073), ('mr', 0.8309164534707854), ('aka', 0.7738372669234178)]
current word: 20:15
   top five prediction: [('there', 0.9026019084210105), ('this', 0.892001057246865), ('today', 0.8893647866104889), ('next', 0.8872676874517902), ('out', 0.8845742423794809)]
current word: 19:30
   top five prediction: [('there', 0.9026017888067287), ('this', 0.8920009409395652), ('today', 0.8893646707760403), ('next', 0.8872675720429942), ('out', 0.8845741267146489)]
current word: techology
   top five prediction: [('tech', 0.8512587995254516), ('innovation', 0.8005316780337546), ('systems', 0.796521559531711), ('development', 0.7856569413529388), ('enterprise', 0.7715178283049704)]
current word: centure
   top five prediction: [('there', 0.8929675510680202), ('the', 0.890793535765776), ('first', 0.8881654907946891), ('next', 0.8821681822112243), ('out', 0.8756859792658557)]
cur