In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from load_glove import Embedding, Glove, LoadPretrainedGlove
from models import SkipGram, CBOW, NN

In [2]:
# LoadPretrainedGlove was implemented to load the 27B pre-trained Glove embeddings
# the default was set to use the 100d embedding model
# can pass in a filepath for other embedding dimensions to load other pre-trained embeddings
# Glove 27B has 25d, 50d, 100d, and 200d online. Note that 200d takes a long time to load (27 billions * 200 total floats)
# LoadPretrainedGlove returns a Glove object
EMBEDDING_DIM = 100
glove_model = LoadPretrainedGlove() # use default 100d model

In [3]:
# test functionality of the model (can skip), so far I only implemented five methods
# get_vector will get the embeddings of a token (Glove made all tokens lower cases)
glove_model.get_vector('Michael')

'Michael' not in the model


In [4]:
# test functionality of the model (can skip), so far I only implemented five methods
# similarity will retrieve the embeddings and compute the consine similarity between two given tokens
glove_model.similarity('excellent', 'great')

0.7804909766934288

In [5]:
# test functionality of the model (can skip), so far I only implemented five methods
# most_similar can be used to map the word embedding back to the most similar word in the model using cosine
glove_model.most_similar('excellent')

[('fantastic', 0.7891919692639917),
 ('great', 0.7804909766934288),
 ('brilliant', 0.7707372695455804),
 ('superb', 0.7450593638693023),
 ('outstanding', 0.7220487414581103),
 ('terrific', 0.7189632766496982),
 ('unique', 0.713248325184925),
 ('article', 0.7111885947728657),
 ('very', 0.7083041440709903),
 ('interesting', 0.7079968065670241)]

In [6]:
# test functionality of the model (can skip), so far I only implemented five methods
# similarity_embedding can be used to compute the similarity between two input embeddings
# it basically just compute the consine similarity between two vectors
glove_model.similarity_embedding([1, 0, 1], [1, 0, 1])

0.9999999999999998

In [7]:
# test functionality of the model (can skip), so far I only implemented five methods
# most_similar_token find the most similar words with the input word embedding
# by default it returns top 10 most similar words and the similarity with the input embedding
# can change top_n to any number
glove_model.most_similar_token(glove_model.get_vector('excellent'))

[('fantastic', 0.7891919692639917),
 ('great', 0.7804909766934288),
 ('brilliant', 0.7707372695455804),
 ('superb', 0.7450593638693023),
 ('outstanding', 0.7220487414581103),
 ('terrific', 0.7189632766496982),
 ('unique', 0.713248325184925),
 ('article', 0.7111885947728657),
 ('very', 0.7083041440709903),
 ('interesting', 0.7079968065670241)]

In [8]:
# load the tokenized sentences from file, create the dictionary for embeddings look up
token_to_embedding = {}
not_pretrained = 0

with open('tokenized.corrected.txt', 'r', encoding='utf-8') as textfile:
    line = textfile.readline()
    while line:
        line = line.strip('\n')
        line = line.split()
        for token in line:
            # for each unique token appeared in the training data, get a pretrained embedding 
            token = token.lower() # since Glove only encoded lower case embeddings
            if token not in token_to_embedding.keys():
                pretrained = glove_model.get_vector(token)
                if pretrained == None:
                    not_pretrained += 1
                token_to_embedding[token.lower()] = pretrained
        line = textfile.readline()
        
vocab_size = len(token_to_embedding)
print('{} / {} tokens not in the model'.format(not_pretrained, vocab_size))

'...' not in the model
'16.12.00' not in the model
'all-events' not in the model
'16th' not in the model
'2000' not in the model
'-lrb-' not in the model
'-rrb-' not in the model
's/he' not in the model
'24' not in the model
'1' not in the model
'2' not in the model
'3' not in the model
'just-so-called' not in the model
'11' not in the model
'characterizes' not in the model
'ôinternational' not in the model
'festivalö' not in the model
'ôseveralö' not in the model
'sandwich-man' not in the model
'ôsmallö' not in the model
'ôbest' not in the model
'ideaö' not in the model
'ôsuch' not in the model
'lifeà' not in the model
'21' not in the model
'22' not in the model
'wezy' not in the model
'15' not in the model
'6' not in the model
'100' not in the model
'fourthly' not in the model
'5' not in the model
'16' not in the model
'pony-ride' not in the model
'21-22' not in the model
'21st-22nd' not in the model
'quarrelled' not in the model
'!!' not in the model
'3,000' not in the model
'fewà' 

'23456' not in the model
'17/6/2000' not in the model
'dodgy-looking' not in the model
'big-mouth' not in the model
'pouded' not in the model
'barodo' not in the model
'actor/actress' not in the model
'mobile-phone' not in the model
'19,00' not in the model
'vaisili' not in the model
'solatov' not in the model
'17/06/2000' not in the model
'2.00' not in the model
'2.30' not in the model
'19.15' not in the model
'ann1' not in the model
'19:25' not in the model
'petrol-powered' not in the model
'helgar' not in the model
'svessen' not in the model
'1874' not in the model
'57' not in the model
'gigantically' not in the model
'1986' not in the model
'1988' not in the model
'unforgivably' not in the model
'34' not in the model
'kingfield' not in the model
'gc2' not in the model
'8as' not in the model
'1/6/00' not in the model
'otherstuff' not in the model
'kaczmarski' not in the model
'13.06.2000' not in the model
'bwpm' not in the model
'information-technology' not in the model
'netserver' 

'bao-yu' not in the model
'kang-nam' not in the model
'disturbers' not in the model
'mid-forties' not in the model
'eagle-eyed' not in the model
'well-proven' not in the model
'co-operates' not in the model
'decline-line' not in the model
'32' not in the model
'3,700-metre-high' not in the model
'arolla' not in the model
'1995' not in the model
'steepness' not in the model
'full-bloomed' not in the model
'tiptoed' not in the model
'km/hr' not in the model
'voice-controlled' not in the model
'march.' not in the model
'somary' not in the model
'3c' not in the model
'12-12-00' not in the model
'29' not in the model
'4a' not in the model
'2113' not in the model
'3-6' not in the model
'3:00' not in the model
'6.30' not in the model
'twenty-fourth' not in the model
'carleon' not in the model
'end-of-conference' not in the model
'chêne-bourg' not in the model
'well-situated' not in the model
'bodnant' not in the model
'motorization' not in the model
'35' not in the model
'nyremberg' not in th

In [9]:
# load all dependency pairs parsed using stanford core nlp 3.9.2 (2018-10-05 release), Enhanced++ dependencies were used

# use governor word as center word
# use dependent word as context word
# skip-gram model input will be embeddings of governor, label will be embeddings of [dependent, governor]

BATCH_SIZE = 256
dependency_listing = []
max_length = 15
not_selected = 0

def TokensToEmbeddings(token_list, token_to_embedding):
    output_list = []
    for token in token_list:
        try:
            embedding = token_to_embedding[token]
            if embedding == None:
                embedding = [0 for i in range(EMBEDDING_DIM)]
        except KeyError:
            embedding = [0 for i in range(EMBEDDING_DIM)]
        output_list.extend(embedding)
    return output_list

def TokenToEmbedding(token, token_to_embedding):
    try:
        embedding = token_to_embedding[token]
        if embedding == None:
            embedding = [0 for i in range(EMBEDDING_DIM)]
    except KeyError:
        embedding = [0 for i in range(EMBEDDING_DIM)]
    return embedding

with open('dependencies.corrected.txt', 'r', encoding='utf-8') as textfile:
    metadata = json.load(textfile)
    for sentence in metadata:
        sentence_dependencies = {}
        for dep_relation in sentence:
            if dep_relation['dep'] == 'ROOT':
                continue # skip the root token as it doesn't have any dependency
            # word to predict is the governor, add all dependencies into the list of governor
            governor = dep_relation['governorGloss'].lower()
            dependent = dep_relation['dependentGloss'].lower()
            if governor not in sentence_dependencies.keys():
                sentence_dependencies[governor] = []
            sentence_dependencies[governor].append(dependent)
        for key, value in sentence_dependencies.items():
            if len(value) > 15:
                not_selected += 1
                continue
            dependency_listing.append([key, value])

print('{} / {} words having more than 15 dependencies not selected'.format(not_selected, len(dependency_listing) + not_selected))

31 / 130287 words having more than 15 dependencies not selected


In [10]:
training_input = []
batch_input = torch.empty((BATCH_SIZE, EMBEDDING_DIM * max_length))
batch_label = torch.empty((BATCH_SIZE, EMBEDDING_DIM))
counter = 0

for (target, data) in dependency_listing:
    counter += 1
    label = TokenToEmbedding(target, token_to_embedding)
    batch_label[counter - 1] = torch.FloatTensor(label)
    
    formatted_input = []
    embeddings = TokensToEmbeddings(data, token_to_embedding)
    formatted_input.extend(embeddings)
    if len(data) < max_length:
        # need padding if there are not enough dependencies
        padding_length = (max_length - len(data)) * EMBEDDING_DIM
        formatted_input.extend([0 for i in range(padding_length)])
    batch_input[counter - 1] = torch.FloatTensor(formatted_input)
    
    if counter % BATCH_SIZE == 0:
        training_input.append([batch_input, batch_label])
        batch_input = torch.zeros((BATCH_SIZE, EMBEDDING_DIM * max_length))
        batch_label = torch.empty((BATCH_SIZE, EMBEDDING_DIM))
        counter = 0


In [11]:
print(len(training_input)) # num_batches
print(len(training_input[0])) # input, label
print(training_input[0][0].shape) # batch shape

508
2
torch.Size([256, 1500])


In [10]:
# running with relu activation with softmax (modifications made in models.py)
num_epochs = 50
torch.manual_seed(32)
epoch_loss = torch.zeros(num_epochs)
num_batches = len(training_input)

model_nn = NN(EMBEDDING_DIM * max_length, EMBEDDING_DIM)
learning_rate = 0.01    
optimizer = optim.Adam(model_nn.parameters(), lr=learning_rate)
best_model = 999

for epoch in range (num_epochs):
    model_nn.train()
    total_epoch_loss = 0
    for batch_idx, (data, target) in enumerate(training_input):
        current_batch_size = target.size()[0]
        optimizer.zero_grad()
        output = model_nn(data)
        mask = Variable(torch.ones(current_batch_size), requires_grad=False)
        loss = F.cosine_embedding_loss(output, target, mask) # the model uses cosine embedding loss
        total_epoch_loss += loss
        loss.backward()
        optimizer.step()
        if batch_idx % 250 == 0:
            print('training epoch {}: {} / {} batches loss: {}'.format(epoch, batch_idx, num_batches, loss))
        
    epoch_loss[epoch] = total_epoch_loss / num_batches
    print('training epoch {}: avg epoch loss: {}'.format(epoch, epoch_loss[epoch]))
    if epoch_loss[epoch] < best_model:
        best_model = epoch_loss[epoch]
        print('saving best model at epoch {}'.format(epoch))
        torch.save(model_nn.state_dict, 'nn.softmax.best.model')

training epoch 0: 0 / 508 batches loss: 0.9563223719596863
training epoch 0: 250 / 508 batches loss: 0.3582208454608917
training epoch 0: 500 / 508 batches loss: 0.3946448862552643
training epoch 0: avg epoch loss: 0.3823155462741852
saving best model at epoch 0
training epoch 1: 0 / 508 batches loss: 0.34397491812705994
training epoch 1: 250 / 508 batches loss: 0.3492582440376282
training epoch 1: 500 / 508 batches loss: 0.3903696835041046
training epoch 1: avg epoch loss: 0.3640452027320862
saving best model at epoch 1
training epoch 2: 0 / 508 batches loss: 0.34080439805984497
training epoch 2: 250 / 508 batches loss: 0.343002587556839
training epoch 2: 500 / 508 batches loss: 0.38806384801864624
training epoch 2: avg epoch loss: 0.35953980684280396
saving best model at epoch 2
training epoch 3: 0 / 508 batches loss: 0.33852696418762207
training epoch 3: 250 / 508 batches loss: 0.33767566084861755
training epoch 3: 500 / 508 batches loss: 0.38564181327819824
training epoch 3: avg ep

training epoch 31: 0 / 508 batches loss: 0.3201550543308258
training epoch 31: 250 / 508 batches loss: 0.3100365400314331
training epoch 31: 500 / 508 batches loss: 0.3649429976940155
training epoch 31: avg epoch loss: 0.33290788531303406
saving best model at epoch 31
training epoch 32: 0 / 508 batches loss: 0.3199741840362549
training epoch 32: 250 / 508 batches loss: 0.30972909927368164
training epoch 32: 500 / 508 batches loss: 0.3651171624660492
training epoch 32: avg epoch loss: 0.332588255405426
saving best model at epoch 32
training epoch 33: 0 / 508 batches loss: 0.31970301270484924
training epoch 33: 250 / 508 batches loss: 0.30954957008361816
training epoch 33: 500 / 508 batches loss: 0.36422476172447205
training epoch 33: avg epoch loss: 0.3322709798812866
saving best model at epoch 33
training epoch 34: 0 / 508 batches loss: 0.3192899227142334
training epoch 34: 250 / 508 batches loss: 0.3094882369041443
training epoch 34: 500 / 508 batches loss: 0.36363688111305237
trainin

In [7]:
# running with tanh activation with softmax (modifications made in models.py)
num_epochs = 50
torch.manual_seed(32)
epoch_loss = torch.zeros(num_epochs)
num_batches = len(training_input)

model_nn = NN(EMBEDDING_DIM * max_length, EMBEDDING_DIM)
learning_rate = 0.01    
optimizer = optim.Adam(model_nn.parameters(), lr=learning_rate)
best_model = 999

for epoch in range (num_epochs):
    model_nn.train()
    total_epoch_loss = 0
    for batch_idx, (data, target) in enumerate(training_input):
        current_batch_size = target.size()[0]
        optimizer.zero_grad()
        output = model_nn(data)
        mask = Variable(torch.ones(current_batch_size), requires_grad=False)
        loss = F.cosine_embedding_loss(output, target, mask) # the model uses cosine embedding loss
        total_epoch_loss += loss
        loss.backward()
        optimizer.step()
        if batch_idx % 250 == 0:
            print('training epoch {}: {} / {} batches loss: {}'.format(epoch, batch_idx, num_batches, loss))

    epoch_loss[epoch] = total_epoch_loss / num_batches
    print('training epoch {}: avg epoch loss: {}'.format(epoch, epoch_loss[epoch]))
    if epoch_loss[epoch] < best_model:
        best_model = epoch_loss[epoch]
        print('saving best model at epoch {}'.format(epoch))
        torch.save(model_nn.state_dict, 'nn.tanh.best.model')

training epoch 0: 0 / 508 batches loss: 0.9563081860542297




training epoch 0: 250 / 508 batches loss: 0.36887335777282715
training epoch 0: 500 / 508 batches loss: 0.40389418601989746
training epoch 0: avg epoch loss: 0.39936333894729614
saving best model at epoch 0
training epoch 1: 0 / 508 batches loss: 0.35683301091194153
training epoch 1: 250 / 508 batches loss: 0.35543352365493774
training epoch 1: 500 / 508 batches loss: 0.3989146947860718
training epoch 1: avg epoch loss: 0.3762851655483246
saving best model at epoch 1
training epoch 2: 0 / 508 batches loss: 0.3504186272621155
training epoch 2: 250 / 508 batches loss: 0.3507583439350128
training epoch 2: 500 / 508 batches loss: 0.39630061388015747
training epoch 2: avg epoch loss: 0.37159040570259094
saving best model at epoch 2
training epoch 3: 0 / 508 batches loss: 0.3473049998283386
training epoch 3: 250 / 508 batches loss: 0.3481609523296356
training epoch 3: 500 / 508 batches loss: 0.39462393522262573
training epoch 3: avg epoch loss: 0.36883634328842163
saving best model at epoch 

training epoch 31: 250 / 508 batches loss: 0.3309951424598694
training epoch 31: 500 / 508 batches loss: 0.3842032253742218
training epoch 31: avg epoch loss: 0.3531019687652588
saving best model at epoch 31
training epoch 32: 0 / 508 batches loss: 0.3363994359970093
training epoch 32: 250 / 508 batches loss: 0.33080706000328064
training epoch 32: 500 / 508 batches loss: 0.38410618901252747
training epoch 32: avg epoch loss: 0.3529800772666931
saving best model at epoch 32
training epoch 33: 0 / 508 batches loss: 0.3358014225959778
training epoch 33: 250 / 508 batches loss: 0.3302411139011383
training epoch 33: 500 / 508 batches loss: 0.3836556077003479
training epoch 33: avg epoch loss: 0.3527534604072571
saving best model at epoch 33
training epoch 34: 0 / 508 batches loss: 0.33552777767181396
training epoch 34: 250 / 508 batches loss: 0.330337792634964
training epoch 34: 500 / 508 batches loss: 0.3835183084011078
training epoch 34: avg epoch loss: 0.3526175916194916
saving best mode

In [12]:
# running with relu activation with no softmax (modifications made in models.py)
num_epochs = 50
torch.manual_seed(32)
epoch_loss = torch.zeros(num_epochs)
num_batches = len(training_input)

model_nn = NN(EMBEDDING_DIM * max_length, EMBEDDING_DIM)
learning_rate = 0.01    
optimizer = optim.Adam(model_nn.parameters(), lr=learning_rate)
best_model = 999

for epoch in range (num_epochs):
    model_nn.train()
    total_epoch_loss = 0
    for batch_idx, (data, target) in enumerate(training_input):
        current_batch_size = target.size()[0]
        optimizer.zero_grad()
        output = model_nn(data)
        mask = Variable(torch.ones(current_batch_size), requires_grad=False)
        loss = F.cosine_embedding_loss(output, target, mask) # the model uses cosine embedding loss
        total_epoch_loss += loss
        loss.backward()
        optimizer.step()
        if batch_idx % 250 == 0:
            print('training epoch {}: {} / {} batches loss: {}'.format(epoch, batch_idx, num_batches, loss))

    epoch_loss[epoch] = total_epoch_loss / num_batches
    print('training epoch {}: avg epoch loss: {}'.format(epoch, epoch_loss[epoch]))
    if epoch_loss[epoch] < best_model:
        best_model = epoch_loss[epoch]
        print('saving best model at epoch {}'.format(epoch))
        torch.save(model_nn, 'nn.nosoftmax.best.model')

training epoch 0: 0 / 508 batches loss: 1.0272125005722046
training epoch 0: 250 / 508 batches loss: 0.2708292305469513
training epoch 0: 500 / 508 batches loss: 0.30863702297210693
training epoch 0: avg epoch loss: 0.29028478264808655
saving best model at epoch 0
training epoch 1: 0 / 508 batches loss: 0.2718968987464905
training epoch 1: 250 / 508 batches loss: 0.24587306380271912
training epoch 1: 500 / 508 batches loss: 0.2986571192741394
training epoch 1: avg epoch loss: 0.26780298352241516
saving best model at epoch 1
training epoch 2: 0 / 508 batches loss: 0.26715633273124695
training epoch 2: 250 / 508 batches loss: 0.23650933802127838
training epoch 2: 500 / 508 batches loss: 0.29206669330596924
training epoch 2: avg epoch loss: 0.2598472833633423
saving best model at epoch 2
training epoch 3: 0 / 508 batches loss: 0.2639191746711731
training epoch 3: 250 / 508 batches loss: 0.2306453287601471
training epoch 3: 500 / 508 batches loss: 0.28679659962654114
training epoch 3: avg 

training epoch 31: 250 / 508 batches loss: 0.20196767151355743
training epoch 31: 500 / 508 batches loss: 0.25800472497940063
training epoch 31: avg epoch loss: 0.2197038233280182
saving best model at epoch 31
training epoch 32: 0 / 508 batches loss: 0.21762454509735107
training epoch 32: 250 / 508 batches loss: 0.20212188363075256
training epoch 32: 500 / 508 batches loss: 0.2576419711112976
training epoch 32: avg epoch loss: 0.2192840576171875
saving best model at epoch 32
training epoch 33: 0 / 508 batches loss: 0.2179744392633438
training epoch 33: 250 / 508 batches loss: 0.20120635628700256
training epoch 33: 500 / 508 batches loss: 0.2578238844871521
training epoch 33: avg epoch loss: 0.21896842122077942
saving best model at epoch 33
training epoch 34: 0 / 508 batches loss: 0.21740181744098663
training epoch 34: 250 / 508 batches loss: 0.20090530812740326
training epoch 34: 500 / 508 batches loss: 0.2572774887084961
training epoch 34: avg epoch loss: 0.21857377886772156
saving be

In [13]:
epoch_loss

tensor([0.2903, 0.2678, 0.2598, 0.2544, 0.2504, 0.2471, 0.2444, 0.2421, 0.2400,
        0.2382, 0.2364, 0.2349, 0.2334, 0.2321, 0.2309, 0.2298, 0.2288, 0.2278,
        0.2270, 0.2262, 0.2254, 0.2247, 0.2240, 0.2235, 0.2229, 0.2224, 0.2219,
        0.2214, 0.2209, 0.2205, 0.2201, 0.2197, 0.2193, 0.2190, 0.2186, 0.2182,
        0.2179, 0.2176, 0.2173, 0.2170, 0.2168, 0.2165, 0.2162, 0.2160, 0.2157,
        0.2155, 0.2153, 0.2151, 0.2149, 0.2146], grad_fn=<CopySlices>)