In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
# Adapted from this gist: https://gist.github.com/mbednarski/da08eb297304f7a66a3840e857e060a0

In [2]:
# The first few sentences of Frankenstein
corpus = """You will rejoice to hear that no disaster has accompanied the commencement of an enterprise 
which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to 
assure my dear sister of my welfare and increasing confidence in the success of my undertaking."""

In [3]:
split_corpus = [word.strip() for word in corpus.split(' ')]
vocabulary = list(set(split_corpus)) # Token should only appear once
vocabulary[:5]

['rejoice', 'undertaking.', 'confidence', 'here', 'you']

In [4]:
vocabulary_indices = list(range(len(vocabulary))) # Each word gets an ordinal
vocabulary_size = len(vocabulary_indices)
word_to_index = dict(zip(vocabulary, vocabulary_indices))  # Map the word to the ordinal
word_to_index['no']

16

In [5]:
WINDOW_SIZE = 2 # How many words ahead and behind to look
corpus_size = len(split_corpus)


index_pairs = []
for position, word in enumerate(split_corpus):
    window_minimum = max(position - WINDOW_SIZE, 0)
    window_maximum = min(position + WINDOW_SIZE + 1, corpus_size)
    
    for window_position in range(window_minimum, window_maximum):
        if position != window_position: # The word itself can't be a context word 
            context_index = word_to_index[split_corpus[window_position]]
            index_pairs.append((context_index, word_to_index[word]))

index_pairs = np.array(index_pairs)

In [6]:
EMBEDDING_DIMENSIONS = 5 # What's the size of each vector?
LEARNING_RATE = 0.001 # How fast do you want to update these vectors?
NUMBER_OF_EPOCHES = 101 # How many times should the model see the corpus?

In [7]:
def one_hot_encoding(position):
    one_hot_vector = torch.zeros(vocabulary_size).float()
    one_hot_vector[position] = 1.0
    return one_hot_vector

In [8]:
InputEmbeddingLayer = Variable(
    torch.randn( # Random numbers from the standard distribution
        EMBEDDING_DIMENSIONS, vocabulary_size
    ).float(), 
    requires_grad=True # This matrix will update
)
OutputEmbeddingLayer = Variable(torch.randn(vocabulary_size, EMBEDDING_DIMENSIONS).float(), requires_grad=True)

In [9]:
for epoch in range(NUMBER_OF_EPOCHES):
    epoch_loss = 0
    for context, target in index_pairs:
        input_vector = Variable(one_hot_encoding(context)).float()
        ground_truth = Variable(torch.from_numpy(np.array([target])).long()) # The ordinal encoding of the word
        
        input_embedding = torch.matmul(InputEmbeddingLayer, input_vector)
        output_embedding = torch.matmul(OutputEmbeddingLayer, input_embedding)
        
        log_softmax = F.log_softmax( # Taking the log of the softmax for optimization reasons
            output_embedding, 
            dim=0, # Calculate column-wise
        )
        loss = F.nll_loss( # The negative-log likelihood: also for optimization reasons
            log_softmax.unsqueeze(0), # Transform from a vector to a matrix
            ground_truth,
        )
        epoch_loss += loss.item() # Check track of the total loss
        
        loss.backward() # Back-propagate (take the derivative of everything)
        
        InputEmbeddingLayer.data -= LEARNING_RATE * InputEmbeddingLayer.grad.data # Update the embedding layers
        OutputEmbeddingLayer.data -= LEARNING_RATE * OutputEmbeddingLayer.grad.data
        
        InputEmbeddingLayer.grad.data.zero_() # Reset the gradients for the next context-word pair
        OutputEmbeddingLayer.grad.data.zero_()
    if epoch % 10 == 0:
        print(epoch_loss / len(index_pairs))
        

5.403818273544312
5.057663822174073
4.79267865607613
4.580320412234256
4.4055736340974505
4.259079512796904
4.134421344807273
4.026941872897901
3.9331591380269906
3.850423762672826
3.7767033024838095
