# Implementing $word2vec$ using PyTorch


Resources used:
1. [Word2Vec Pytorch](https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb)
2. [Word2Vec using TensorFlow](https://towardsdatascience.com/learn-word2vec-by-implementing-it-in-tensorflow-45641adaf2ac)
3. [Training a Classifier in Pytorch](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html)

In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import copy

In [2]:
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x10cbcaf30>

In [3]:
# Example corpus provided by resource 1
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

In [4]:
def preproc_line(line):
    """
    This is how each entry of the corpus will be preprocessed.
    - Remove whitespaces and newlines
    - Lowercase everything
    - Apostrophes replaced with spaces
    - All other punctuation replaced with spaces
    """
    line = line.strip().lower()
    
#     Replace apostrophes with empty spaces to handle words like can't and don't
    line = line.replace("'", "")
    
    punctuations = [",", "-", "_", ".", "!", '"',
                   "[", "]", ";", ":", "(", ")", 
                    "+", "{", "}", "?", "\/", "\\"
                   ]

    for punc in punctuations:
#         Replace all punctuations with spaces.
        line = line.replace(punc, " ")
    
#     Split line by spaces
    words = line.split()
#     Special case: Remove empty strings
    words = [w for w in words if w != ""]
    
    return words


def preproc_corpus(corpus):
    """
    Calls preproc_line on each entry in the corpus
    """
    tokenized_lines = [preproc_line(line) for line in corpus]
    return tokenized_lines


In [5]:
tokenized_corpus = preproc_corpus(corpus)

In [6]:
# Sanity check
tokenized_corpus

[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'germany', 'capital'],
 ['paris', 'is', 'france', 'capital']]

In [7]:
def create_vocab(tokenized_corpus):
    """
    Create a vocabulary from the tokenized corpus.
    
    Create mappings: word_to_idx & idx_to_word
    """
    # Create a vocab of all the unique words
    vocab = set()

    for sentence in tokenized_corpus:
    #     Update the vocab with words from sentence
        vocab.update(sentence)

    word_to_idx = {w: i for i, w in enumerate(vocab)}
    idx_to_word = {i: w for w, i in word_to_idx.items()}
    
    return (vocab, word_to_idx, idx_to_word)

In [8]:
(vocab, word_to_idx, idx_to_word) = create_vocab(tokenized_corpus)

In [9]:
vsize = len(vocab)
vsize

15

In [10]:
# Sanity check
idx_to_word

{0: 'a',
 1: 'berlin',
 2: 'she',
 3: 'germany',
 4: 'woman',
 5: 'paris',
 6: 'warsaw',
 7: 'is',
 8: 'he',
 9: 'queen',
 10: 'man',
 11: 'king',
 12: 'poland',
 13: 'france',
 14: 'capital'}

In [11]:
def skipgram_model(tokenized_corpus, vocab, vsize, word_to_idx, idx_to_word, window_size=2):
    """
    Returns a list of pairs of the form (center, context),
    where the context is within the window size for the given word.
    
    Each item in the tuple is the index for that word.
    """
    pairs = []
    
    for sentence in tokenized_corpus:
#         Replace words with their respective indices.
        sentence = [word_to_idx[w] for w in sentence]
        sen_len = len(sentence)
    
#         Treat each word in sentence as the center.
        for center_pos, center in enumerate(sentence):
#             Slide over the window.
            for slide in range(-window_size, window_size + 1):
                if slide == 0:
                    continue

#                 This is the position of the context word to consider
                context_pos = center_pos + slide

#                 If the context position is out of bounds, ignore
                if context_pos < 0 or context_pos >= sen_len:
                    continue
            
                context = sentence[context_pos]
                pairs.append((center, context))
                
#     Return the pairs as an np array
    return np.array(pairs)

In [12]:
def onehot_encode(vsize, word_idx):
    """
    Given a word_idx, this function will return a Tensor of length
    vsize with a 1.0 placed in the corresponding word_idx
    """
    onehot = torch.zeros(vsize).float()
    onehot[word_idx] = 1.0
    
    return onehot

In [13]:
# Prepare our training data
pairs = skipgram_model(tokenized_corpus, vocab, vsize, word_to_idx, idx_to_word, window_size=2)
onehot_pairs = [(onehot_encode(vsize, x), onehot_encode(vsize, y)) for x, y in pairs]

In [14]:
print(onehot_pairs[0])

(tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]), tensor([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]))


In [15]:
# Define our Word2VecNetwork Class Here
class SkipgramModeler(nn.Module):
    def __init__(self, vocab_size=15, embedding_dim=5):
        super(SkipgramModeler, self).__init__()
        self.w1 = nn.Linear(vocab_size, embedding_dim)
        self.w2 = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, x):
        out1 = self.w1(x)
        out2 = self.w2(out1)
#         return out2
        
        log_softmax = F.softmax(out2, dim=0)
        return log_softmax

In [16]:
def train_model(onehot_pairs, vsize, embedding_dim=5, epochs=5, learning_rate=1e-3, opt_alg=optim.SGD, criterion=nn.MSELoss()):
    torch.manual_seed(seed)
    model = SkipgramModeler(vocab_size=vsize, embedding_dim=embedding_dim)
    optimizer = opt_alg(model.parameters(), lr=learning_rate)
    
    for epoch in range(epochs):
        loss = 0.0
        for i, onehot_pair in enumerate(onehot_pairs, 0):
            x_i, y_i = onehot_pair

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(x_i)
            loss = criterion(outputs, y_i)
            loss.backward()
            optimizer.step()

            # Print statistics
            loss += loss.item()
#         Print the loss for this epoch
        if (epoch + 1) % 5 == 0:
            print('Epoch: {} loss: {:.3f}'.format(epoch + 1, loss))
    
    return model
    

In [17]:
# For Evaluatuion: use cosine similarity metric
cos = nn.CosineSimilarity()

def argmax_cos(estimate):
    cos = nn.CosineSimilarity()
    costs = [(word, cos(estimate, enc)) for word, enc in encodings.items()]
    item = max(costs, key=lambda x: x[1])
    return item

In [18]:
# Training
model = train_model(onehot_pairs, vsize, embedding_dim=5, epochs=50, learning_rate=1e-3, opt_alg=optim.SGD)

# Evaluation

# Take the encoder from the model out
encoder = copy.deepcopy(model.w1)

# Create encodings of the words to their respective vectors
encodings = {w: encoder(onehot_encode(vsize, idx)) for w, idx in word_to_idx.items()}
# Reshape those encodings to be of dimension (embedding_dim, -1)
encodings = {w: e.reshape(1, -1) for w, e in encodings.items()}


print("=========================")
queen = encodings["queen"]
print("Similarity <queen> to <king - man + woman>")
print(cos(queen, encodings["king"] - encodings["man"] + encodings["woman"]))
print(argmax_cos(encodings["king"] - encodings["man"] + encodings["woman"]))

print("=========================")
paris = encodings["paris"]
print("Similarity <paris> to <berlin - germany + france>")
print(cos(paris, encodings["berlin"] - encodings["germany"] + encodings["france"]))
print(argmax_cos(encodings["berlin"] - encodings["germany"] + encodings["france"]))

Epoch: 5 loss: 0.126
Epoch: 10 loss: 0.126
Epoch: 15 loss: 0.126
Epoch: 20 loss: 0.126
Epoch: 25 loss: 0.126
Epoch: 30 loss: 0.126
Epoch: 35 loss: 0.126
Epoch: 40 loss: 0.126
Epoch: 45 loss: 0.126
Epoch: 50 loss: 0.126
Similarity <queen> to <king - man + woman>
tensor([0.6636], grad_fn=<DivBackward1>)
('woman', tensor([0.8293], grad_fn=<DivBackward1>))
Similarity <paris> to <berlin - germany + france>
tensor([-0.3340], grad_fn=<DivBackward1>)
('he', tensor([0.6638], grad_fn=<DivBackward1>))


In [19]:
# Try ADAM
# Training
model = train_model(onehot_pairs, vsize, embedding_dim=5, epochs=50, learning_rate=1e-3, opt_alg=optim.Adam)

# Evaluation

# Take the encoder from the model out
encoder = copy.deepcopy(model.w1)

# Create encodings of the words to their respective vectors
encodings = {w: encoder(onehot_encode(vsize, idx)) for w, idx in word_to_idx.items()}
# Reshape those encodings to be of dimension (embedding_dim, -1)
encodings = {w: e.reshape(1, -1) for w, e in encodings.items()}


print("=========================")
queen = encodings["queen"]
print("Similarity <queen> to <king - man + woman>")
print(cos(queen, encodings["king"] - encodings["man"] + encodings["woman"]))
print(argmax_cos(encodings["king"] - encodings["man"] + encodings["woman"]))

print("=========================")
paris = encodings["paris"]
print("Similarity <paris> to <berlin - germany + france>")
print(cos(paris, encodings["berlin"] - encodings["germany"] + encodings["france"]))
print(argmax_cos(encodings["berlin"] - encodings["germany"] + encodings["france"]))

Epoch: 5 loss: 0.127
Epoch: 10 loss: 0.131
Epoch: 15 loss: 0.137
Epoch: 20 loss: 0.142
Epoch: 25 loss: 0.146
Epoch: 30 loss: 0.148
Epoch: 35 loss: 0.150
Epoch: 40 loss: 0.152
Epoch: 45 loss: 0.153
Epoch: 50 loss: 0.155
Similarity <queen> to <king - man + woman>
tensor([0.9810], grad_fn=<DivBackward1>)
('woman', tensor([0.9935], grad_fn=<DivBackward1>))
Similarity <paris> to <berlin - germany + france>
tensor([0.8325], grad_fn=<DivBackward1>)
('a', tensor([0.9247], grad_fn=<DivBackward1>))


## Batching
Let's try training using batching, which is how the guides we're following has used it.

In [20]:
len(pairs)

70

In [21]:
from torch.utils.data import DataLoader

# Batches up our pairs of data
trainloader = DataLoader(onehot_pairs, batch_size=10, shuffle=True)

In [22]:
def train_model_batches(trainloader, vsize, embedding_dim=5, epochs=50, learning_rate=1e-3, opt_alg=optim.SGD, criterion=nn.MSELoss()):
    torch.manual_seed(seed)
    model = SkipgramModeler(vocab_size=vsize, embedding_dim=embedding_dim)
    optimizer = opt_alg(model.parameters(), lr=learning_rate)
    
    for epoch in range(epochs):
        loss = 0.0
        for batch_i, batch in enumerate(trainloader):
            
            X, y = batch

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            # Print statistics
            loss += loss.item()
#         Print the loss for this epoch
        if (epoch + 1) % 5 == 0:
            print('Epoch: {} loss: {:.3f}'.format(epoch + 1, loss))
    
    return model
    

In [23]:
# Training
model = train_model_batches(trainloader, vsize, embedding_dim=5, epochs=50, learning_rate=1e-3, opt_alg=optim.SGD)

# Evaluation

# Take the encoder from the model out
encoder = copy.deepcopy(model.w1)

# Create encodings of the words to their respective vectors
encodings = {w: encoder(onehot_encode(vsize, idx)) for w, idx in word_to_idx.items()}
# Reshape those encodings to be of dimension (embedding_dim, -1)
encodings = {w: e.reshape(1, -1) for w, e in encodings.items()}


print("=========================")
queen = encodings["queen"]
print("Similarity <queen> to <king - man + woman>")
print(cos(queen, encodings["king"] - encodings["man"] + encodings["woman"]))
print(argmax_cos(encodings["king"] - encodings["man"] + encodings["woman"]))

print("=========================")
paris = encodings["paris"]
print("Similarity <paris> to <berlin - germany + france>")
print(cos(paris, encodings["berlin"] - encodings["germany"] + encodings["france"]))
print(argmax_cos(encodings["berlin"] - encodings["germany"] + encodings["france"]))

Epoch: 5 loss: 0.126
Epoch: 10 loss: 0.127
Epoch: 15 loss: 0.127
Epoch: 20 loss: 0.127
Epoch: 25 loss: 0.127
Epoch: 30 loss: 0.126
Epoch: 35 loss: 0.127
Epoch: 40 loss: 0.127
Epoch: 45 loss: 0.127
Epoch: 50 loss: 0.127
Similarity <queen> to <king - man + woman>
tensor([0.6627], grad_fn=<DivBackward1>)
('woman', tensor([0.8278], grad_fn=<DivBackward1>))
Similarity <paris> to <berlin - germany + france>
tensor([-0.3328], grad_fn=<DivBackward1>)
('he', tensor([0.6650], grad_fn=<DivBackward1>))


In [24]:
# Training
model = train_model_batches(trainloader, vsize, embedding_dim=5, epochs=50, learning_rate=1e-3, opt_alg=optim.Adam)

# Evaluation

# Take the encoder from the model out
encoder = copy.deepcopy(model.w1)

# Create encodings of the words to their respective vectors
encodings = {w: encoder(onehot_encode(vsize, idx)) for w, idx in word_to_idx.items()}
# Reshape those encodings to be of dimension (embedding_dim, -1)
encodings = {w: e.reshape(1, -1) for w, e in encodings.items()}


print("=========================")
queen = encodings["queen"]
print("Similarity <queen> to <king - man + woman>")
print(cos(queen, encodings["king"] - encodings["man"] + encodings["woman"]))
print(argmax_cos(encodings["king"] - encodings["man"] + encodings["woman"]))

print("=========================")
paris = encodings["paris"]
print("Similarity <paris> to <berlin - germany + france>")
print(cos(paris, encodings["berlin"] - encodings["germany"] + encodings["france"]))
print(argmax_cos(encodings["berlin"] - encodings["germany"] + encodings["france"]))

Epoch: 5 loss: 0.126
Epoch: 10 loss: 0.127
Epoch: 15 loss: 0.126
Epoch: 20 loss: 0.126
Epoch: 25 loss: 0.126
Epoch: 30 loss: 0.124
Epoch: 35 loss: 0.125
Epoch: 40 loss: 0.126
Epoch: 45 loss: 0.124
Epoch: 50 loss: 0.125
Similarity <queen> to <king - man + woman>
tensor([0.8271], grad_fn=<DivBackward1>)
('woman', tensor([0.9211], grad_fn=<DivBackward1>))
Similarity <paris> to <berlin - germany + france>
tensor([0.0723], grad_fn=<DivBackward1>)
('berlin', tensor([0.6976], grad_fn=<DivBackward1>))


## Let's Try a Real Dataset

In [25]:
dataset = "datasets/english.csv"

with open(dataset, "r") as f:
    lines = f.readlines()
    
# First line is a header, ignore it
data = lines[1:]


# Keep only the first 2k lines
data = data[:2000]

In [26]:
tokenized_corpus = preproc_corpus(data)

# Sanity check
print(tokenized_corpus[0])

['10', 'both', 'paul', 'and', 'james', 'speak', 'of', 'the', 'works', 'of', 'love', 'that', 'one', 'must', 'add', 'to', 'his', 'faith', 'in', 'order', 'to', 'be', 'justified']


In [27]:
(vocab, word_to_idx, idx_to_word) = create_vocab(tokenized_corpus)

In [28]:
vsize = len(vocab)
vsize

10607

In [29]:
%%time

# Prepare our training data
pairs = skipgram_model(tokenized_corpus, vocab, vsize, word_to_idx, idx_to_word, window_size=2)
onehot_pairs = [(onehot_encode(vsize, x), onehot_encode(vsize, y)) for x, y in pairs]

CPU times: user 9.48 s, sys: 6 s, total: 15.5 s
Wall time: 15.9 s


In [30]:
# Batches up our pairs of data
bsize = len(onehot_pairs) // 2000
print("Batch size: {}".format(bsize))

trainloader = DataLoader(onehot_pairs, batch_size=bsize, shuffle=True)

Batch size: 81


In [31]:
%%time

# Training
model = train_model_batches(trainloader, vsize, embedding_dim=25, epochs=50, learning_rate=1e-3, opt_alg=optim.Adam)

# Evaluation

# Take the encoder from the model out
encoder = copy.deepcopy(model.w1)

# Create encodings of the words to their respective vectors
encodings = {w: encoder(onehot_encode(vsize, idx)) for w, idx in word_to_idx.items()}
# Reshape those encodings to be of dimension (embedding_dim, -1)
encodings = {w: e.reshape(1, -1) for w, e in encodings.items()}

Epoch: 5 loss: 0.001
Epoch: 10 loss: 0.001
Epoch: 15 loss: 0.001
Epoch: 20 loss: 0.001
Epoch: 25 loss: 0.001
Epoch: 30 loss: 0.001
Epoch: 35 loss: 0.001
Epoch: 40 loss: 0.001
Epoch: 45 loss: 0.001
Epoch: 50 loss: 0.001
CPU times: user 1h 1min 11s, sys: 11min 21s, total: 1h 12min 33s
Wall time: 5h 22min 12s


In [32]:
print("=========================")
queen = encodings["queen"]
print("Similarity <queen> to <king - man + woman>")
print(cos(queen, encodings["king"] - encodings["man"] + encodings["woman"]))
print(argmax_cos(encodings["king"] - encodings["man"] + encodings["woman"]))

print("=========================")
paris = encodings["paris"]
print("Similarity <paris> to <berlin - germany + france>")
print(cos(paris, encodings["berlin"] - encodings["germany"] + encodings["france"]))
print(argmax_cos(encodings["berlin"] - encodings["germany"] + encodings["france"]))

Similarity <queen> to <king - man + woman>
tensor([0.3572], grad_fn=<DivBackward1>)
('saw', tensor([0.8712], grad_fn=<DivBackward1>))
Similarity <paris> to <berlin - germany + france>
tensor([-0.2552], grad_fn=<DivBackward1>)
('23%', tensor([0.8983], grad_fn=<DivBackward1>))


In [35]:
print("=========================")
man = encodings["man"]
print("Similarity <man> to <woman - girl + boy>")
print(cos(man, encodings["woman"] - encodings["she"] + encodings["he"]))
print(argmax_cos(encodings["woman"] - encodings["she"] + encodings["he"]))

Similarity <man> to <woman - girl + boy>
tensor([0.6128], grad_fn=<DivBackward1>)
('model', tensor([0.9505], grad_fn=<DivBackward1>))


In [36]:
%%time

# Training
model = train_model_batches(trainloader, vsize, embedding_dim=50, epochs=5, learning_rate=1e-3, opt_alg=optim.Adam)

Epoch: 5 loss: 0.001
CPU times: user 8min 18s, sys: 1min 27s, total: 9min 45s
Wall time: 9min 2s


In [37]:
# Evaluation

# Take the encoder from the model out
encoder = copy.deepcopy(model.w1)

# Create encodings of the words to their respective vectors
encodings = {w: encoder(onehot_encode(vsize, idx)) for w, idx in word_to_idx.items()}
# Reshape those encodings to be of dimension (embedding_dim, -1)
encodings = {w: e.reshape(1, -1) for w, e in encodings.items()}

print("=========================")
queen = encodings["queen"]
print("Similarity <queen> to <king - man + woman>")
print(cos(queen, encodings["king"] - encodings["man"] + encodings["woman"]))
print(argmax_cos(encodings["king"] - encodings["man"] + encodings["woman"]))

print("=========================")
paris = encodings["paris"]
print("Similarity <paris> to <berlin - germany + france>")
print(cos(paris, encodings["berlin"] - encodings["germany"] + encodings["france"]))
print(argmax_cos(encodings["berlin"] - encodings["germany"] + encodings["france"]))

Similarity <queen> to <king - man + woman>
tensor([-0.1401], grad_fn=<DivBackward1>)
('king', tensor([0.6685], grad_fn=<DivBackward1>))
Similarity <paris> to <berlin - germany + france>
tensor([0.3327], grad_fn=<DivBackward1>)
('berlin', tensor([0.6183], grad_fn=<DivBackward1>))


In [39]:
print("=========================")
man = encodings["man"]
print("Similarity <man> to <woman - she + he>")
print(cos(man, encodings["woman"] - encodings["she"] + encodings["he"]))
print(argmax_cos(encodings["woman"] - encodings["she"] + encodings["he"]))

Similarity <man> to <woman - she + he>
tensor([0.4101], grad_fn=<DivBackward1>)
('woman', tensor([0.7969], grad_fn=<DivBackward1>))


## Interpreting the Results

__Why is this model so bad?__

1. The corpus may not be big enough (we used only 2k lines, so that's definitely a possibility)
2. The vocabulary size is enormous and there may not be enough examples of the same words being used in similar contexts for us to generate meaningful embeddings