In [1]:
# download dataset
import re
import nltk
import itertools
nltk.download('brown')
from nltk.corpus import brown

corpus = []
for cat in ['news']:
    for text_id in brown.fileids(cat):
        raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
        text = ' '.join(raw_text)
        text = text.lower()
        text = text.replace('\n', ' ')
        text = re.sub('[^a-z ]+', '', text)
        corpus.append([w for w in text.split() if w != ''])

[nltk_data] Downloading package brown to /Users/marina/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
from collections import Counter
import random, math

def subsample_frequent_words(corpus):
    filtered_corpus = []
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    sum_word_counts = sum(word_counts.values())
    word_probs = {word: count / float(sum_word_counts) for word, count in word_counts.items()}
    for text in corpus:
        filtered_text = []
        for word in text:
            if random.random() < (1 + math.sqrt(word_probs[word] * 1e3)) * 1e-3 / word_probs[word]:
                filtered_text.append(word)
        filtered_corpus.append(filtered_text)
    return filtered_corpus
original = len(list(itertools.chain.from_iterable(corpus)))
corpus = subsample_frequent_words(corpus)
subsampled = len(list(itertools.chain.from_iterable(corpus)))
vocabulary = set(itertools.chain.from_iterable(corpus))
word_to_index = {w: idx for (idx, w) in enumerate(vocabulary)}
index_to_word = {idx: w for (idx, w) in enumerate(vocabulary)}
print(f'The difference in the amount of words after subsampling is {original - subsampled}')

The difference in the amount of words after subsampling is 19137


In [3]:
import numpy as np
from numpy.random import multinomial

def sample_negative(sample_size):
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    sample_probability = {word: (count**0.75) / normalizing_factor for word, count in word_counts.items()}
    words = np.array(list(word_counts.keys()))
    while True:
        word_list = []
        sampled_index = np.array(multinomial(sample_size, list(sample_probability.values())))
        for index, count in enumerate(sampled_index):
            for _ in range(count):
                word_list.append(words[index])
        yield word_list

In [4]:
# to run it faster
corpus = corpus[:20]

In [5]:
context_tuple_list = []
window_size = 4
negative_samples = sample_negative(8)

for text in corpus:
    for i, word in enumerate(text):
        first_context_index = max(0, i - window_size)
        last_context_index = min(i + window_size + 1, len(text))
        for j in range(first_context_index, last_context_index):
            if i != j:
                # Each tuple: (target, context, negative_samples)
                context_tuple_list.append((word, text[j], next(negative_samples)))
print("There are {} (target, context, negative) pairs.".format(len(context_tuple_list)))

There are 246632 (target, context, negative) pairs.


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Word2Vec(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super(Word2Vec, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)  # (vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)  # (vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        # target_word: (1,) 
        # context_word: (1,)
        # negative_example: (1, num_negative_samples(8))
        emb_target = self.embeddings_target(target_word)  # (1, embedding_size)
        emb_context = self.embeddings_context(context_word)  # (1, embedding_size)
        
        positive_dot = torch.mul(emb_context, emb_target)  # (1, embedding_size)
        positive_dot = torch.sum(positive_dot, dim=1)  # (1,)
        loss = torch.sum(F.logsigmoid(positive_dot))  # scalar
        
        emb_negative = self.embeddings_context(negative_example)  # (1, num_negative_samples(8), embedding_size)
        negative_dot = torch.bmm(emb_negative, emb_target.unsqueeze(2))  # (1, num_negative_samples, 1)
        negative_dot = torch.sum(negative_dot, dim=1).squeeze()  # (1,)
        
        loss += torch.sum(F.logsigmoid(-negative_dot))  # scalar

        return -loss  # scalar


In [8]:
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.5):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100 * gain, 2)))
        return gain < self.min_percent_gain


In [9]:
import random
import torch.autograd as autograd

def get_batches(context_tuple_list, batch_size=100):
    random.shuffle(context_tuple_list)
    batches = []
    batch_target, batch_context, batch_negative = [], [], []
    for i in range(len(context_tuple_list)):
        target_word, context_word, negative_words = context_tuple_list[i]
        batch_target.append(word_to_index[target_word])
        batch_context.append(word_to_index[context_word])
        # convert list of negative words to their indices
        batch_negative.append([word_to_index[w] for w in negative_words])
        if (i + 1) % batch_size == 0 or i == len(context_tuple_list) - 1:
            tensor_target = autograd.Variable(torch.from_numpy(np.array(batch_target)).long())
            tensor_context = autograd.Variable(torch.from_numpy(np.array(batch_context)).long())
            tensor_negative = autograd.Variable(torch.from_numpy(np.array(batch_negative)).long())
            batches.append((tensor_target, tensor_context, tensor_negative))
            batch_target, batch_context, batch_negative = [], [], []
    return batches

In [10]:
import torch.optim as optim
import numpy as np

vocabulary_size = len(vocabulary)
net = Word2Vec(embedding_size=200, vocab_size=vocabulary_size)
optimizer = optim.Adam(net.parameters())
early_stopping = EarlyStopping(patience=5, min_percent_gain=1)

while True:
    losses = []
    context_tuple_batches = get_batches(context_tuple_list, batch_size=2000)
    for target_tensor, context_tensor, negative_tensor in context_tuple_batches:
        net.zero_grad()
        loss = net(target_tensor, context_tensor, negative_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    avg_loss = np.mean(losses)
    print("Loss:", avg_loss)
    early_stopping.update_loss(avg_loss)
    if early_stopping.stop_training():
        break

Loss: 42397.76349861391
Loss: 38790.30517578125
Loss gain: 8.51%
Loss: 35493.140829763106
Loss gain: 16.29%
Loss: 32383.90882528982
Loss gain: 23.62%
Loss: 29459.450848979333
Loss gain: 30.52%
Loss: 26726.051332535284
Loss gain: 31.1%
Loss: 24193.03875535534
Loss gain: 31.84%
Loss: 21871.57712481099
Loss gain: 32.46%
Loss: 19766.390569871473
Loss gain: 32.9%
Loss: 17872.488990045364
Loss gain: 33.13%
Loss: 16179.082220262097
Loss gain: 33.13%
Loss: 14671.6027359501
Loss gain: 32.92%
Loss: 13330.2945280998
Loss gain: 32.56%
Loss: 12137.003374653477
Loss gain: 32.09%
Loss: 11073.591493668095
Loss gain: 31.56%
Loss: 10124.873763545867
Loss gain: 30.99%
Loss: 9276.044289865802
Loss gain: 30.41%
Loss: 8513.596624558972
Loss gain: 29.85%
Loss: 7826.966893349924
Loss gain: 29.32%
Loss: 7206.751321115801
Loss gain: 28.82%
Loss: 6645.241756316154
Loss gain: 28.36%
Loss: 6136.243212299963
Loss gain: 27.92%
Loss: 5674.303422497165
Loss gain: 27.5%
Loss: 5253.9009999921245
Loss gain: 27.1%
Loss: 4

I decided to calculate words similariy using  the cosine similarity between two vectors because the method given in tutorial, for nearly any word returned 5 words like "the", "for", "and" etc, even though subsampling was applied. However, using cosine similarity still didn't give precise results, which I believe may be due to the small training sample.

In [11]:
import torch
import torch.nn.functional as F

def get_closest_word_cosine(word, topn=5):
    word_distance = []
    emb = net.embeddings_context
    i = word_to_index[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long)
    v_i = emb(lookup_tensor_i)
    
    for j in range(len(vocabulary)):
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long)
            v_j = emb(lookup_tensor_j)
            cosine_sim = F.cosine_similarity(v_i, v_j, dim=1)
            word_distance.append((index_to_word[j], float(cosine_sim)))
    
    word_distance.sort(key=lambda x: x[1], reverse=True)
    return word_distance[:topn]

In [12]:
get_closest_word_cosine("government", topn=5)

[('countin', 0.24084702134132385),
 ('remember', 0.23879709839820862),
 ('busied', 0.23643550276756287),
 ('ltd', 0.2311418205499649),
 ('purged', 0.23105531930923462)]