# Computing Word Embeddings

Creating continuous bag of word embeddings using tiny shakespeare dataset.

In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import re
import random
import pandas as pd

torch.manual_seed(1)

<torch._C.Generator at 0x1f57fd87ef0>

In [65]:
# Load in tiny shakespeare dataset
# Read in train, val and test datasets
with open("tiny_shakespeare/train.csv", 'r') as f:
    csvreader = pd.read_csv(f)
    train_data = csvreader["text"].values[0]

with open("tiny_shakespeare/validation.csv", 'r') as f:
    csvreader = pd.read_csv(f)
    val_data = csvreader["text"].values[0]

with open("tiny_shakespeare/test.csv", 'r') as f:
    csvreader = pd.read_csv(f)
    test_data = csvreader["text"].values[0]

raw_text = train_data.lower()

In [66]:
# Create set of vocab
pattern = r'\w+'
words = re.findall(pattern, raw_text)
vocab = set(words)
vocab_size = len(vocab)
print(words[:20])

['first', 'citizen', 'before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', 'all', 'speak', 'speak', 'first', 'citizen', 'you', 'are', 'all', 'resolved', 'rather']


In [67]:
# Manipulate data into context windows
CONTEXT_SIZE = 2 # 2 tokens either side of target token
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}
text_data = []
tensor_data = []
for i in range(CONTEXT_SIZE, len(words) - CONTEXT_SIZE):
    context = (
        [words[i-j-1] for j in range(CONTEXT_SIZE)]
        + [words[i+j+1] for j in range(CONTEXT_SIZE)]
    )
    target = words[i]
    # Create corresponding context and targets with strings and PyTorch tensors
    text_data.append((context, target))
    context_tensor = torch.tensor([word_to_idx[w] for w in context], dtype=torch.long)
    target_tensor = torch.tensor(word_to_idx[target], dtype=torch.long)
    tensor_data.append((context_tensor, target_tensor))
print(text_data[:3])
print(tensor_data[:3])    

[(['citizen', 'first', 'we', 'proceed'], 'before'), (['before', 'citizen', 'proceed', 'any'], 'we'), (['we', 'before', 'any', 'further'], 'proceed')]
[(tensor([10804, 10040,   555,  6813]), tensor(8411)), (tensor([ 8411, 10804,  6813,  5335]), tensor(555)), (tensor([ 555, 8411, 5335, 7473]), tensor(6813))]


In [68]:
# Define a basic Continuous Bag of Words Model
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        embedded_context = self.embedding(context).mean(dim=1)
        output = self.fc(embedded_context)
        return output
    
# Define batch size etc.
batch_size = 64
num_batches = len(tensor_data) // batch_size

# Define model and training parameters
model = CBOWModel(vocab_size, embedding_dim=100)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=0.001)

for epoch in range(10):
    random.shuffle(tensor_data)
    running_loss = 0
    for i in range(num_batches):
        batch_data = tensor_data[i * batch_size : (i + 1) * batch_size]
        context, target = zip(*batch_data)
        context = torch.stack(context)
        target = torch.stack(target)

        optimizer.zero_grad()
        output = model(context)
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss}")
    

Epoch 1, Loss: 27337.00258541107
Epoch 2, Loss: 27243.76197910309
Epoch 3, Loss: 27150.61828804016
Epoch 4, Loss: 27057.45508289337
Epoch 5, Loss: 26964.321791648865
Epoch 6, Loss: 26871.26375102997
Epoch 7, Loss: 26778.26854610443
Epoch 8, Loss: 26685.176803588867
Epoch 9, Loss: 26592.124106407166
Epoch 10, Loss: 26499.101897239685


In [103]:
# Taking a look at some predictions
context, target = tensor_data[7890]
output = model(context.unsqueeze(0))
prediction = output.argmax().item()
print(prediction, target.item())
print(idx_to_word[prediction], idx_to_word[target.item()])

context_words = context.tolist()
context_words = [idx_to_word[w] for w in context_words]
print(context_words[1] + ' ' + context_words[0] + ' '  + "xxx" + ' '  + context_words[2] + ' ' + context_words[3])

3440 9556
and senator
dinner first xxx you are


In [104]:
test_word = "people"
test_word_embedding = model.embedding.weight[word_to_idx[test_word]]
similarities = F.cosine_similarity(test_word_embedding.unsqueeze(0), model.embedding.weight, dim=1)

top_similarities, top_indices = torch.topk(similarities, 4)
similar_indices = top_indices.tolist()
similar_words = [idx_to_word[w] for w in similar_indices]
' '.join(similar_words)

'people leaving accompt basin'