# Computing Word Embeddings

Creating continuous bag-of-words (CBOW) word embeddings from the Tiny Shakespeare dataset.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import re
import pandas as pd

# Seed PyTorch's random number generator so randomly initialised values are
# repeatable from run to run.
torch.manual_seed(1)

<torch._C.Generator at 0x26f5fb92490>

In [2]:
# Load in tiny shakespeare dataset
def _read_split_text(path):
    """Return the first entry of the "text" column of the CSV file at *path*."""
    # Decode explicitly as UTF-8 so the text is read the same way on every
    # platform instead of depending on the locale's default encoding.
    # NOTE(review): assumes the CSV files are UTF-8 encoded — confirm.
    with open(path, "r", encoding="utf-8") as f:
        return pd.read_csv(f)["text"].values[0]


# Read in train, val and test datasets
train_data = _read_split_text("tiny_shakespeare/train.csv")
val_data = _read_split_text("tiny_shakespeare/validation.csv")
test_data = _read_split_text("tiny_shakespeare/test.csv")

# Lower-case the training text; the vocabulary is built from this copy.
data = train_data.lower()

In [16]:
# Create set of vocab
pattern = r'\w+'
# Tokenise the text: each maximal run of word characters becomes one token,
# kept in the order it appears in the text.
text = [token_match.group() for token_match in re.finditer(pattern, data)]
# The vocabulary is the set of distinct tokens.
vocab = {*text}
vocab_size = len(vocab)
# Show the first few tokens as a sanity check.
print(text[:20])

['first', 'citizen', 'before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', 'all', 'speak', 'speak', 'first', 'citizen', 'you', 'are', 'all', 'resolved', 'rather']


In [17]:
# Manipulate data into context windows
CONTEXT_SIZE = 2 # 2 tokens either side of target token
# Number the words in sorted order. Iterating the raw set would give an order
# that can change between interpreter runs (string hashing is randomised), so
# the word -> index mapping would not be reproducible.
word_to_idx = {word: i for i, word in enumerate(sorted(vocab))}
context_data = []
# Skip the first and last CONTEXT_SIZE tokens: they lack a full window.
for i in range(CONTEXT_SIZE, len(text) - CONTEXT_SIZE):
    # Left neighbours come first (nearest first), then right neighbours
    # (nearest first).
    context = (
        [text[i-j-1] for j in range(CONTEXT_SIZE)]
        + [text[i+j+1] for j in range(CONTEXT_SIZE)]
    )
    target = text[i]
    context_data.append((context, target))
print(context_data[:3])

[(['citizen', 'first', 'we', 'proceed'], 'before'), (['before', 'citizen', 'proceed', 'any'], 'we'), (['we', 'before', 'any', 'further'], 'proceed')]


In [None]:
class SkipGramModel(nn.Module):
    """Embedding-lookup model: maps word indices to learned dense vectors."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table.

        Args:
            vocab_size: Number of rows in the embedding table (one per word).
            embedding_dim: Length of each embedding vector.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, inputs):
        """Look up the embedding vector of every index in ``inputs``.

        Args:
            inputs: Integer tensor of word indices.

        Returns:
            Tensor with the shape of ``inputs`` plus a trailing
            ``embedding_dim`` axis.
        """
        # Use the argument passed in, not a module-level ``context`` variable.
        embedded_context = self.embedding(inputs)
        return embedded_context

# Model with 20-dimensional embeddings, one row per vocabulary word.
model = SkipGramModel(vocab_size, embedding_dim=20)
loss_function = nn.CrossEntropyLoss()
# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.001)
# Keep the original (misspelled) name bound so code that still refers to it
# keeps working.
opimizer = optimizer

for epoch in range(10):
    