# Word2Vec

Let's build a replication of word2vec from scratch!

Word2Vec is build off a few fundamental ideas which are important to understand
- Distributional semantics ("a word is known by the company it keeps")
- We can take neural network weights and directly use them as word vectors

There are two versions of Word2Vec
- Skip gram: predict word from its context
- CBOW: predict context from a word

Each of them is built by one-hot encoding each word and using a sliding window over corpus of text- training a 1-hidden-layer NN to predict (using tasks from above).

We can then use the weights as vectors (since we've encoded meaning that should roughly correspond to their distributional semantics- e.g. blue should have similar vector to green)

# Data + Pre-processing
We first need to actually load some data for this task- and preprocess/tokenize it

In [1]:
import torch
import string
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')


### Step 2: Define Dataset Class


class Word2VecDataset(Dataset):
    def __init__(self, path
                 , context_size=2, transform=None):
        """
        path: Path to the text file.
        context_size: Number of words on each side of the target word to include in the context.
        transform: Any transformation to apply to the list of words.
        """
        self.transform = transform
        self.context_size = context_size

        # Open, read, and preprocess the text
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read().lower()  # Convert to lowercase
            text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
            self.words = word_tokenize(text)  # Tokenize into words

        if self.transform:
            self.words = self.transform(self.words)

        self.vocab = self.build_vocab(self.words)
        self.word_indices = [self.vocab[word] for word in self.words]

        # Prepare training instances (context, target)
        self.data = []
        print(f"Iterating from {context_size} to {len(self.word_indices) - context_size}")
        print(f"First block will be {-context_size + context_size} to {context_size + context_size + 1}")


        for i in range(context_size, len(self.word_indices) - context_size):
            context = torch.tensor([self.word_indices[i + j] for j in range(-context_size, context_size + 1) if j != 0])
            target = torch.tensor([self.word_indices[i]])
            [i]
            self.data.append((context, target))

    def build_vocab(self, words):
        """
        Build vocabulary: Mapping of word -> unique index
        """
        word_counts = Counter(words)
        # Start indexing from 1; 0 will be used for padding if necessary
        vocab = {word: i+1 for i, (word, _) in enumerate(word_counts.items())}
        return vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def get_vocab_size(self):
        return len(self.vocab)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]


dataset = Word2VecDataset(path='shake.txt', context_size=2, transform=remove_stopwords)
print(f"Dataset size: {len(dataset)}")
print(f"Vocabulary size: {dataset.get_vocab_size()}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Iterating from 2 to 481446
First block will be 0 to 5
Dataset size: 481444
Vocabulary size: 28207


In [3]:
import torch.nn as nn
import torch.nn.functional as F

class SkipGramModel(nn.Module):

  def __init__(self, vocab_size, embedding_size):
    super(SkipGramModel, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size

    self.embeddings = nn.Embedding(vocab_size, embedding_size)
    self.linear_layer= nn.Linear(embedding_size, vocab_size)

  def forward(self, word_idx):
    embedding = self.embeddings(word_idx)
    output = self.linear_layer(embedding)
    log_probs = F.log_softmax(output, dim=-1)


    return log_probs

    # self.


In [4]:

dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

vocab_size = dataset.get_vocab_size()
embedding_size = 1536
model = SkipGramModel(vocab_size, embedding_size)

In [5]:
import torch.optim as optim

loss_fn = nn.NLLLoss()

optimizer = optim.Adam(model.parameters())


In [13]:
epochs = 2
device = torch.device("cuda")
for epoch in range(epochs):
    total_loss = 0
    for (context, target) in dataloader:

        # context and target initially [batch_size, context_size], [batch_size, 1]
        batch_size, context_size = context.size()

        context, target = context.to(device), target.to(device).squeeze(-1)

        # Flatten context and repeat target to match each context word
        context = context.view(-1)  # Flatten to [batch_size * context_size]
        target = target.repeat_interleave(context_size)  # Repeat each target word `context_size` times

        optimizer.zero_grad()
        model.to(device)
        # Model expects flattened context now
        output = model(context)

        loss = loss_fn(output, target)

        loss.backward()
        optimizer.step()

        print(f"loss {loss}")
        # total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [14]:

device = torch.device("cuda")

epochs = 2
for epoch in range(epochs):
    total_loss = 0
    for (context, target) in dataloader:
        context, target = context.to(device), target.to(device)
        print(context.shape, target.shape)
        model.to(device)

        optimizer.zero_grad()

        output = model(context)
        loss = loss_fn(output, target.squeeze(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Calculate average loss over an epoch
    avg_loss = total_loss / len(dataloader)
    # loss_values.append(avg_loss)

    print(f"Epoch {epoch+1}/{epoch}, Loss: {avg_loss:.4f}")







torch.Size([128, 4]) torch.Size([128, 1])


RuntimeError: Expected target size [128, 28207], got [128]

In [51]:
value

[[tensor([ 6452,   513,   627,  9154,  3572,   567,  1116,   833,   550,  1508,
           2146,  9070,   149,  4908,  5436,  6047,  4588,  1185,  4740,   833,
           3716,  8105,  4812,   555, 14334,   555,  2045,  5843,  1316,   957,
            443,  2298,  1366, 11341,  3569,  2305,   640,   613, 14344, 21650,
            488, 11810,  1157,   764,   968,  3451, 14845, 12020,   398,  1634,
           4725,   822,  1478,   537,  1765,  3595, 16413,  1282,  7628,   846,
           6460, 10592,   115,   537]),
  tensor([12403,  7543,  2650,   442,   159,   970,   136,  1418,  1156,  3106,
           5809,  9071,   150,  3126,  7480, 13033,  3588,  2038,  1069,  2150,
          13353,   980,  1083,   626,  2184,  4478,  2058,  5576,  1273,  4509,
           3568,  3126, 17251,    55,   125,   623,  1107,  2189,   115,  2045,
           3127,  3061, 14889,  1503,  3787,   282, 16239,   654,  1840, 10764,
             44,   817,  1299,  3480,   956, 14346,    74,  2035,  3672,  3080,


In [30]:
value[0]

[tensor([  113,   142,  9799,  1341,  1566,  3568,  6087,  3926,  2652,   465,
          9482,   829,   627, 13324,    55, 10434,   726,  6060,  4975, 16067,
          4041,  1895,  3595,  7037,  3920,  2146,   567,  4213,   838,  4975,
           735,   567,  1829,   646,   537,   578,   567,  5383,   943,   403,
          4590,  1262,  1116, 14906,  5041,  2865,  7639,    47,  9628,  5725,
            38,  2028,  3879,  7838,  2373,   223,   487,  5823,  3402,  6846,
          1240,  2403,  8537,  1865]),
 tensor([12941, 13317,   775, 21162,  2146,  1316,  6777,   973,  1185,  3521,
           590,   150,  2211,    91,  1203,   575,   537,   833,  1561,  4988,
          1733,  6441, 13300, 14845,   537,   297,  6074,  1008, 18651,  1698,
          6536,  4135,   282,  3293,   950,   740,   661,  6194,  4609,  3407,
           692,  1174,  1004,   911,  6288,   683,   699,    48,  3804,  6258,
           867,  4513,  6246,   654,  5739,   290,  5436,  2146,  2146,  1924,
           17