<a href="https://colab.research.google.com/github/nicholasthomson/Group9-Final-INST377F2021/blob/main/2-language-models-lab/1-word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Word2Vec ##

In this notebook we will go through the step by step creation of the Continouous Bag Of Words (CBOW).
CBOW is an embedded model that makes use of a "fake task" -> [within short window, predict the current word] to extract a vector that shows the relationship between words.

### Continuous Bag Of Words ###

Adapted from Robert Guthrie

In [23]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.linalg

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

In [25]:
# CBOW is a window view; we are trying to infer the word in the middle.
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

raw_text= """Long Short-Term Memory (LSTM) is a recurrent neural network (RNN) architecture that has been designed
to address the vanishing and exploding gradient problems of conventional RNNs. Unlike feedforward neural networks,
RNNs have cyclic connections making them powerful for modeling sequences.
They have been successfully used for sequence labeling and sequence prediction tasks,
such as handwriting recognition, language modeling, phonetic labeling of acoustic frames. However, in contrast to the deep neural
networks, the use of RNNs in speech recognition has been limited to phone recognition in small scale tasks.
In this paper, we present novel LSTM based RNN architectures which make more effective
use of model parameters to train acoustic models for large vocabulary speech recognition.
We train and compare LSTM, RNN and DNN models at various numbers of parameters and configurations.
We show that LSTM models converge quickly and give state of the art speech recognition performance for relatively small sized models.""".split()

# By deriving a set from "raw_text", we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

# Basic Tokenizer
word_to_ix = {word: i for i, word in enumerate(vocab)}

print(len(raw_text))
print(vocab_size)

152
106


In [26]:
print(vocab)

{'is', 'more', 'cyclic', 'designed', 'make', 'give', 'state', 'Unlike', '(RNN)', 'limited', 'compare', 'as', 'modeling,', 'relatively', 'We', 'which', 'use', 'modeling', 'vanishing', 'used', 'recognition,', 'They', 'speech', 'small', 'deep', 'tasks.', 'the', 'language', 'numbers', 'models.', 'for', 'network', 'In', 'paper,', 'present', 'recurrent', 'feedforward', 'and', 'successfully', 'parameters', '(LSTM)', 'large', 'Long', 'Memory', 'conventional', 'tasks,', 'networks,', 'problems', 'LSTM', 'However,', 'scale', 'that', 'address', 'sequences.', 'LSTM,', 'have', 'exploding', 'powerful', 'phonetic', 'RNN', 'connections', 'recognition.', 'to', 'acoustic', 'phone', 'been', 'quickly', 'based', 'show', 'effective', 'architectures', 'frames.', 'contrast', 'of', 'has', 'in', 'sized', 'novel', 'handwriting', 'sequence', 'recognition', 'RNNs', 'performance', 'labeling', 'model', 'RNNs.', 'configurations.', 'various', 'Short-Term', 'we', 'art', 'DNN', 'them', 'gradient', 'such', 'a', 'this', 'a

In [27]:
# list out keys and values separately
key_list = list(word_to_ix.keys())
val_list = list(word_to_ix.values())

In [28]:
# Now lets create a "dataset"
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = []
    for j in range(CONTEXT_SIZE, 0, -1):
        context.append(raw_text[i - j])

    for j in range(1, CONTEXT_SIZE + 1):
        context.append(raw_text[i + j])

    target = raw_text[i]
    data.append((context, target))
print(data[:5])


[(['Long', 'Short-Term', '(LSTM)', 'is'], 'Memory'), (['Short-Term', 'Memory', 'is', 'a'], '(LSTM)'), (['Memory', '(LSTM)', 'a', 'recurrent'], 'is'), (['(LSTM)', 'is', 'recurrent', 'neural'], 'a'), (['is', 'a', 'neural', 'network'], 'recurrent')]


### Create the CBOW Model (as we have seen already other ANN) ###

We have to extend from nn.Module as all the other ANN

In [29]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embed_dim, context, hidden_size):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Sequential(
            nn.Linear(context*embed_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size),
            nn.LogSoftmax(dim = -1)
        )

    def forward(self, inputs):
#         print(inputs.shape)
#         print(inputs)
        out = self.embedding(inputs)
#         print(out.shape)
        out = out.view(1, -1)
#         print(out.shape)
        out = self.linear(out)
#         print(out.shape)
        return out

    # This is what we are actually interested on
    def get_word_vector(self, word):
        out = self.embedding(word)
        return out


#### Lets break it down! ####

In [30]:
VOCAB_SIZE = len(vocab)
EMBEDD_DIM = 10
BATCH_SIZE = 6
FULL_CONTEXT_SIZE = CONTEXT_SIZE * 2
HIDDEN_SIZE = 256

example_tensor = torch.randint(0, VOCAB_SIZE, [BATCH_SIZE, FULL_CONTEXT_SIZE])
print(example_tensor)

tensor([[ 24,  55,  85,  85],
        [ 32,  72,  40, 102],
        [ 67,  24,  97, 101],
        [ 92,  96,   3, 100],
        [ 94,  69,  41,  18],
        [ 66,  10, 101,  69]])


In [31]:
CBOW_embedding = nn.Embedding(VOCAB_SIZE, EMBEDD_DIM)
print(example_tensor.shape)
example_result = CBOW_embedding(example_tensor)
# Now we have a representation of the words in a vector of EMBEDD_DIM Dimensions
print(example_result.shape)
# example_result = torch.flatten(example_result, start_dim=1)
example_result = example_result.view(BATCH_SIZE, -1)
print(example_result.shape)

torch.Size([6, 4])
torch.Size([6, 4, 10])
torch.Size([6, 40])


In [32]:
print("input shape: ", EMBEDD_DIM * FULL_CONTEXT_SIZE)
print("output shape: ", HIDDEN_SIZE)
CBOW_hidden = nn.Linear(EMBEDD_DIM * FULL_CONTEXT_SIZE, HIDDEN_SIZE)
CBOW_hidden_relu = nn.ReLU()
example_result = CBOW_hidden(example_result)
example_result = CBOW_hidden_relu(example_result)
print(example_result.shape)

input shape:  40
output shape:  256
torch.Size([6, 256])


In [33]:
CBOW_output = nn.Linear(HIDDEN_SIZE, VOCAB_SIZE)
CBOW_output_soft = nn.LogSoftmax(dim = -1)
example_result = CBOW_output(example_result)
example_result = CBOW_output_soft(example_result)
print(example_result.shape)

torch.Size([6, 106])


In [34]:
print(example_result[0].argmax(-1))
print(key_list[val_list.index(example_result[0].argmax(-1))])
print(example_result[0])
print(example_result[1].argmax(-1))
print(key_list[val_list.index(example_result[1].argmax(-1))])
print(example_result[1])
# print(example_result[2].argmax(-1))
# print(example_result[3].argmax(-1))
# print(example_result[4].argmax(-1))

tensor(6)
state
tensor([-4.4380, -4.8893, -4.6922, -4.7849, -4.4376, -4.8593, -4.3260, -4.6786,
        -5.0745, -4.5803, -4.6013, -4.7131, -4.6678, -4.8053, -4.7313, -4.5460,
        -4.6009, -4.7773, -4.6341, -4.3517, -4.4937, -4.5001, -5.0043, -5.0883,
        -4.4794, -4.5455, -5.2898, -4.5757, -4.7023, -4.5595, -4.6687, -4.5512,
        -4.5903, -4.7904, -4.6428, -4.6684, -4.4965, -4.8508, -4.8107, -4.7865,
        -4.6480, -4.8759, -4.3305, -4.8739, -4.3641, -4.8503, -4.8028, -4.7392,
        -5.0739, -4.4379, -4.8810, -4.6924, -4.8655, -4.6130, -4.7124, -4.5794,
        -4.7203, -5.0039, -4.8614, -4.4612, -4.4278, -4.5133, -4.7577, -4.4643,
        -4.5912, -4.6819, -4.9158, -4.6950, -4.8721, -4.3874, -4.9480, -4.5810,
        -4.7410, -4.4939, -4.6923, -4.7090, -4.7609, -4.5720, -4.6765, -4.7862,
        -4.5787, -4.5850, -4.5591, -4.8904, -4.8443, -4.5335, -5.0053, -4.4781,
        -4.6597, -4.5866, -4.4569, -4.6788, -4.6805, -4.6006, -4.5684, -4.6509,
        -4.4574, -4.5540

## Back to the notebook ##

In [35]:
# Simple helper method to transform the context to the expected int vector - tensor

def make_context_vector(context, word_to_ix, debug=False):
    if debug:
      print(context)
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

make_context_vector(data[0][0], word_to_ix, debug=True)

['Long', 'Short-Term', '(LSTM)', 'is']


tensor([42, 88, 40,  0])

In [36]:
def train(model, epochs, data, optimizer, loss_fn):
    model.train()
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for context, target in data:

            # Prepare inputs and targets
            context_idxs = make_context_vector(context, word_to_ix)
            context_idxs = context_idxs.to(device)
            target_id = make_context_vector([target], word_to_ix)
            target_id = target_id.to(device)

            # Do not accumulate
            model.zero_grad()

            # Step 3. Run the forward pass
            log_probs = model(context_idxs)
    #         break

            # Step 4. Compute your loss function.
            loss = loss_fn(log_probs, target_id)

    #         loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

            # Step 5. Do the backward pass and update the gradient
            loss.backward()
            optimizer.step()

            # Get the Python number from a 1-element Tensor by calling tensor.item()
            total_loss += loss.item()
        losses.append(total_loss)
    return losses


In [37]:
VOCAB_SIZE = len(vocab)
EMBEDD_DIM = 10
BATCH_SIZE = 6
FULL_CONTEXT_SIZE = CONTEXT_SIZE * 2
HIDDEN_SIZE = 256

loss_function = nn.NLLLoss() # Because we are using Log_softmax
model = CBOW(vocab_size, EMBEDD_DIM, FULL_CONTEXT_SIZE, HIDDEN_SIZE)
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

losses = train(model, 100, data, optimizer, loss_function)
model.eval()

print(losses)  # The loss decreased every iteration over the training data!

[693.7842440605164, 687.0002851486206, 680.3075046539307, 673.7027094364166, 667.1799705028534, 660.7383699417114, 654.3736503124237, 648.0804352760315, 641.8558895587921, 635.6958141326904, 629.5938115119934, 623.5473554134369, 617.5566489696503, 611.6169664859772, 605.7230832576752, 599.8701512813568, 594.0566577911377, 588.2779381275177, 582.5304937362671, 576.810298204422, 571.1125133037567, 565.4357523918152, 559.7773404121399, 554.1329753398895, 548.5024389028549, 542.8810855150223, 537.2689448595047, 531.661897778511, 526.0601058006287, 520.4636586904526, 514.8661639690399, 509.26664888858795, 503.6646877527237, 498.0585308074951, 492.44687712192535, 486.82909393310547, 481.2070426940918, 475.5778831243515, 469.94325959682465, 464.30456578731537, 458.66007339954376, 453.0090626478195, 447.3523519039154, 441.69191539287567, 436.02467358112335, 430.3555405139923, 424.68491607904434, 419.01096349954605, 413.3357388973236, 407.6574454307556, 401.9782060980797, 396.3000537753105, 390

In [38]:
# list out keys and values separately
key_list = list(word_to_ix.keys())
val_list = list(word_to_ix.values())

In [39]:
def similarity_cbow(word_1, word_2):

    # test word similarity
    print(word_1)
    print(word_2)
    w1_id = torch.tensor(word_to_ix[word_1], dtype=torch.long)
    w2_id = torch.tensor(word_to_ix[word_2], dtype=torch.long)
    w1_id = w1_id.to(device)
    w2_id = w2_id.to(device)

    word_1_vec = model.get_word_vector(w1_id)
    word_2_vec = model.get_word_vector(w2_id)

    # The norm of a vector (1D-matrix) is the square root of the sum of all the squared values within the vector.
    print(math.sqrt(torch.square(word_1_vec).sum()))
    print(torch.linalg.norm(word_1_vec))
    print(torch.linalg.norm(word_2_vec))
    print(word_1_vec.dot(word_2_vec))

    word_distance = torch.linalg.norm(word_1_vec - word_2_vec)
    print("Distance between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_distance))
    word_similarity = (word_1_vec.dot(word_2_vec) / (torch.linalg.norm(word_1_vec) * torch.linalg.norm(word_2_vec)))
    print("Similarity between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_similarity))


In [40]:
similarity_cbow("neural", "network")

neural
network
1.810610542700516
tensor(1.8106, grad_fn=<LinalgVectorNormBackward0>)
tensor(3.0923, grad_fn=<LinalgVectorNormBackward0>)
tensor(0.4089, grad_fn=<DotBackward0>)
Distance between 'neural' & 'network' : 3.4674
Similarity between 'neural' & 'network' : 0.0730


In [41]:
def predict_middle_word(prev_words, post_words):
    prev_words = prev_words.split()
    post_words = post_words.split()

    input_words= make_context_vector(prev_words + post_words, word_to_ix)
    input_words = input_words.to(device)
    output = model(input_words)
    out_ind = output.argmax(1)
#     print(word_to_ix)
#     out_word = word_to_ix.itos[out_ind.item()]
    out_word = key_list[val_list.index(out_ind.item())]
    print(prev_words, out_word, post_words)

In [42]:
predict_middle_word("a recurrent", "network is")
predict_middle_word("LSTM is", "recurrent neural")

['a', 'recurrent'] neural ['network', 'is']
['LSTM', 'is'] for ['recurrent', 'neural']


### Now that you saw how to create the CBOW model (word2vec), you should work on doing the "opposite" model, Skip-Gram ###

Skip-gram as you saw on the lectures, reverses the problem so you need to predict through the "fake task" the context of the input

In [45]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(SkipGram, self).__init__()
        self.embedding_center = nn.Embedding(vocab_size, embed_dim)
        self.embedding_context = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center_word):
        center_embed = self.embedding_center(center_word)
        return center_embed

    def get_word_vector(self, word):
        return self.embedding_center(word)

# Create Skip-gram dataset (opposite of CBOW)
skipgram_data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    center_word = raw_text[i]
    context = []

    # Get context words around center word
    for j in range(CONTEXT_SIZE, 0, -1):
        context.append(raw_text[i - j])
    for j in range(1, CONTEXT_SIZE + 1):
        context.append(raw_text[i + j])

    # For Skip-gram: center word predicts each context word
    for context_word in context:
        skipgram_data.append((center_word, context_word))

print(f"Skip-gram data pairs: {len(skipgram_data)}")
print("First 5 pairs:", skipgram_data[:5])

def train_skipgram(model, epochs, data, optimizer, loss_fn):
    model.train()
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for center_word, context_word in data:
            # Prepare inputs
            center_idx = torch.tensor([word_to_ix[center_word]], dtype=torch.long).to(device)
            context_idx = torch.tensor([word_to_ix[context_word]], dtype=torch.long).to(device)

            model.zero_grad()

            # Forward pass
            center_embed = model(center_idx)
            context_embed = model.embedding_context(context_idx)

            # Compute similarity (dot product)
            score = torch.sum(center_embed * context_embed, dim=1)

            # Negative sampling (simplified - just use other random words)
            neg_samples = torch.randint(0, len(word_to_ix), (5,)).to(device)
            neg_embeds = model.embedding_context(neg_samples)
            neg_scores = torch.sum(center_embed * neg_embeds, dim=1)

            # Loss: maximize positive score, minimize negative scores
            pos_loss = -torch.log(torch.sigmoid(score))
            neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_scores)))
            loss = pos_loss + neg_loss

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        losses.append(total_loss)

    return losses

# Train Skip-gram model
VOCAB_SIZE = len(vocab)
EMBEDD_DIM = 10
HIDDEN_SIZE = 256

skipgram_model = SkipGram(vocab_size, EMBEDD_DIM)
skipgram_model = skipgram_model.to(device)
optimizer = optim.SGD(skipgram_model.parameters(), lr=0.01)

print("Training Skip-gram model...")
skipgram_losses = train_skipgram(skipgram_model, 50, skipgram_data[:1000], optimizer, None)  # Use subset for faster training
print("Skip-gram training completed!")

# Compare embeddings
def compare_embeddings(word1, word2, cbow_model, skipgram_model):
    print(f"\nComparing embeddings for '{word1}' and '{word2}':")

    w1_id = torch.tensor(word_to_ix[word1], dtype=torch.long).to(device)
    w2_id = torch.tensor(word_to_ix[word2], dtype=torch.long).to(device)

    # CBOW embeddings
    cbow_w1 = cbow_model.get_word_vector(w1_id)
    cbow_w2 = cbow_model.get_word_vector(w2_id)
    cbow_sim = torch.dot(cbow_w1.flatten(), cbow_w2.flatten()) / (torch.norm(cbow_w1) * torch.norm(cbow_w2))

    # Skip-gram embeddings
    sg_w1 = skipgram_model.get_word_vector(w1_id)
    sg_w2 = skipgram_model.get_word_vector(w2_id)
    sg_sim = torch.dot(sg_w1.flatten(), sg_w2.flatten()) / (torch.norm(sg_w1) * torch.norm(sg_w2))

    print(f"CBOW similarity: {cbow_sim:.4f}")
    print(f"Skip-gram similarity: {sg_sim:.4f}")

# Compare some word pairs
compare_embeddings("neural", "network", model, skipgram_model)  # model is the CBOW model from notebook
compare_embeddings("LSTM", "RNN", model, skipgram_model)

Skip-gram data pairs: 592
First 5 pairs: [('Memory', 'Long'), ('Memory', 'Short-Term'), ('Memory', '(LSTM)'), ('Memory', 'is'), ('(LSTM)', 'Short-Term')]
Training Skip-gram model...
Skip-gram training completed!

Comparing embeddings for 'neural' and 'network':
CBOW similarity: 0.0730
Skip-gram similarity: 0.5538

Comparing embeddings for 'LSTM' and 'RNN':
CBOW similarity: -0.2257
Skip-gram similarity: 0.5931
