# Word Embeddings

Continuous bag of words (CBOW) model used to predict a target word given the surrounding words. Additionally, a pre-trained GloVe embedding model is used to predict target words.

Load packages

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List
from collections import Counter

# build_vocab Helper Function

In [2]:
def build_vocab(words: List[str]):
    vocab = Counter()
    for w in words:
        vocab[w] += 1
    return vocab

# Prepare Data

Data consists of a list of tuples with two elements each, a list of context words and the target word. Both the context words and the target word are represented by word indices.

In [6]:
CONTEXT_SIZE = 3  # Define the context size -- CONTEXT_SIZE words to the left and right of the target word

raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

vocab = build_vocab(raw_text)

vocab_size = len(vocab)

print ('vocab_size',vocab_size)

word_to_idx = {word: i for i, word in enumerate(vocab)}#index for each word in vocab
idx_to_word = list(vocab)#list of words in the vocab

word_indices = [word_to_idx[w] for w in raw_text]#Every word in the raw text has an index

# Creates a data sample with CONTEXT_SIZE * 2 context words and the target word between them
def prepare_data(word_indices):
    data = []
    for i in range(CONTEXT_SIZE, len(word_indices) - CONTEXT_SIZE):
        target = word_indices[i]

        # Elements to the left of the target word
        left_context = [word_indices[i - j - 1] for j in range(CONTEXT_SIZE)]
        # Reverse left side list to keep words in correct order
        left_context = left_context[::-1]

        # Elements to the right of the target word
        right_context = [word_indices[i + j + 1] for j in range(CONTEXT_SIZE)]

        # Full context
        context = left_context + right_context

        data.append((context, target))

    # for d in data:
    #     print(d)
    return data

# Test data preparation
data = prepare_data(word_indices)
print ('length of data', len(data))
print('data[0]:', data[0])
ctx, tgt = data[0]
print('context words:', [idx_to_word[c] for c in ctx])
print('target word:', idx_to_word[tgt])

vocab_size 49
length of data 56
data[0]: ([0, 1, 2, 4, 5, 6], 3)
context words: ['We', 'are', 'about', 'study', 'the', 'idea']
target word: to


# Implement the CBOW Model

In [9]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, no_of_samples, embed_weights):
        super(CBOW, self).__init__()

        # Create the embedding matrix
        if embed_weights is not None:
            self.embeds = nn.Embedding.from_pretrained(embeddings=embed_weights, freeze=False)
        else:
            self.embeds = nn.Embedding(vocab_size, embedding_dim)

        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.activation = nn.LogSoftmax(dim = -1)

    def forward(self, inputs):
        embeds = self.embeds(inputs)
        embeds = embeds.sum(dim=0).unsqueeze(0)
        output = self.linear(embeds)
        output = self.activation(output)

        return output

# Test model definition
torch.manual_seed(0)

m = CBOW(vocab_size=10, embedding_dim=20, no_of_samples=3, embed_weights=None)
test_input = torch.tensor([1,2,3], dtype=torch.long)

test_output = m(test_input)

print('test_output.shape', test_output.shape)
print('test_output', test_output.data)

test_output.shape torch.Size([1, 10])
test_output tensor([[-1.6878, -4.2108, -5.0252, -2.9802, -3.1362, -1.5436, -1.4120, -3.2485,
         -1.6490, -4.5009]])


Expected Output

test_output.shape torch.Size([1, 10])<br>
test_output tensor([[-1.6878, -4.2108, -5.0252, -2.9802, -3.1362, -1.5436, -1.4120, -3.2485,
         -1.6490, -4.5009]])

# Training Loop

In [11]:
torch.manual_seed(0)

EMBEDDING_DIM = 50
model = CBOW(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE*2, embed_weights=None)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer.zero_grad()

# Training loop
for epoch in range(1000):
    total_loss = 0
    for ctx, tgt in data:
        ctx_tensor = torch.tensor(ctx, dtype=torch.long)
        tgt_tensor = torch.tensor([tgt], dtype=torch.long)

        output = model(ctx_tensor)

        total_loss += loss_function(output, tgt_tensor)

    # Optimize at the end of each epoch
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # Print training information
    if epoch % 5 == 0 and epoch > 0:
        print(f'Loss within epoch {epoch}: ', total_loss.item())

Loss within epoch 5:  182.17068481445312
Loss within epoch 10:  128.0272674560547
Loss within epoch 15:  93.20354461669922
Loss within epoch 20:  69.58182525634766
Loss within epoch 25:  53.30769729614258
Loss within epoch 30:  41.98340606689453
Loss within epoch 35:  33.983062744140625
Loss within epoch 40:  28.213905334472656
Loss within epoch 45:  23.947219848632812
Loss within epoch 50:  20.70659828186035
Loss within epoch 55:  18.183000564575195
Loss within epoch 60:  16.173580169677734
Loss within epoch 65:  14.542234420776367
Loss within epoch 70:  13.195314407348633
Loss within epoch 75:  12.06682014465332
Loss within epoch 80:  11.109159469604492
Loss within epoch 85:  10.287294387817383
Loss within epoch 90:  9.574958801269531
Loss within epoch 95:  8.952115058898926
Loss within epoch 100:  8.403236389160156
Loss within epoch 105:  7.916140556335449
Loss within epoch 110:  7.481140613555908
Loss within epoch 115:  7.090439796447754
Loss within epoch 120:  6.737703323364258
Lo

# Helper Function to Get Predicted Word

In [13]:
def get_predicted_word(model_output, idx_to_word):
    word = idx_to_word[torch.argmax(model_output)]

    return word

# Test 1
ctx_words = 'processes manipulate other things called data.'.split()
ctx_indices = [word_to_idx[w] for w in ctx_words]
ctx_tensor = torch.tensor(ctx_indices, dtype=torch.long)

out = model(ctx_tensor)
pred = get_predicted_word(out, idx_to_word)
print(f'The predicted word is: \"{pred}\"')

# Test 2
ctx_words = 'we conjure the of the computer'.split()
ctx_indices = [word_to_idx[w] for w in ctx_words]
ctx_tensor = torch.tensor(ctx_indices, dtype=torch.long)

out = model(ctx_tensor)
pred = get_predicted_word(out, idx_to_word)
print(f'The predicted word is: \"{pred}\"')

The predicted word is: "abstract"
The predicted word is: "spirits"


# Use Pretrained GloVe Embeddings to Perform Target Word Prediction

Load GloVe embeddings

In [15]:
def load_glove_embeddings(embedding_file):
    g_embeddings = {}
    g_vocab = []
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            g_vocab.append(word)
            vector = torch.tensor([float(val) for val in values[1:]])
            g_embeddings[word] = vector
    return g_embeddings, g_vocab

# Load pretrained GloVe embeddings and extract vocabulary
glove_file = 'glove.6B.50d.txt'
g_embeddings, g_vocab = load_glove_embeddings(glove_file)

# Test to make sure the GloVe embeddings loaded correctly
print ('g_vocab_size', len(g_vocab))

g_word_to_idx = {word: i for i, word in enumerate(g_vocab)}
g_idx_to_word = list(g_vocab)

g_vocab_size, g_embedding_dim = len (g_vocab),50

g_embedding_matrix = torch.stack(list(g_embeddings.values()))
print ('g_embedding_matrix.shape',g_embedding_matrix.shape)

# Create PyTorch embedding layer
g_embedding_layer = nn.Embedding.from_pretrained(g_embedding_matrix)
word = 'the'
g_word_index = g_vocab.index(word)

print ('g_word_index', g_word_index)

g_vocab_size 400001
g_embedding_matrix.shape torch.Size([400001, 50])
g_word_index 0


Expected Output

g_vocab_size 400001<br>
g_embedding_matrix.shape torch.Size([400001, 50])<br>
g_word_index 0

# Calculate Cosine Similarity for Sample Words

In [16]:
word_king = 'king'
word_man ='man'
word_woman = 'woman'
word_queen = 'queen'


king_embedding = g_embedding_layer(torch.tensor([g_vocab.index(word_king)]))
man_embedding = g_embedding_layer(torch.tensor([g_vocab.index(word_man)]))
woman_embedding = g_embedding_layer(torch.tensor([g_vocab.index(word_woman)]))
queen_embedding = g_embedding_layer(torch.tensor([g_vocab.index(word_queen)]))

word_embedding_case_1 = king_embedding - man_embedding + woman_embedding

# Obtain the cosine similarity between king-man+woman to queen in "cosine_sim1"
king_minus_man_plus_woman = king_embedding - man_embedding + woman_embedding
cosine_sim1 = F.cosine_similarity(king_minus_man_plus_woman, queen_embedding)

#Obtain the cosine similarity between king and queen in "cosine_sim2"
cosine_sim2 = F.cosine_similarity(king_embedding, queen_embedding)

print("Cosine sim1:", cosine_sim1.item())
print("Cosine sim2:", cosine_sim2.item())

Cosine sim1: 0.8609580993652344
Cosine sim2: 0.7839043736457825


# Create New Embedding Matrix for 'raw_text' Vocabulary

In [17]:
import re

def preprocess_word(word):
    # Convert word to lowercase
    word = word.lower()

    # Remove punctuation
    word = re.sub(r'[^\w\s]', '', word)
    return word

n_embedding_matrix = torch.zeros(len(vocab), g_embedding_matrix.shape[1])
print ('g', g_embedding_matrix.shape)
print ('n', n_embedding_matrix.shape)

new_vocab_indices = {}
for i, word in enumerate (vocab):
    word=preprocess_word(word)
    if word in g_vocab:
        index = g_vocab.index(word)
        n_embedding_matrix[i] = torch.tensor(g_embedding_matrix[index], dtype=torch.float)
    else:
        print ('This word not in GloVe',word)

g torch.Size([400001, 50])
n torch.Size([49, 50])


  n_embedding_matrix[i] = torch.tensor(g_embedding_matrix[index], dtype=torch.float)


# Training Loop with Pretrained GloVe Embeddings

In [18]:
torch.manual_seed(0)
EMBEDDING_DIM = 50

model2 = CBOW(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE*2, n_embedding_matrix)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model2.parameters(), lr=0.001)
optimizer.zero_grad()

# Training loop
for epoch in range(1000):
    total_loss = 0

    for ctx, tgt in data:
        ctx_tensor = torch.tensor(ctx, dtype=torch.long)
        tgt_tensor = torch.tensor([tgt], dtype=torch.long)

        output = model2(ctx_tensor)

        total_loss += loss_function(output, tgt_tensor)

    # Optimize at the end of each epoch
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # Print training information
    if epoch % 5 == 0 and epoch > 0:
        print(f'Loss within epoch {epoch}: ', total_loss.item())

Loss within epoch 5:  201.59201049804688
Loss within epoch 10:  178.20095825195312
Loss within epoch 15:  161.12486267089844
Loss within epoch 20:  146.86415100097656
Loss within epoch 25:  134.70684814453125
Loss within epoch 30:  124.19705963134766
Loss within epoch 35:  115.00505828857422
Loss within epoch 40:  106.8845443725586
Loss within epoch 45:  99.64949035644531
Loss within epoch 50:  93.15776062011719
Loss within epoch 55:  87.29913330078125
Loss within epoch 60:  81.98625946044922
Loss within epoch 65:  77.14881134033203
Loss within epoch 70:  72.72892761230469
Loss within epoch 75:  68.67851257324219
Loss within epoch 80:  64.95677947998047
Loss within epoch 85:  61.5289306640625
Loss within epoch 90:  58.36494827270508
Loss within epoch 95:  55.4387321472168
Loss within epoch 100:  52.7274169921875
Loss within epoch 105:  50.210914611816406
Loss within epoch 110:  47.87141418457031
Loss within epoch 115:  45.693145751953125
Loss within epoch 120:  43.66200637817383
Loss w

# Use New GloVe Embeddings to Get Predicted Word

In [20]:
ctx_words = 'we conjure the of the computer'.split()
ctx_indices = [word_to_idx[w] for w in ctx_words]
ctx_tensor = torch.tensor(ctx_indices, dtype=torch.long)

out = model2(ctx_tensor)
pred = get_predicted_word(out, idx_to_word)
print(f'The predicted word is: \"{pred}\"')

The predicted word is: "spirits"
