In [5]:
# Whitespace-tokenise a sample sentence and show the resulting list of words.
raw_text = "word embeddings are awesome"
tokens = str.split(raw_text)
print(tokens)

['word', 'embeddings', 'are', 'awesome']


In [6]:
import torch
import torch.nn as nn

# Lookup table with 3 rows (one per word) of 2 trainable values each.
embedding = nn.Embedding(3, 2)

# Row ids 0, 1, 2 stand for cat, dog, mouse.
word_indices = torch.tensor([0, 1, 2])
# Calling the layer on ids returns the matching rows of its weight matrix.
word_vectors = embedding(word_indices)

print(word_vectors)


tensor([[-1.0849, -1.3645],
        [-0.0374,  0.9180],
        [-2.2716, -0.0142]], grad_fn=<EmbeddingBackward0>)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define CBOW model
class CBOWModel(nn.Module):
    """Continuous Bag-of-Words model: score every vocabulary word as the target
    given the indices of its surrounding context words.

    Args:
        vocab_size: number of distinct words (rows of the embedding table and
            number of output scores).
        embed_size: length of each word vector.
    """

    def __init__(self, vocab_size, embed_size):
        super(CBOWModel, self).__init__()
        # Lookup table: word index -> trainable vector of length embed_size.
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        # Maps the combined context vector to one raw score (logit) per word.
        # Each score is the dot product of the context vector with that word's
        # weight row plus a bias; the loss function turns scores into
        # probabilities, so a higher score means "more likely the target".
        self.linear = nn.Linear(embed_size, vocab_size)

    def forward(self, context):
        """Return target-word logits for the given context indices.

        Args:
            context: integer tensor of shape [context_len] (one example) or
                [batch, context_len] (a batch of examples).

        Returns:
            Tensor of shape [vocab_size] or [batch, vocab_size] respectively.
        """
        # Sum over the context-word axis, which is always second-to-last after
        # the embedding lookup. The previous dim=1 summed over the embedding
        # axis for unbatched input, handing nn.Linear a wrongly sized vector.
        context_embeds = self.embeddings(context).sum(dim=-2)
        output = self.linear(context_embeds)
        return output

# Sample data and its preparation
context_size = 2  # number of words taken on EACH side of the target word
raw_text = "word embeddings are awesome"
tokens = raw_text.split()
vocab = set(tokens)  # unique words only
# Number the words in sorted order: iterating a set of strings directly can
# give a different order in every interpreter session, which would make the
# word -> index mapping (and any saved embeddings) irreproducible.
word_to_index = {word: i for i, word in enumerate(sorted(vocab))}
data = []
# Only positions with a full window on both sides can serve as targets.
# NOTE: with this 4-token sentence and context_size = 2 the range below is
# empty, so `data` stays [] and no training example is produced; a sentence of
# at least 2 * context_size + 1 tokens is needed to get one.
for i in range(context_size, len(tokens) - context_size):
    # Indices of the context_size words before and after position i.
    context = [word_to_index[word]
               for word in tokens[i - context_size:i] + tokens[i + 1:i + context_size + 1]]
    target = word_to_index[tokens[i]]  # index of the word to predict
    # Store both as tensors so they can be fed to the model and loss directly.
    data.append((torch.tensor(context), torch.tensor(target)))



# Hyperparameters: values chosen before training that control how the model learns.
vocab_size = len(vocab)
embed_size, learning_rate, epochs = 10, 0.01, 100  # learning_rate = step size of each weight update

# Build the CBOW model, the cross-entropy loss and a plain SGD optimiser that
# updates every model parameter from its gradient.
cbow_model = CBOWModel(vocab_size=vocab_size, embed_size=embed_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=cbow_model.parameters(), lr=learning_rate)

# Training loop: `epochs` passes over `data`, one optimiser step per example.
for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        optimizer.zero_grad()          # drop gradients left over from the previous step
        output = cbow_model(context)   # one score per vocabulary word
        # The loss expects scores as [batch, vocab] and targets as [batch],
        # so a leading batch axis of size 1 is added to both.
        loss = criterion(output[None, :], target[None])
        loss.backward()                # gradients of the loss w.r.t. the parameters
        optimizer.step()               # apply the gradient update
        total_loss = total_loss + loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")

# Example usage: print the learned vector of one vocabulary word.
word_to_lookup = "embeddings"
word_index = word_to_index[word_to_lookup]
# The embedding layer takes a tensor of indices; wrap the single index in a 1-element tensor.
embedding = cbow_model.embeddings(torch.tensor(word_index).reshape(1))
print(f"Embedding for '{word_to_lookup}': {embedding.detach().numpy()}")

In [10]:
nn.Embedding?

Init signature:
nn.Embedding(
    num_embeddings: int,
    embedding_dim: int,
    padding_idx: Optional[int] = None,
    max_norm: Optional[float] = None,
    norm_type: float = 2.0,
    scale_grad_by_freq: bool = False,
    sparse: bool = False,
    _weight: Optional[torch.Te

In [8]:
# ---------------------- IMPORTS ----------------------
import torch
import torch.nn as nn
import torch.optim as optim

# ---------------------- CBOW MODEL ----------------------
class CBOWModel(nn.Module):
    """Continuous Bag-of-Words model: predicts a target word from the sum of
    its context-word embeddings.

    Args:
        vocab_size: number of distinct words (embedding rows and output scores).
        embed_size: length of each word vector.
    """

    def __init__(self, vocab_size, embed_size):
        super(CBOWModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)  # index -> vector
        self.linear = nn.Linear(embed_size, vocab_size)         # vector -> per-word scores

    def forward(self, context):
        """Return one raw score (logit) per vocabulary word.

        Args:
            context: integer tensor of shape [context_len], or
                [batch, context_len] for a batch of examples.

        Returns:
            Tensor of shape [vocab_size], or [batch, vocab_size] for a batch.
        """
        context_embeds = self.embeddings(context)  # shape: [..., context_len, embed_size]
        # Sum over the context-word axis (second-to-last). For 1-D input this is
        # the same as dim=0; for batched input it keeps examples separate
        # instead of adding them together.
        context_sum = context_embeds.sum(dim=-2)   # shape: [..., embed_size]
        output = self.linear(context_sum)          # shape: [..., vocab_size]
        return output

# ---------------------- DATA PREPARATION ----------------------
context_size = 2  # number of words on each side of target
raw_text = "word embeddings are awesome and word embeddings help models understand text better"
tokens = raw_text.split()
vocab = set(tokens)
# Assign indices in sorted order. Iterating the set directly can yield a
# different order in every kernel session, so the mapping would not be
# reproducible and saved embedding rows could not be matched to words.
word_to_index = {word: i for i, word in enumerate(sorted(vocab))}
index_to_word = {i: word for word, i in word_to_index.items()}

# One (context, target) pair per position that has a full window on both sides.
data = []
for i in range(context_size, len(tokens) - context_size):
    # Indices of the context_size words before and after position i (i itself excluded).
    context = [word_to_index[tokens[j]]
               for j in range(i - context_size, i + context_size + 1) if j != i]
    target = word_to_index[tokens[i]]
    data.append((torch.tensor(context, dtype=torch.long), torch.tensor(target, dtype=torch.long)))

# ---------------------- TRAINING SETUP ----------------------
# Model size and optimisation settings.
vocab_size = len(vocab)
embed_size, learning_rate, epochs = 10, 0.01, 100

# Model, cross-entropy loss, and plain SGD over all model parameters.
cbow_model = CBOWModel(vocab_size=vocab_size, embed_size=embed_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=cbow_model.parameters(), lr=learning_rate)

# ---------------------- TRAINING LOOP ----------------------
# One optimiser step per (context, target) pair; report the summed loss per epoch.
for epoch in range(epochs):
    total_loss = 0
    for context_tensor, target_tensor in data:
        optimizer.zero_grad()
        output = cbow_model(context_tensor)
        # Add a batch axis of size 1: the loss takes [batch, vocab] scores and [batch] targets.
        loss = criterion(output[None, :], target_tensor[None])
        loss.backward()
        optimizer.step()
        total_loss = total_loss + loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# ---------------------- EXAMPLE USAGE ----------------------
# Print the learned vector of one word, or a notice if the word is unknown.
word_to_lookup = "embeddings"
if word_to_lookup not in word_to_index:
    print(f"\nWord '{word_to_lookup}' not found in vocabulary.")
else:
    word_idx = word_to_index[word_to_lookup]
    # The embedding layer takes a tensor of indices, here a single one.
    embedding_tensor = cbow_model.embeddings(torch.tensor([word_idx]))
    print(f"\nEmbedding for '{word_to_lookup}':\n{embedding_tensor.detach().numpy()}")


Epoch 1, Loss: 26.1634
Epoch 2, Loss: 23.6594
Epoch 3, Loss: 21.5119
Epoch 4, Loss: 19.6525
Epoch 5, Loss: 18.0336
Epoch 6, Loss: 16.6161
Epoch 7, Loss: 15.3660
Epoch 8, Loss: 14.2551
Epoch 9, Loss: 13.2612
Epoch 10, Loss: 12.3671
Epoch 11, Loss: 11.5592
Epoch 12, Loss: 10.8265
Epoch 13, Loss: 10.1598
Epoch 14, Loss: 9.5511
Epoch 15, Loss: 8.9939
Epoch 16, Loss: 8.4824
Epoch 17, Loss: 8.0119
Epoch 18, Loss: 7.5781
Epoch 19, Loss: 7.1774
Epoch 20, Loss: 6.8069
Epoch 21, Loss: 6.4638
Epoch 22, Loss: 6.1455
Epoch 23, Loss: 5.8501
Epoch 24, Loss: 5.5754
Epoch 25, Loss: 5.3197
Epoch 26, Loss: 5.0814
Epoch 27, Loss: 4.8591
Epoch 28, Loss: 4.6513
Epoch 29, Loss: 4.4570
Epoch 30, Loss: 4.2750
Epoch 31, Loss: 4.1043
Epoch 32, Loss: 3.9441
Epoch 33, Loss: 3.7935
Epoch 34, Loss: 3.6517
Epoch 35, Loss: 3.5182
Epoch 36, Loss: 3.3923
Epoch 37, Loss: 3.2734
Epoch 38, Loss: 3.1610
Epoch 39, Loss: 3.0547
Epoch 40, Loss: 2.9540
Epoch 41, Loss: 2.8586
Epoch 42, Loss: 2.7680
Epoch 43, Loss: 2.6821
Epoch 4

In [9]:
nn.CrossEntropyLoss?

Init signature:
nn.CrossEntropyLoss(
    weight: Optional[torch.Tensor] = None,
    size_average=None,
    ignore_index: int = -100,
    reduce=None,
    reduction: str = 'mean',
    label_smoothing: float = 0.0,
) -> None
Docstring:
This criterion computes the cross entropy loss between input logits
and target.

It is useful when trainin

In [10]:
nn.Embedding?

Init signature:
nn.Embedding(
    num_embeddings: int,
    embedding_dim: int,
    padding_idx: Optional[int] = None,
    max_norm: Optional[float] = None,
    norm_type: float = 2.0,
    scale_grad_by_freq: bool = False,
    sparse: bool = False,
    _weight: Optional[torch.Te

In [11]:
nn.Linear?

Init signature:
nn.Linear(
    in_features: int,
    out_features: int,
    bias: bool = True,
    device=None,
    dtype=None,
) -> None
Docstring:
Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.

This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

Args:
    in_features: size of each input sample
    out_features: size of each output sample
    bias:

In [12]:
# Write only the model's parameters (its state_dict) to "cbow_model.pth", a path
# relative to the current working directory. The word_to_index mapping is not
# stored in this file, so it has to be kept separately to tell which embedding
# row belongs to which word.
torch.save(cbow_model.state_dict(), "cbow_model.pth")


The output is a 10-dimensional dense vector (10 because `embed_size` is set to 10): the learned embedding of the looked-up word.

IMPLEMENTATION OF CBOW USING NUMPY

In [None]:
import numpy as np

raw_text = "word embeddings are awesome and word embeddings help models understand text better"
tokens = raw_text.split()
# Sorted so every run assigns the same index to the same word; the iteration
# order of a set of strings can change between interpreter sessions.
vocab = sorted(set(tokens))
word_to_index = {w: idx for idx, w in enumerate(vocab)}
index_to_word = {idx: w for w, idx in word_to_index.items()}
vocab_size = len(vocab)
embed_size = 10
context_window = 2  # words taken on each side of the target

# Training pairs: ([context word indices], target word index) for every
# position that has context_window words on both sides.
d = []
for i in range(context_window, len(tokens) - context_window):
    context = [tokens[j] for j in range(i - context_window, i + context_window + 1) if j != i]
    target = tokens[i]
    d.append(([word_to_index[w] for w in context], word_to_index[target]))

# Weights drawn uniformly from [0, 1).
W1 = np.random.rand(vocab_size, embed_size)  # lookup table: row k is the embedding of word k
W2 = np.random.rand(embed_size, vocab_size)  # output weights: embedding -> one score per word

def softmax(x):
    """Turn a vector of scores into probabilities that sum to 1.

    The maximum is subtracted first so the exponentials cannot overflow; this
    shift does not change the result.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum()

def train(d, epochs, lr):
    """Train the CBOW weights with per-example gradient descent.

    Reads the module-level ``vocab_size`` and ``softmax`` and updates the
    module-level weight matrices ``W1`` (vocab_size x embed_size) and ``W2``
    (embed_size x vocab_size) in place. Prints the summed loss after each epoch.

    Args:
        d: iterable of ``(context_ids, target_id)`` pairs, where ``context_ids``
            is a non-empty sequence of word indices and ``target_id`` is the
            index of the word to predict.
        epochs: number of passes over ``d``.
        lr: learning rate (step size of each weight update).
    """
    global W1, W2
    # `epoch` is the loop counter; the parameter `epochs` is left untouched.
    for epoch in range(epochs):
        loss_total = 0
        # Iterate the samples that were passed in, not a global from another cell.
        for context_ids, target_id in d:
            # Step 1: Forward pass
            x = np.zeros(vocab_size)
            for idx in context_ids:
                x[idx] += 1
            x = x / len(context_ids)  # averaged one-hot vector of the context

            h = np.dot(W1.T, x)               # hidden layer: mean context embedding
            u = np.dot(W2.T, h)               # output layer: one score per word
            y_pred = softmax(u)

            # Step 2: Compute loss (cross-entropy)
            loss = -np.log(y_pred[target_id])
            loss_total += loss

            # Step 3: Backpropagation
            # One-hot for target
            y_true = np.zeros(vocab_size)
            y_true[target_id] = 1

            e = y_pred - y_true               # gradient of the loss w.r.t. the scores u

            # Both gradients are computed before either matrix is changed.
            dW2 = np.outer(h, e)
            dW1 = np.outer(x, np.dot(W2, e))

            # Step 4: Update weights
            W1 -= lr * dW1
            W2 -= lr * dW2

        print(f"Epoch {epoch+1}, Loss: {loss_total:.4f}")

# Train the model on the (context indices, target index) pairs collected in `d`:
# 100 epochs with a learning rate of 0.05.
train(d, epochs=100, lr=0.05)


def get_embedding(word):
    """Return the row of the global W1 matrix that belongs to `word`.

    The row number is looked up in the global `word_to_index`; a word that is
    not a key of that dict raises KeyError.
    """
    return W1[word_to_index[word]]

# Show the learned vector of one word, or a notice when the word is unknown.
word = "embeddings"
if word not in word_to_index:
    print(f"\n'{word}' not found in vocabulary.")
else:
    print(f"\nEmbedding for '{word}':\n{get_embedding(word)}")


In [None]:
import numpy as np

raw_text = "word embeddings are awesome and word embeddings help models understand text better"
tokens = raw_text.split()
# Sorted so the word -> index mapping is identical on every run; the iteration
# order of a set of strings is not stable across interpreter sessions.
vocab = sorted(set(tokens))
word_to_index = {w: idx for idx, w in enumerate(vocab)}
index_to_word = {idx: w for w, idx in word_to_index.items()}
vocab_size = len(vocab)
embed_size = 10
context_window = 2  # words taken on each side of the target

# Build ([context word indices], target word index) for each position that has
# a full window of context_window words on both sides.
d = []
for i in range(context_window, len(tokens) - context_window):
    context = [tokens[j] for j in range(i - context_window, i + context_window + 1) if j != i]
    target = tokens[i]
    d.append(([word_to_index[w] for w in context], word_to_index[target]))

# Weights drawn uniformly from [0, 1).
W1 = np.random.rand(vocab_size, embed_size)  # lookup table: row k is the embedding of word k
W2 = np.random.rand(embed_size, vocab_size)  # output weights: embedding -> one score per word

def softmax(x):
    """Normalise a score vector into probabilities that add up to 1.

    Subtracting the largest score before exponentiating keeps np.exp from
    overflowing and leaves the result unchanged.
    """
    exps = np.exp(x - np.max(x))
    total = exps.sum()
    return exps / total

def train(d, epochs, lr):
    """Train the CBOW weights with per-example gradient descent.

    Reads the module-level ``vocab_size`` and ``softmax`` and updates the
    module-level weight matrices ``W1`` (vocab_size x embed_size) and ``W2``
    (embed_size x vocab_size) in place. Prints the summed loss after each epoch.

    Args:
        d: iterable of ``(context_ids, target_id)`` pairs, where ``context_ids``
            is a non-empty sequence of word indices and ``target_id`` is the
            index of the word to predict.
        epochs: number of passes over ``d``.
        lr: learning rate (step size of each weight update).
    """
    global W1, W2
    # Use a separate loop counter so the `epochs` parameter is not overwritten.
    for epoch in range(epochs):
        loss_total = 0
        # Unpack the samples passed in as `d` under the names used below.
        for context_ids, target_id in d:
            # Step 1: Forward pass
            x = np.zeros(vocab_size)
            for idx in context_ids:
                x[idx] += 1
            x = x / len(context_ids)  # averaged one-hot vector of the context

            h = np.dot(W1.T, x)               # hidden layer: mean context embedding
            u = np.dot(W2.T, h)               # output layer: one score per word
            y_pred = softmax(u)

            # Step 2: Compute loss (cross-entropy)
            loss = -np.log(y_pred[target_id])
            loss_total += loss

            # Step 3: Backpropagation
            # One-hot for target
            y_true = np.zeros(vocab_size)
            y_true[target_id] = 1

            e = y_pred - y_true               # gradient of the loss w.r.t. the scores u

            # Both gradients are computed before either matrix is changed.
            dW2 = np.outer(h, e)
            dW1 = np.outer(x, np.dot(W2, e))

            # Step 4: Update weights
            W1 -= lr * dW1
            W2 -= lr * dW2

        print(f"Epoch {epoch+1}, Loss: {loss_total:.4f}")

# Train the model on the ([context indices], target index) pairs built above in
# `d`: 100 epochs with a learning rate of 0.05. (`data` is not defined in this
# cell; it only exists if the PyTorch cell was run earlier, and then holds
# tensors built from a different word -> index mapping.)
train(d, epochs=100, lr=0.05)

# ------------------ EXAMPLE USAGE ------------------
def get_embedding(word):
    """Look up `word` in the global `word_to_index` and return that row of the
    global W1 matrix. Raises KeyError when the word is not in the mapping.
    """
    row = word_to_index[word]
    embedding_row = W1[row]
    return embedding_row

# Print the learned vector for one word, or a notice when it is not in the vocabulary.
word = "embeddings"
if word not in word_to_index:
    print(f"\n'{word}' not found in vocabulary.")
else:
    print(f"\nEmbedding for '{word}':\n{get_embedding(word)}")
