In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

#**Gen our dataset**

In [2]:
corpus = "INeuron company is Related to Data Science".lower().split()

In [3]:
corpus

['ineuron', 'company', 'is', 'related', 'to', 'data', 'science']

In [4]:
word_to_idx = {w : idx for idx,w in enumerate(corpus)}
idx_to_word = {idx : w for idx,w in enumerate(corpus) }

In [5]:
word_to_idx

{'ineuron': 0,
 'company': 1,
 'is': 2,
 'related': 3,
 'to': 4,
 'data': 5,
 'science': 6}

In [6]:
def gen_data_set(corpus, word_to_idx, window_size):
    data = []
    for i in range(window_size, len(corpus) - window_size):
        context_words = (
            corpus[i - window_size : i] +
            corpus[i + 1 : i + window_size + 1]
        )
        context_indices = [word_to_idx[w] for w in context_words]
        target = word_to_idx[corpus[i]]
        data.append((context_indices, target))
    return data

In [7]:
data = gen_data_set(corpus,word_to_idx,2)

In [8]:
data

[([0, 1, 3, 4], 2), ([1, 2, 4, 5], 3), ([2, 3, 5, 6], 4)]

#**Define the class and do the training**

In [9]:
class CBOW(nn.Module):
  def __init__(self,input_size,emb_size):
    super().__init__()
    self.embeddings = nn.Embedding(input_size,emb_size)
    self.linear = nn.Linear(emb_size,input_size)
  def forward(self,x):
    embeds = self.embeddings(x)
    hidden = embeds.mean(dim=0)
    out = self.linear(hidden)
    return out

In [10]:
model = CBOW(len(corpus),5)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [11]:
for i in range(100):
  total_loss = 0.0
  for val,target in data:
    val_ten = torch.tensor(val)
    target_ten = torch.tensor([target])
    optimizer.zero_grad()
    predict = model(val_ten)
    loss = loss_function(predict.unsqueeze(0),target_ten)
    loss.backward()
    optimizer.step()
    total_loss+=loss.item()
  if(i%10 == 0):
    print(f"Epoch{i},Loss = {total_loss:.4f}")

Epoch0,Loss = 6.2689
Epoch10,Loss = 4.1490
Epoch20,Loss = 2.8772
Epoch30,Loss = 2.0722
Epoch40,Loss = 1.4881
Epoch50,Loss = 1.0101
Epoch60,Loss = 0.6426
Epoch70,Loss = 0.4038
Epoch80,Loss = 0.2641
Epoch90,Loss = 0.1828


In [12]:
embeddings = model.embeddings.weight.data

In [13]:
print("Learned Word Embeddings:\n")
for word, idx in word_to_idx.items():
    print(word, ":", embeddings[idx])

Learned Word Embeddings:

ineuron : tensor([ 2.8459,  2.9432,  0.9951,  0.4164, -1.9972])
company : tensor([-1.6294,  0.9243,  2.1285, -1.5257, -0.6085])
is : tensor([-1.2511, -0.9987, -1.0873, -1.7980,  2.1815])
related : tensor([3.0028, 1.8232, 2.1177, 0.0673, 1.9225])
to : tensor([-1.7657, -0.2726,  2.2189, -0.3275, -0.9156])
data : tensor([-0.9311, -0.9148,  1.1015, -0.9351,  3.0657])
science : tensor([ 0.7389,  3.3504, -0.4816, -0.5499,  1.4964])


In [14]:
def predict_middle_word(context_words):
    model.eval()
    context_indices = [word_to_idx[w.lower()] for w in context_words]
    context_tensor = torch.tensor(context_indices)

    with torch.no_grad():
        output = model(context_tensor)
    print(f"Debug {output} values!")
    predicted_idx = torch.argmax(output).item()
    predicted_word = idx_to_word[predicted_idx]

    return predicted_word

context_for_is = ["ineuron", "company", "related", "to"]
predicted_word = predict_middle_word(context_for_is)
print(f"Context words: {context_for_is}, Predicted middle word: {predicted_word}")

context_for_related = ["company", "is", "to", "data"]
predicted_word_2 = predict_middle_word(context_for_related)
print(f"Context words: {context_for_related}, Predicted middle word: {predicted_word_2}")

context_for_to = ["is", "related", "data", "science"]
predicted_word_3 = predict_middle_word(context_for_to)
print(f"Context words: {context_for_to}, Predicted middle word: {predicted_word_3}")

Debug tensor([-3.8604, -2.7221,  3.7641, -2.3940,  0.4149, -3.5847, -3.4685]) values!
Context words: ['ineuron', 'company', 'related', 'to'], Predicted middle word: is
Debug tensor([-4.3911, -3.4600, -3.1311,  2.7318, -0.9193, -5.1797, -4.3079]) values!
Context words: ['company', 'is', 'to', 'data'], Predicted middle word: related
Debug tensor([-4.3441, -4.9946, -0.8692, -1.2209,  2.4806, -4.0717, -4.0196]) values!
Context words: ['is', 'related', 'data', 'science'], Predicted middle word: to
