In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import torch.nn as nn


In [2]:
# Toy training corpus: 13 short English sentences about days and weather.
# Each list item is one sentence. Later cells lowercase every sentence and
# split it on whitespace, so the capitalisation used here does not matter.
corpus = [
    "It is a pleasant day",
    "It is a sunny day",
    "The weather is nice today",
    "I love sunny days",
    "What a beautiful day",
    "The day is bright and sunny",
    "Today is a wonderful day",
    "I enjoy beautiful weather",
    "Sunny weather makes me happy",
    "I like pleasant days",
    "A sunny day brings joy",
    "What a nice day it is",
    "Sunny days are beautiful"
]

In [3]:
# Preprocessing function
def preprocess(corpus):
    """Flatten a collection of sentences into one list of lowercase tokens.

    Every sentence is lowercased and split on whitespace. Tokens are
    returned in reading order, sentence after sentence.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        list[str]: all tokens from all sentences, lowercased.
    """
    return [token for line in corpus for token in line.lower().split()]

# Tokenise the corpus, then derive the vocabulary and the word -> id lookup.
words = preprocess(corpus)
word_count = Counter(words)  # token -> number of occurrences
vocab = sorted(word_count)  # unique tokens, alphabetically ordered
word_to_index = dict(zip(vocab, range(len(vocab))))  # id == position in vocab

In [4]:

# Hyperparameters used by the dataset and model cells below
context_size = 2  # words taken on EACH side of the target (full window = 2 * context_size)
embedding_dim = 20  # length of each word-embedding vector
hidden_dim = 50    # width of the hidden layer between the embedding and the output layer

In [5]:
# Dataset class
class CBOWDataset(Dataset):
    """(context, target) training pairs for a continuous bag-of-words model.

    For every word that has ``context_size`` neighbours on both sides inside
    its sentence, one sample is stored: the neighbours (left-to-right, target
    excluded) as context and the word itself as target. Words too close to a
    sentence boundary to have a full window are skipped.

    Args:
        text: iterable of sentence strings. Sentences are lowercased and
            split on whitespace.
        context_size: number of words taken on each side of the target.
        word_to_index: optional mapping from lowercase word to integer id.
            When omitted, the vocabulary is built from ``text`` itself
            (sorted unique tokens, ids 0..V-1), so the dataset does not
            depend on any notebook-level variable.

    Attributes:
        data: list of ``(context_words, target_word)`` tuples.
        word_to_index: the mapping used to encode words.
        index_to_word: inverse of ``word_to_index``.

    Raises:
        ValueError: if ``context_size`` is smaller than 1.
    """

    def __init__(self, text, context_size, word_to_index=None):
        # Materialise once so a one-shot iterable can be walked twice
        # (vocabulary construction, then window extraction).
        text = list(text)
        if word_to_index is None:
            tokens = sorted({tok for sentence in text for tok in sentence.lower().split()})
            word_to_index = {word: i for i, word in enumerate(tokens)}
        self.data = []
        self.word_to_index = word_to_index
        # Invert the very mapping used for encoding so both directions agree.
        self.index_to_word = {i: word for word, i in word_to_index.items()}
        self.build_dataset(text, context_size)

    def build_dataset(self, text, context_size):
        """Append every full-window (context, target) pair found in ``text``."""
        if context_size < 1:
            # A zero-width window would yield empty contexts, a negative one
            # no samples at all; neither can be trained on.
            raise ValueError("context_size must be a positive integer")
        for sentence in text:
            words = sentence.lower().split()
            # Only positions with context_size words on both sides qualify.
            for i in range(context_size, len(words) - context_size):
                context_words = words[i - context_size:i] + words[i + 1:i + context_size + 1]
                self.data.append((context_words, words[i]))

    def __len__(self):
        """Number of stored (context, target) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` encoded as ``(context_indices, target_index)``.

        ``context_indices`` is a 1-D long tensor with ``2 * context_size``
        entries; ``target_index`` is a 0-D long tensor.

        Raises:
            KeyError: if a word of the sample is missing from ``word_to_index``.
        """
        context_words, target_word = self.data[idx]
        context_indices = torch.tensor([self.word_to_index[w] for w in context_words], dtype=torch.long)
        target_index = torch.tensor(self.word_to_index[target_word], dtype=torch.long)
        return context_indices, target_index

In [6]:

# Build the CBOW samples from the corpus and wrap them in a shuffling loader
# that yields mini-batches of two samples.
dataset = CBOWDataset(text=corpus, context_size=context_size)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

In [7]:
# CBOW model class with an additional hidden layer
class CBOW(nn.Module):
    """Continuous bag-of-words network.

    Pipeline: embedding lookup -> mean over the context axis -> linear + ReLU
    hidden layer -> linear layer producing one score per vocabulary word.

    Args:
        vocab_size: number of distinct words (rows of the embedding table and
            width of the output layer).
        embedding_dim: length of each embedding vector.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context):
        """Score every vocabulary word for each row of context indices.

        Args:
            context: integer tensor of word ids; the embeddings are averaged
                over dimension 1, so callers in this notebook pass a
                (batch, n_context_words) tensor.

        Returns:
            Tensor of unnormalised scores with ``vocab_size`` entries in its
            last dimension.
        """
        pooled = self.embeddings(context).mean(dim=1)
        activated = self.linear1(pooled).relu()
        return self.linear2(activated)

In [8]:

# Initialize model, loss function, and optimizer.
# Seed PyTorch's global RNG first: the layer weights are initialised randomly
# and the DataLoader shuffles using the same global generator, so without a
# seed every run prints different losses.
SEED = 42
torch.manual_seed(SEED)
model = CBOW(len(vocab), embedding_dim, hidden_dim)
loss_function = nn.CrossEntropyLoss()  # takes raw scores and integer class targets
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [9]:

# Training loop: make `num_epochs` full passes over `dataloader`, performing
# one optimizer update per mini-batch.
num_epochs = 200  # number of passes over the whole dataset
for epoch in range(num_epochs):
    total_loss = 0  # running sum of the batch losses seen in this epoch
    for context, target in dataloader:
        optimizer.zero_grad()  # drop gradients accumulated by the previous batch
        output = model(context)
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Log every 20th epoch (0, 20, 40, ...). The figure printed is the SUM of
    # the batch losses for that epoch, not an average, so it scales with the
    # number of batches.
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

Epoch 0, Loss: 15.5630
Epoch 20, Loss: 0.0291
Epoch 40, Loss: 0.0086
Epoch 60, Loss: 0.0042
Epoch 80, Loss: 0.0026
Epoch 100, Loss: 0.0017
Epoch 120, Loss: 0.0012
Epoch 140, Loss: 0.0009
Epoch 160, Loss: 0.0007
Epoch 180, Loss: 0.0006


In [10]:

# Prediction function
def predict_word(model, context_words, word_to_index):
    """Return the vocabulary word the model scores highest for a context.

    Args:
        model: callable taking a (1, n_context_words) long tensor of word ids
            and returning a (1, vocab_size) tensor of scores.
        context_words: sequence of words; each must be a key of
            ``word_to_index``.
        word_to_index: mapping from word to integer id, as used in training.

    Returns:
        str: the word whose id has the largest score.

    Raises:
        KeyError: if a context word is not in ``word_to_index``.
    """
    # Invert the mapping that was passed in instead of reading a notebook-level
    # dataset object, so the function depends only on its own arguments.
    index_to_word = {index: word for word, index in word_to_index.items()}
    context_indices = torch.tensor(
        [word_to_index[word] for word in context_words], dtype=torch.long
    ).unsqueeze(0)  # add a leading batch dimension of size 1
    with torch.no_grad():  # inference only; no gradient tracking needed
        output = model(context_indices)
        predicted_index = output.argmax(dim=1).item()
    return index_to_word[predicted_index]

In [11]:

# Example: predict the target word for a hand-picked context.
# The words are lowercase because the vocabulary keys were lowercased during
# preprocessing; there are four of them, i.e. 2 * context_size.
context = ['it', 'is', 'a', 'pleasant']  # context words fed to the model
predicted_word = predict_word(model, context, word_to_index)
print(f'Predicted word: {predicted_word}')

Predicted word: day
