In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [None]:
# Sample data - you should replace this with your own dataset
raw_corpus = [
    "i like deep learning",
    "deep learning is fun",
    "machine learning is interesting",
]

# Tokenize the corpus on whitespace. The raw sentences keep their own name so
# that re-running this cell always starts from the same input; `corpus` is the
# list of token lists used by the cells below.
corpus = [sentence.split() for sentence in raw_corpus]

# Create a vocabulary
vocab = set(word for sentence in corpus for word in sentence)
# Number the words in sorted order. Iteration order of a set of strings can
# differ from one interpreter run to the next (string hash randomisation), so
# enumerating the set directly would give every word a different index after
# each kernel restart.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
vocab_size = len(vocab)

In [None]:
# CBOW context window size: number of words taken on each side of the target
context_size = 2

# Create training data: one (context words, target word) pair per position.
# The context is the up-to-`context_size` words before the target plus the
# up-to-`context_size` words after it, and never contains the target itself.
# Near the start or end of a sentence the window is truncated to the words
# that exist, so sentences shorter than 2 * context_size + 1 words still
# contribute examples.
data = []
for sentence in corpus:
    for i, target in enumerate(sentence):
        left = sentence[max(0, i - context_size):i]
        right = sentence[i + 1:i + 1 + context_size]
        context = left + right
        # A one-word sentence has no context to predict from; skip it
        if context:
            data.append((context, target))

In [None]:
# Define a CBOW dataset
class CBOWDataset(Dataset):
    """Dataset of CBOW training pairs encoded as vocabulary indices.

    Args:
        data: sequence of ``(context, target)`` pairs, where ``context`` is a
            list of words and ``target`` is a single word.
        word_to_idx: mapping from each word to its integer index.
        context_size: window size used to build ``data``; stored on the
            instance but not used by the dataset itself.
    """

    def __init__(self, data, word_to_idx, context_size):
        self.data = data
        self.word_to_idx = word_to_idx
        self.context_size = context_size

    def __len__(self):
        """Return the number of (context, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as ``(context_indices, target_index)``.

        ``context_indices`` is a 1-D ``torch.long`` tensor with one entry per
        context word and ``target_index`` is a 0-D ``torch.long`` tensor.

        Tensors are returned rather than Python lists because the default
        DataLoader collate function transposes list-valued samples into one
        tensor per list position; with tensors it stacks the samples into a
        single ``(batch, context)`` tensor instead.

        Raises:
            KeyError: if a word is missing from ``word_to_idx``.
        """
        context, target = self.data[idx]
        context_indices = torch.tensor(
            [self.word_to_idx[word] for word in context], dtype=torch.long
        )
        target_index = torch.tensor(self.word_to_idx[target], dtype=torch.long)
        return (context_indices, target_index)

In [None]:
# Wrap the (context, target) pairs in a dataset, then serve them one example
# per batch with shuffling enabled
dataset = CBOWDataset(
    data=data,
    word_to_idx=word_to_idx,
    context_size=context_size,
)
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

# Define the CBOW model
class CBOWModel(nn.Module):
    """Continuous bag-of-words model: summed context embeddings -> word scores.

    Args:
        vocab_size: number of distinct words (rows of the embedding table and
            size of the output score vector).
        embedding_dim: dimensionality of each word embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        """Score every vocabulary word as the target for the given context.

        Args:
            context: integer tensor of word indices, either ``(context,)`` for
                a single example or ``(batch, context)`` for a batch.

        Returns:
            Unnormalised scores of shape ``(vocab_size,)`` for a single
            example or ``(batch, vocab_size)`` for a batch.
        """
        # After the embedding lookup the context-word axis is second to last.
        # Summing over it (rather than over axis 0) gives the same result for
        # a single example and keeps batched examples separate instead of
        # adding them together.
        embedded = self.embeddings(context).sum(dim=-2)
        scores = self.linear(embedded)
        return scores

In [None]:
# Hyperparameters
embedding_dim = 100
learning_rate = 0.001
num_epochs = 100
seed = 0

# Seed PyTorch's global random number generator before the model is built so
# that the initial weights are the same after every kernel restart
torch.manual_seed(seed)

# Create and train the CBOW model
model = CBOWModel(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Fail early with a clear message: with no batches the average loss computed
# below would divide by zero
if len(dataloader) == 0:
    raise ValueError(
        "No training examples: the corpus produced no (context, target) pairs"
    )

for epoch in range(num_epochs):
    total_loss = 0
    for context_indices, target_index in dataloader:
        optimizer.zero_grad()
        # If the dataset yields Python lists, the default collate function
        # hands back one tensor per context position. Stack them into a single
        # (batch, context) tensor so that every context word reaches the
        # model, not just the first one.
        if isinstance(context_indices, (list, tuple)):
            context_indices = torch.stack(list(context_indices), dim=1)
        # The loader is built with batch_size=1: take the single example
        context_indices = context_indices[0]
        target_index = target_index[0]
        scores = model(context_indices)
        loss = criterion(scores, target_index)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}')

# Retrieve word embeddings from the model as a NumPy array of shape
# (vocab_size, embedding_dim). detach() is the supported way to leave the
# autograd graph; cpu() keeps the conversion valid if the model is ever moved
# to another device.
word_embeddings = model.embeddings.weight.detach().cpu().numpy()