In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np

# Sample corpus: each entry is one sentence; tokenize_corpus splits it on whitespace.
corpus = [
    "we are learning word2vec",
    "word2vec is fun to learn",
    "we learn from examples",
    "examples make learning word2vec easier"
]

# Hyperparameters
window_size = 2  # neighbours considered on each side of a word when building pairs
embedding_dim = 50  # width of each row of the nn.Embedding table
epochs = 50  # full passes over the DataLoader in the training loop
learning_rate = 0.001  # step size passed to optim.Adam

# Preprocessing
def tokenize_corpus(corpus):
    """Split sentences into tokens and build the vocabulary index mappings.

    Args:
        corpus: Iterable of sentence strings. Each sentence is split on
            whitespace; no lowercasing or punctuation handling is applied.

    Returns:
        A ``(tokenized_corpus, word_to_idx, idx_to_word)`` tuple where
        ``tokenized_corpus`` is a list of token lists (one per sentence),
        ``word_to_idx`` maps each distinct token to an integer in
        ``range(len(vocabulary))``, and ``idx_to_word`` is the inverse
        mapping. Indices follow the sorted order of the tokens.
    """
    tokenized_corpus = [sentence.split() for sentence in corpus]
    # Sort the vocabulary before numbering it: iterating a bare set of strings
    # follows hash order, which changes between interpreter runs, so indices
    # (and therefore embedding rows) would not be reproducible.
    vocab = sorted({word for sentence in tokenized_corpus for word in sentence})
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    idx_to_word = dict(enumerate(vocab))
    return tokenized_corpus, word_to_idx, idx_to_word

def generate_context_target_pairs(tokenized_corpus, window_size):
    """Enumerate (word, neighbour) pairs within a symmetric window.

    For every position in every sentence, the word at that position is paired
    with each other word at most ``window_size`` positions to its left or
    right, clipped to the sentence boundaries. Pairs never span sentences.

    Args:
        tokenized_corpus: Iterable of token lists, one per sentence.
        window_size: Maximum distance between the two words of a pair.

    Returns:
        List of ``(word, neighbour)`` tuples, ordered by sentence, then by the
        position of ``word``, then by the position of ``neighbour``.
    """
    return [
        (center, tokens[pos])
        for tokens in tokenized_corpus
        for idx, center in enumerate(tokens)
        for pos in range(
            max(0, idx - window_size),
            min(len(tokens), idx + window_size + 1),
        )
        if pos != idx
    ]

# Build the token lists and the word<->index mappings from the sample corpus,
# then enumerate every (word, neighbour) pair that falls within window_size.
tokenized_corpus, word_to_idx, idx_to_word = tokenize_corpus(corpus)
context_target_pairs = generate_context_target_pairs(tokenized_corpus, window_size)

# Dataset and DataLoader
class Word2VecDataset(Dataset):
    """Map-style dataset that serves word pairs as vocabulary-index pairs.

    Attributes are stored by reference, not copied:
        pairs: Sequence of 2-tuples of words.
        word_to_idx: Mapping from word to integer index.
    """

    def __init__(self, context_target_pairs, word_to_idx):
        self.word_to_idx = word_to_idx
        self.pairs = context_target_pairs

    def __len__(self):
        """Return the number of stored word pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the pair at ``index`` as a tuple of two integer indices.

        Raises:
            KeyError: If either word is missing from ``word_to_idx``.
        """
        first_word, second_word = self.pairs[index]
        lookup = self.word_to_idx
        return lookup[first_word], lookup[second_word]

# Wrap the word pairs in a Dataset and serve them as shuffled mini-batches of
# up to 16 index pairs.
dataset = Word2VecDataset(context_target_pairs, word_to_idx)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Word2Vec Model
class Word2Vec(nn.Module):
    """Embedding lookup followed by a linear projection back onto the vocabulary.

    Args:
        vocab_size: Number of distinct words; sets both the number of
            embedding rows and the width of the output.
        embedding_dim: Length of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Layers are created in this order so that parameter names and seeded
        # initialization stay stable.
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, context_word):
        """Return unnormalized vocabulary scores for each input word index.

        Args:
            context_word: Integer tensor of word indices.

        Returns:
            Tensor with a trailing dimension of ``vocab_size``; no softmax is
            applied.
        """
        return self.linear(self.embeddings(context_word))

# Initialize model, loss, and optimizer
vocab_size = len(word_to_idx)  # one embedding row and one output score per distinct word
model = Word2Vec(vocab_size, embedding_dim)
# CrossEntropyLoss is fed the raw scores from the model's linear layer together
# with integer class targets; the model itself applies no softmax.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: for every epoch, take one optimizer step per mini-batch and
# print the loss summed over that epoch's batches.
for epoch in range(epochs):
    total_loss = 0  # sum of per-batch losses for this epoch (not averaged)
    for context_word, target_word in dataloader:
        # Make sure both index tensors are int64 (torch.long).
        context_word, target_word = context_word.long(), target_word.long()
        optimizer.zero_grad()  # clear gradients accumulated by the previous step
        output = model(context_word)
        loss = criterion(output, target_word)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()  # .item() converts the scalar tensor to a Python number

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# Extract the embedding table as a NumPy array, one row per vocabulary index.
# detach() replaces the legacy `.data` attribute, and cpu() keeps the
# conversion working if the model has been moved off the CPU.
# When the weights already live on the CPU the array shares their memory, so
# take a copy first if the model will be trained further.
word_embeddings = model.embeddings.weight.detach().cpu().numpy()
