In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# CBOW Model
class CBOW(nn.Module):
    """Continuous bag-of-words model: averaged context embeddings -> vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, context):
        """Project the mean context embedding onto one logit per vocabulary entry.

        ``context`` holds embedding indices; the looked-up vectors are averaged
        over dimension 1 (presumably the context-word axis of a batched input).
        """
        context_vectors = self.embeddings(context)
        pooled = context_vectors.mean(dim=1)
        return self.linear(pooled)

In [2]:
class CBOWDataset(Dataset):
    """Dataset over (context, target) word pairs that yields index tensors.

    Args:
        data: Indexable collection of ``(context_words, target_word)`` pairs.
        vocab: Object exposing a ``word2idx`` mapping from word to integer index.
    """

    def __init__(self, data, vocab):
        self.vocab = vocab
        self.data = data

    def __len__(self):
        """Number of (context, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as ``torch.long`` tensors of vocabulary indices."""
        context_words, target_word = self.data[idx]
        lookup = self.vocab.word2idx
        context_ids = [lookup[w] for w in context_words]
        return (
            torch.tensor(context_ids, dtype=torch.long),
            torch.tensor(lookup[target_word], dtype=torch.long),
        )

In [3]:
def train_cbow(model, dataloader, criterion, optimizer, epochs=10):
    """Run ``epochs`` passes over ``dataloader``, stepping the optimizer once per batch.

    After each epoch, prints the batch losses summed over that epoch
    (a total, not an average).
    """
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        for context_batch, target_batch in dataloader:
            # Clear gradients left over from the previous step before backprop.
            optimizer.zero_grad()
            batch_loss = criterion(model(context_batch), target_batch)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
        print(f"Epoch {epoch_number}, Loss: {running_loss:.4f}")

In [4]:
def preprocess_corpus(file_path, window_size=2, max_words=200000):
    """
    Preprocess the text corpus to generate context-target pairs for CBOW.

    Args:
        file_path (str): Path to a UTF-8 text file; its content is split on whitespace.
        window_size (int): Number of context words on each side of the target word.
        max_words (int | None): Keep only this many leading words of the corpus.
            Defaults to 200000, the cap that was previously hard-coded; pass
            None to use the whole file.

    Returns:
        list: A list of (context, target) pairs, where context is a list of
        2 * window_size words and target is the word between them. Words closer
        than window_size to either end of the corpus are never targets, so the
        list is empty when the corpus has fewer than 2 * window_size + 1 words.

    Raises:
        ValueError: If window_size is less than 1 or max_words is negative.
    """
    # A non-positive window would yield empty contexts (or nonsensical negative
    # slices), which only fails much later inside the model, so reject it here.
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")
    if max_words is not None and max_words < 0:
        raise ValueError(f"max_words must be non-negative or None, got {max_words}")

    with open(file_path, 'r', encoding='utf-8') as f:
        words = f.read().split()  # Split the text into words
    if max_words is not None:
        words = words[:max_words]

    return [
        (words[i - window_size:i] + words[i + 1:i + window_size + 1], words[i])
        for i in range(window_size, len(words) - window_size)
    ]

# Example usage
# Turn the corpus file into CBOW training pairs with three context words on
# either side of each target, then peek at the first five pairs.
window_size = 3
file_path = "text8.txt"
data = preprocess_corpus(file_path=file_path, window_size=window_size)
print(f"Sample data: {data[:5]}")

Sample data: [(['anarchism', 'originated', 'as', 'term', 'of', 'abuse'], 'a'), (['originated', 'as', 'a', 'of', 'abuse', 'first'], 'term'), (['as', 'a', 'term', 'abuse', 'first', 'used'], 'of'), (['a', 'term', 'of', 'first', 'used', 'against'], 'abuse'), (['term', 'of', 'abuse', 'used', 'against', 'early'], 'first')]


In [5]:
class Vocabulary:
    """Two-way mapping between words and consecutive integer indices."""

    def __init__(self):
        # word -> index, and index -> word (list position is the index).
        self.word2idx = {}
        self.idx2word = []

    def build_vocab(self, corpus):
        """Add every unseen word from ``corpus``, numbering words in order of first appearance."""
        for token in corpus:
            if token in self.word2idx:
                continue
            self.word2idx[token] = len(self.idx2word)
            self.idx2word.append(token)

# Build vocabulary from the corpus
# Words are streamed pair by pair (context words first, then the target), so
# indices follow first appearance in that order.
vocab = Vocabulary()
vocab.build_vocab(
    word
    for context, target in data
    for word in context + [target]
)
print(f"Vocabulary size: {len(vocab.word2idx)}")

Vocabulary size: 19062


In [6]:
# Show the first 20 (word, index) entries in the order they were added.
sample_entries = list(vocab.word2idx.items())[:20]
print(f"Sample vocabulary: {sample_entries}")

Sample vocabulary: [('anarchism', 0), ('originated', 1), ('as', 2), ('term', 3), ('of', 4), ('abuse', 5), ('a', 6), ('first', 7), ('used', 8), ('against', 9), ('early', 10), ('working', 11), ('class', 12), ('radicals', 13), ('including', 14), ('the', 15), ('diggers', 16), ('english', 17), ('revolution', 18), ('and', 19)]


In [7]:
# Create the dataset
# Wrap the (context, target) pairs and serve them in shuffled mini-batches of 32.
dataset = CBOWDataset(data=data, vocab=vocab)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

In [8]:
# Hyperparameters
embedding_dim = 128
epochs = 10
learning_rate = 0.01
seed = 42

# Seed PyTorch's global RNG before anything random happens in this cell, so
# re-running it gives the same weight initialisation (and, with the default
# sampler, the same shuffle order) and therefore comparable loss curves.
torch.manual_seed(seed)

# Initialize the model, loss function, and optimizer
model = CBOW(len(vocab.word2idx), embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
train_cbow(model, dataloader, criterion, optimizer, epochs)

Epoch 1, Loss: 45070.7111
Epoch 2, Loss: 31505.8954
Epoch 3, Loss: 22748.1758
Epoch 4, Loss: 17917.5155
Epoch 5, Loss: 15470.3418
Epoch 6, Loss: 13952.4802
Epoch 7, Loss: 12993.0047
Epoch 8, Loss: 12226.2891
Epoch 9, Loss: 11671.6757
Epoch 10, Loss: 11209.1986


In [15]:
def get_embedding(word):
    """Return the embedding row for ``word`` from the module-level ``model``.

    Words missing from ``vocab.word2idx`` fall back to the "<UNK>" entry when
    the vocabulary defines one.

    Args:
        word (str): The word to look up.

    Returns:
        The row of ``model.embeddings.weight`` at the word's index.

    Raises:
        KeyError: If ``word`` is unknown and the vocabulary has no "<UNK>" entry.
    """
    idx = vocab.word2idx.get(word, vocab.word2idx.get("<UNK>"))  # Handle unknown words
    if idx is None:
        # Indexing a tensor with None inserts a new axis instead of failing, so
        # without this guard an unknown word would silently return the whole
        # embedding table with an extra leading dimension.
        raise KeyError(f"'{word}' is not in the vocabulary and no '<UNK>' token is defined")
    return model.embeddings.weight[idx]

def find_most_similar(word, top_n=5):
    """
    Print the top_n words most similar to the input word based on cosine similarity.

    Reads the module-level ``vocab`` and ``model``. The query word itself is
    excluded from the listing, so at most top_n rows are printed (fewer when the
    vocabulary is smaller than top_n + 1).

    Args:
        word (str): The input word. Falls back to the "<UNK>" entry when the
            word is missing and the vocabulary defines one.
        top_n (int): Number of most similar words to print; values below 0 are
            treated as 0.

    Raises:
        KeyError: If ``word`` is unknown and the vocabulary has no "<UNK>" entry.
    """
    # Resolve the index here rather than through get_embedding, so a missing
    # word fails with a clear error before any tensor maths: indexing the
    # weight matrix with None would add an axis instead of raising.
    query_idx = vocab.word2idx.get(word, vocab.word2idx.get("<UNK>"))
    if query_idx is None:
        raise KeyError(f"'{word}' is not in the vocabulary and no '<UNK>' token is defined")
    top_n = max(top_n, 0)

    # Pure lookup: no gradients are needed, so skip building an autograd graph.
    with torch.no_grad():
        all_embeddings = model.embeddings.weight
        cos = nn.CosineSimilarity(dim=1)
        similarities = cos(all_embeddings, all_embeddings[query_idx].unsqueeze(0))  # Compute similarities
        # +1 leaves room for the query itself, which normally ranks first.
        ranked = similarities.argsort(descending=True)[:top_n + 1].tolist()

    # Drop the query by index, then cap at top_n in case it was not among the
    # first top_n + 1 candidates (e.g. ties with identical embeddings).
    neighbours = [idx for idx in ranked if idx != query_idx][:top_n]

    print(f"Most similar words to '{word}':")
    for idx in neighbours:
        print(f"{vocab.idx2word[idx]}: {similarities[idx].item():.4f}")

# Example usage
# List the 15 nearest neighbours of "run" in the learned embedding space.
find_most_similar(word="run", top_n=15)

Most similar words to 'run':
vermeer: 0.3750
correspond: 0.3543
sake: 0.3511
wounds: 0.3493
genuine: 0.3446
theaters: 0.3357
diminish: 0.3269
decree: 0.3172
barnard: 0.3123
transected: 0.3117
wealth: 0.3079
antipathy: 0.3073
barbed: 0.3065
peers: 0.3061
theoretical: 0.3024


In [10]:
def get_embedding(word):
    """Return the embedding row for ``word`` from the module-level ``model``.

    NOTE: this redefines a helper of the same name from an earlier cell, so
    whichever cell ran last wins. It therefore keeps the same contract: words
    missing from ``vocab.word2idx`` fall back to the "<UNK>" entry when the
    vocabulary defines one.

    Args:
        word (str): The word to look up.

    Returns:
        The row of ``model.embeddings.weight`` at the word's index.

    Raises:
        KeyError: If ``word`` is unknown and the vocabulary has no "<UNK>" entry.
    """
    idx = vocab.word2idx.get(word, vocab.word2idx.get("<UNK>"))
    if idx is None:
        # Fail loudly: indexing the weight matrix with None would add an axis
        # and return the whole table rather than raising.
        raise KeyError(f"'{word}' is not in the vocabulary and no '<UNK>' token is defined")
    return model.embeddings.weight[idx]

# Cosine similarity between two individual word vectors. The words are named
# once and reused in the message, so the printed label always matches the pair
# actually compared (it previously said "king" while comparing "chaos").
word_a, word_b = "chaos", "anarchy"
cos = nn.CosineSimilarity(dim=0)  # dim=0: each embedding is a single 1-D row
similarity = cos(get_embedding(word_a), get_embedding(word_b))
print(f"Similarity between {word_a} and {word_b}: {similarity.item():.4f}")


Similarity between king and anarchy: 0.0081
