# CBOW dataset
 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import random
import pandas as pd
from tqdm import tqdm 
from gensim.models import Word2Vec

In [2]:
# Select the compute device once: CUDA when it is available, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)


In [7]:
# Sum the lengths of all token sequences in the dataset.
# NOTE(review): `sentences` is created by the load_and_preprocess_data cell that
# appears further down in this notebook, so that cell has to be run first.
total_tokens = sum(map(len, sentences))

print(f"Total number of tokens in the dataset: {total_tokens}")


Total number of tokens in the dataset: 500000


In [6]:
def load_and_preprocess_data(file_path, max_tokens=20000, sequence_length=20):
    """Read a text file and cut its tokens into fixed-length sequences.

    Args:
        file_path: Path of a UTF-8 encoded text file.
        max_tokens: Only the first `max_tokens` whitespace-separated tokens
            are kept. Pass None to keep every token.
        sequence_length: Number of tokens per returned sequence. Defaults to
            20, the value that used to be hard-coded here.

    Returns:
        A list of token lists. Every list holds `sequence_length` tokens,
        except possibly the last one, which holds whatever is left over.

    Raises:
        ValueError: If `sequence_length` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    with open(file_path, 'r', encoding='utf-8') as f:
        # Split on any whitespace; the corpus is treated as one token stream.
        words = f.read().split()

    # Keep only the leading `max_tokens` tokens (None keeps the whole file).
    if max_tokens is not None:
        words = words[:max_tokens]

    # Non-overlapping chunks; these act as the "sentences" for CBOW sampling.
    return [
        words[start:start + sequence_length]
        for start in range(0, len(words), sequence_length)
    ]

# Read the first 500000 tokens of the `text8` file and chunk them into sequences.
sentences = load_and_preprocess_data(file_path='text8', max_tokens=500000)


In [8]:
class CBOWDataset(Dataset):
    """CBOW training samples: (context words, centre word) pairs.

    For every position that has `context_size` words on both sides inside its
    sentence, the surrounding words form the context and the word at that
    position is the target. A sample is kept only when the target and all of
    its context words are in the vocabulary.
    """

    def __init__(self, sentences, min_count=5, context_size=2):
        self.sentences = sentences
        self.context_size = context_size
        self.vocab = self.build_vocab(sentences, min_count)

        # Forward and reverse lookup tables, indexed by position in the vocab.
        self.word_to_idx = {}
        self.idx_to_word = {}
        for position, token in enumerate(self.vocab):
            self.word_to_idx[token] = position
            self.idx_to_word[position] = token

        self.data = []
        known = self.word_to_idx
        for tokens in sentences:
            last_center = len(tokens) - context_size
            for center in range(context_size, last_center):
                left = tokens[center - context_size:center]
                right = tokens[center + 1:center + context_size + 1]
                context = left + right
                target = tokens[center]

                # Drop the sample if any involved word is out of vocabulary.
                if any(token not in known for token in context + [target]):
                    continue
                self.data.append((context, target))

        # Debugging statement to check samples
        print(f"Generated {len(self.data)} samples from {len(sentences)} sentences")

    def build_vocab(self, sentences, min_count):
        """Return the words that occur at least `min_count` times."""
        frequencies = Counter()
        for tokens in sentences:
            frequencies.update(tokens)
        return [token for token, seen in frequencies.items() if seen >= min_count]

    def __len__(self):
        """Number of stored (context, target) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample `idx` as (context index tensor, target index tensor)."""
        context, target = self.data[idx]
        lookup = self.word_to_idx
        context_idxs = torch.tensor([lookup[token] for token in context], dtype=torch.long)
        target_idx = torch.tensor(lookup[target], dtype=torch.long)
        return context_idxs, target_idx


In [9]:
class CBOWModel(nn.Module):
    """Continuous bag-of-words network.

    Context word indices are embedded, the embeddings are averaged over the
    context dimension (dim 1), and a linear layer turns the average into one
    score per vocabulary word.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Return vocabulary scores for a batch of context index tensors."""
        # Average the context embeddings, then project onto the vocabulary.
        averaged_context = self.embeddings(inputs).mean(dim=1)
        return self.linear(averaged_context)


In [10]:
def train_model(dataset, embedding_dim=512, batch_size=32, num_epochs=25, learning_rate=0.005):
    """Fit a CBOWModel on every sample of `dataset` and return it.

    Uses Adam with cross-entropy loss over shuffled mini-batches and prints
    the mean batch loss after each epoch. The model is created on the
    notebook-level `device`.
    """
    vocab_size = len(dataset.vocab)
    model = CBOWModel(vocab_size, embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        total_loss = 0
        progress = tqdm(enumerate(dataloader), total=len(dataloader))
        for _, (context_batch, target_batch) in progress:
            context_batch = context_batch.to(device)
            target_batch = target_batch.to(device)

            optimizer.zero_grad()
            scores = model(context_batch)
            batch_loss = criterion(scores, target_batch)
            batch_loss.backward()
            optimizer.step()

            total_loss += batch_loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

    return model


In [36]:
from sklearn.model_selection import train_test_split

def train_cbow_model(dataset, embedding_dim=50, batch_size=32, num_epochs=30, learning_rate=0.0005, val_split=0.2):
    """Train a CBOWModel with a random train/validation split.

    Args:
        dataset: Dataset of (context indices, target index) samples that also
            exposes a `vocab` attribute.
        embedding_dim: Size of the word embeddings.
        batch_size: Mini-batch size for both loaders.
        num_epochs: Number of passes over the training split.
        learning_rate: Adam learning rate.
        val_split: Fraction of samples held out for validation, in [0, 1).
            With 0 no validation is run and the validation loss prints as nan.

    Returns:
        The model as it is after the final epoch (no best-epoch selection).

    Raises:
        ValueError: If `val_split` is outside [0, 1) or the training split
            would be empty.
    """
    if not 0 <= val_split < 1:
        raise ValueError(f"val_split must be in [0, 1), got {val_split}")

    # Split the dataset into training and validation sets
    train_size = int(len(dataset) * (1 - val_split))
    val_size = len(dataset) - train_size
    if train_size == 0:
        raise ValueError("dataset is too small: the training split would be empty")
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model setup
    model = CBOWModel(len(dataset.vocab), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        total_train_loss = 0

        # Training loop
        for context, target in tqdm(train_dataloader, total=len(train_dataloader)):
            context, target = context.to(device), target.to(device)
            optimizer.zero_grad()
            # The model returns raw scores; CrossEntropyLoss consumes them directly.
            logits = model(context)
            loss = criterion(logits, target)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_dataloader)

        # Validation loop
        model.eval()  # Set the model to evaluation mode
        total_val_loss = 0
        with torch.no_grad():  # Disable gradient calculation for validation
            for context, target in val_dataloader:
                context, target = context.to(device), target.to(device)
                logits = model(context)
                loss = criterion(logits, target)
                total_val_loss += loss.item()

        # An empty validation split has no batches; report nan instead of
        # dividing by zero.
        num_val_batches = len(val_dataloader)
        avg_val_loss = total_val_loss / num_val_batches if num_val_batches else float("nan")

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    return model

# Build the samples with a 5-word window on each side, then train with the
# dataset split into training and validation sets.
cbow_dataset = CBOWDataset(sentences, context_size=5, min_count=5)
model = train_cbow_model(dataset=cbow_dataset)


Generated 117866 samples from 25000 sentences


100%|██████████| 2947/2947 [00:04<00:00, 662.40it/s]


Epoch 1/30, Train Loss: 7.3288, Val Loss: 6.8473


100%|██████████| 2947/2947 [00:04<00:00, 626.87it/s]


Epoch 2/30, Train Loss: 6.5710, Val Loss: 6.7208


100%|██████████| 2947/2947 [00:04<00:00, 614.35it/s]


Epoch 3/30, Train Loss: 6.3804, Val Loss: 6.6615


100%|██████████| 2947/2947 [00:04<00:00, 609.49it/s]


Epoch 4/30, Train Loss: 6.2359, Val Loss: 6.6298


100%|██████████| 2947/2947 [00:04<00:00, 596.42it/s]


Epoch 5/30, Train Loss: 6.1112, Val Loss: 6.6068


100%|██████████| 2947/2947 [00:04<00:00, 603.45it/s]


Epoch 6/30, Train Loss: 5.9986, Val Loss: 6.5915


100%|██████████| 2947/2947 [00:04<00:00, 603.01it/s]


Epoch 7/30, Train Loss: 5.8946, Val Loss: 6.5761


100%|██████████| 2947/2947 [00:04<00:00, 618.15it/s]


Epoch 8/30, Train Loss: 5.7969, Val Loss: 6.5631


100%|██████████| 2947/2947 [00:04<00:00, 597.22it/s]


Epoch 9/30, Train Loss: 5.7049, Val Loss: 6.5536


100%|██████████| 2947/2947 [00:04<00:00, 629.02it/s]


Epoch 10/30, Train Loss: 5.6174, Val Loss: 6.5475


100%|██████████| 2947/2947 [00:04<00:00, 606.69it/s]


Epoch 11/30, Train Loss: 5.5340, Val Loss: 6.5417


100%|██████████| 2947/2947 [00:04<00:00, 611.99it/s]


Epoch 12/30, Train Loss: 5.4546, Val Loss: 6.5374


100%|██████████| 2947/2947 [00:04<00:00, 594.99it/s]


Epoch 13/30, Train Loss: 5.3780, Val Loss: 6.5363


100%|██████████| 2947/2947 [00:04<00:00, 594.97it/s]


Epoch 14/30, Train Loss: 5.3046, Val Loss: 6.5367


100%|██████████| 2947/2947 [00:04<00:00, 599.47it/s]


Epoch 15/30, Train Loss: 5.2341, Val Loss: 6.5388


100%|██████████| 2947/2947 [00:04<00:00, 601.37it/s]


Epoch 16/30, Train Loss: 5.1659, Val Loss: 6.5428


100%|██████████| 2947/2947 [00:04<00:00, 600.68it/s]


Epoch 17/30, Train Loss: 5.1006, Val Loss: 6.5488


100%|██████████| 2947/2947 [00:04<00:00, 597.41it/s]


Epoch 18/30, Train Loss: 5.0372, Val Loss: 6.5538


100%|██████████| 2947/2947 [00:04<00:00, 603.27it/s]


Epoch 19/30, Train Loss: 4.9763, Val Loss: 6.5598


100%|██████████| 2947/2947 [00:04<00:00, 605.87it/s]


Epoch 20/30, Train Loss: 4.9172, Val Loss: 6.5677


100%|██████████| 2947/2947 [00:04<00:00, 605.40it/s]


Epoch 21/30, Train Loss: 4.8599, Val Loss: 6.5780


100%|██████████| 2947/2947 [00:04<00:00, 596.06it/s]


Epoch 22/30, Train Loss: 4.8051, Val Loss: 6.5851


100%|██████████| 2947/2947 [00:04<00:00, 599.72it/s]


Epoch 23/30, Train Loss: 4.7515, Val Loss: 6.5967


100%|██████████| 2947/2947 [00:04<00:00, 593.77it/s]


Epoch 24/30, Train Loss: 4.6999, Val Loss: 6.6073


100%|██████████| 2947/2947 [00:04<00:00, 593.85it/s]


Epoch 25/30, Train Loss: 4.6496, Val Loss: 6.6192


100%|██████████| 2947/2947 [00:05<00:00, 588.79it/s]


Epoch 26/30, Train Loss: 4.6012, Val Loss: 6.6325


100%|██████████| 2947/2947 [00:04<00:00, 596.66it/s]


Epoch 27/30, Train Loss: 4.5542, Val Loss: 6.6450


100%|██████████| 2947/2947 [00:04<00:00, 605.17it/s]


Epoch 28/30, Train Loss: 4.5084, Val Loss: 6.6571


100%|██████████| 2947/2947 [00:04<00:00, 613.12it/s]


Epoch 29/30, Train Loss: 4.4643, Val Loss: 6.6721


100%|██████████| 2947/2947 [00:04<00:00, 605.58it/s]


Epoch 30/30, Train Loss: 4.4211, Val Loss: 6.6831


In [11]:
def train_cbow_model(dataset, embedding_dim=128, batch_size=32, num_epochs=30, learning_rate=0.005):
    """Train a CBOWModel on the whole dataset, without a validation split.

    This is the same training loop as `train_model`, only with different
    default hyper-parameters, so it delegates to that function instead of
    repeating the loop.

    NOTE(review): another cell in this notebook defines a validation-aware
    function with this same name; whichever cell ran last is the one in effect.

    Args:
        dataset: Dataset of (context indices, target index) samples that also
            exposes a `vocab` attribute.
        embedding_dim: Size of the word embeddings.
        batch_size: Mini-batch size.
        num_epochs: Number of passes over the dataset.
        learning_rate: Adam learning rate.

    Returns:
        The trained model.
    """
    return train_model(
        dataset,
        embedding_dim=embedding_dim,
        batch_size=batch_size,
        num_epochs=num_epochs,
        learning_rate=learning_rate,
    )

# Rebuild the samples with a 5-word window on each side and train a model on them.
cbow_dataset = CBOWDataset(sentences=sentences, context_size=5, min_count=5)
model = train_cbow_model(dataset=cbow_dataset)


Generated 117866 samples from 25000 sentences


100%|██████████| 3684/3684 [00:05<00:00, 618.20it/s]


Epoch 1/10, Loss: 6.7001


100%|██████████| 3684/3684 [00:05<00:00, 648.63it/s]


Epoch 2/10, Loss: 5.2529


100%|██████████| 3684/3684 [00:05<00:00, 632.21it/s]


Epoch 3/10, Loss: 4.2353


100%|██████████| 3684/3684 [00:05<00:00, 621.91it/s]


Epoch 4/10, Loss: 3.4047


100%|██████████| 3684/3684 [00:06<00:00, 610.97it/s]


Epoch 5/10, Loss: 2.7317


100%|██████████| 3684/3684 [00:05<00:00, 616.32it/s]


Epoch 6/10, Loss: 2.2111


100%|██████████| 3684/3684 [00:05<00:00, 618.99it/s]


Epoch 7/10, Loss: 1.8150


100%|██████████| 3684/3684 [00:06<00:00, 612.51it/s]


Epoch 8/10, Loss: 1.5185


100%|██████████| 3684/3684 [00:05<00:00, 621.85it/s]


Epoch 9/10, Loss: 1.2894


100%|██████████| 3684/3684 [00:05<00:00, 625.12it/s]


Epoch 10/10, Loss: 1.1183


In [37]:
cbow_dataset = CBOWDataset(sentences, min_count=5, context_size=5)
# Create an inverted dictionary for mapping indices back to words
idx_to_word = {idx: word for word, idx in cbow_dataset.word_to_idx.items()}

# Print out the context and target words for the first few samples
NUM_PREVIEW_SAMPLES = 9  # Change this to however many samples you'd like to see
# Never index past the end when the dataset holds fewer samples than requested.
for i in range(min(NUM_PREVIEW_SAMPLES, len(cbow_dataset))):
    context_indices, target_index = cbow_dataset[i]
    context_words = [idx_to_word[idx.item()] for idx in context_indices]
    target_word = idx_to_word[target_index.item()]
    print(f"Context: {context_words}, Target: {target_word}")



Generated 117866 samples from 25000 sentences
Context: ['anarchism', 'originated', 'as', 'a', 'term', 'abuse', 'first', 'used', 'against', 'early'], Target: of
Context: ['originated', 'as', 'a', 'term', 'of', 'first', 'used', 'against', 'early', 'working'], Target: abuse
Context: ['as', 'a', 'term', 'of', 'abuse', 'used', 'against', 'early', 'working', 'class'], Target: first
Context: ['a', 'term', 'of', 'abuse', 'first', 'against', 'early', 'working', 'class', 'radicals'], Target: used
Context: ['term', 'of', 'abuse', 'first', 'used', 'early', 'working', 'class', 'radicals', 'including'], Target: against
Context: ['of', 'abuse', 'first', 'used', 'against', 'working', 'class', 'radicals', 'including', 'the'], Target: early
Context: ['of', 'the', 'french', 'revolution', 'whilst', 'term', 'is', 'still', 'used', 'in'], Target: the
Context: ['the', 'french', 'revolution', 'whilst', 'the', 'is', 'still', 'used', 'in', 'a'], Target: term
Context: ['describe', 'any', 'act', 'that', 'used', 'm

In [16]:
# Re-select the device and move the trained model's parameters onto it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)  # kept as the cell's last expression so the module summary is shown


CBOWModel(
  (embeddings): Embedding(8778, 128)
  (linear): Linear(in_features=128, out_features=8778, bias=True)
)

In [34]:
def evaluate_model(model, test_sentences, dataset):
    """Print the word the model predicts for each context in `test_sentences`.

    Args:
        model: Module that maps a (1, n) tensor of word indices to a
            (1, vocab) tensor of scores.
        test_sentences: Iterable of contexts, each a list of words.
        dataset: Object exposing `word_to_idx` and `idx_to_word` mappings.

    Words that are not in `dataset.word_to_idx` are left out of the context
    and listed in the printed line. A context with no known word is reported
    and skipped. The model's previous train/eval mode is restored on exit.
    """
    # Run on whatever device the model's parameters live on, so the input
    # tensor and the model cannot end up on different devices.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()  # Set the model to evaluation mode
    try:
        with torch.no_grad():  # Disable gradient calculation
            for sentence in test_sentences:
                known = [word for word in sentence if word in dataset.word_to_idx]
                unknown = [word for word in sentence if word not in dataset.word_to_idx]
                if not known:
                    print(f"Context: {sentence}, skipped: no in-vocabulary words")
                    continue

                # Get context words' indices as a batch of size one
                context_indices = [dataset.word_to_idx[word] for word in known]
                context_tensor = torch.tensor(context_indices, dtype=torch.long).unsqueeze(0).to(model_device)

                # Get the model's prediction
                output = model(context_tensor)
                predicted_idx = torch.argmax(output, dim=1).item()

                # Convert the predicted index back to the word
                predicted_word = dataset.idx_to_word[predicted_idx]
                message = f"Context: {sentence}, Predicted word: {predicted_word}"
                if unknown:
                    message += f" (ignored out-of-vocabulary words: {unknown})"
                print(message)
    finally:
        model.train(was_training)


In [39]:


# Example contexts to evaluate the model; the second context is listed twice.
describe_context = [
    'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the',
    'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as',
]
anarchism_context = [
    'anarchism', 'originated', 'as', 'a', 'term', 'abuse', 'first', 'used', 'against', 'early',
]
test_sentences = [describe_context, list(anarchism_context), list(anarchism_context)]

# Evaluate the model using the test sentences
evaluate_model(model, test_sentences, cbow_dataset)


Context: ['describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as'], Predicted word: the
Context: ['anarchism', 'originated', 'as', 'a', 'term', 'abuse', 'first', 'used', 'against', 'early'], Predicted word: the
Context: ['anarchism', 'originated', 'as', 'a', 'term', 'abuse', 'first', 'used', 'against', 'early'], Predicted word: the


In [76]:
# Rebuild the dataset with the class defaults and report its size.
cbow_dataset = CBOWDataset(sentences=sentences)
print(f"Vocabulary size: {len(cbow_dataset.vocab)}")
print(f"Number of samples in the dataset: {len(cbow_dataset)}")


Generated 120143 samples from 12500 sentences
Vocabulary size: 5347
Number of samples in the dataset: 120143


In [41]:
# Lower min_count to include more words
cbow_dataset = CBOWDataset(sentences=sentences, min_count=1)


In [43]:
# Show the first five token sequences, numbered from 1.
first_sentences = sentences[:5]
for i, sentence in enumerate(first_sentences):
    print(f"Sentence {i+1}: {sentence}")



Sentence 1: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english']
Sentence 2: ['revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to']
Sentence 3: ['describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as']
Sentence 4: ['a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king']
Sentence 5: ['anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing']
