# CBOW dataset
 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import random
import pandas as pd
from tqdm import tqdm 
from gensim.models import Word2Vec

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
# Count total tokens in the entire dataset
total_tokens = sum(len(sentence) for sentence in sentences)

print(f"Total number of tokens in the dataset: {total_tokens}")


NameError: name 'sentences' is not defined

In [3]:
def load_and_preprocess_data(file_path, max_tokens=20000):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split text into words
    words = text.split()

    # Limit to the first 2000 tokens
    words = words[:max_tokens]

    # Convert list of words into sequences of length `context_size`
    sequence_length = 20  # Sequence length could be smaller for CBOW
    sequences = [words[i:i + sequence_length] for i in range(0, len(words), sequence_length)]

    return sequences

# Load the data
sentences = load_and_preprocess_data('text8', max_tokens=500000)


In [4]:
class CBOWDataset(Dataset):
    def __init__(self, sentences, min_count=5, context_size=2):
        self.sentences = sentences
        self.context_size = context_size
        self.vocab = self.build_vocab(sentences, min_count)
        self.word_to_idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx_to_word = {i: word for i, word in enumerate(self.vocab)}
        self.data = []

        for sentence in sentences:
            for i in range(context_size, len(sentence) - context_size):
                context = (
                    sentence[i - context_size:i] + sentence[i + 1:i + context_size + 1]
                )
                target = sentence[i]
                
                if all(word in self.word_to_idx for word in context + [target]):
                    self.data.append((context, target))
        
        # Debugging statement to check samples
        print(f"Generated {len(self.data)} samples from {len(sentences)} sentences")

    def build_vocab(self, sentences, min_count):
        word_counts = Counter([word for sentence in sentences for word in sentence])
        return [word for word, count in word_counts.items() if count >= min_count]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context_idxs = torch.tensor([self.word_to_idx[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor(self.word_to_idx[target], dtype=torch.long)
        return context_idxs, target_idx


In [14]:
# class CBOWModel(nn.Module):
#     def __init__(self, vocab_size, embedding_dim):
#         super(CBOWModel, self).__init__()
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim)
#         self.linear = nn.Linear(embedding_dim, vocab_size)

#     def forward(self, inputs):
#         # Inputs are context word indices
#         embeds = self.embeddings(inputs)
#         context_embed = torch.mean(embeds, dim=1)  # Mean of context embeddings
#         output = self.linear(context_embed)
#         return output


In [20]:
class CBOWModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super(CBOWModel, self).__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        self.linear = nn.Linear(
            in_features=embedding_dim,
            out_features=vocab_size,
        )

    def forward(self, inputs_):
        # Get the embeddings for the context words
        x = self.embeddings(inputs_)
        # Mean of the context word embeddings
        x = x.mean(axis=1)
        # Pass through linear layer to predict target word
        x = self.linear(x)
        return x


In [21]:
class CBOWDataset(Dataset):
    def __init__(self, sentences, min_count=5, context_size=2):
        self.sentences = sentences
        self.context_size = context_size
        self.vocab = self.build_vocab(sentences, min_count)
        self.word_to_idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx_to_word = {i: word for i, word in enumerate(self.vocab)}
        self.data = []

        for sentence in sentences:
            # Ensure enough context around the target word
            for i in range(context_size, len(sentence) - context_size):
                context = (
                    sentence[i - context_size:i] + sentence[i + 1:i + context_size + 1]
                )
                target = sentence[i]
                
                if all(word in self.word_to_idx for word in context + [target]):
                    self.data.append((context, target))
        
        print(f"Generated {len(self.data)} samples from {len(sentences)} sentences")

    def build_vocab(self, sentences, min_count):
        word_counts = Counter([word for sentence in sentences for word in sentence])
        return [word for word, count in word_counts.items() if count >= min_count]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context_idxs = torch.tensor([self.word_to_idx[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor(self.word_to_idx[target], dtype=torch.long)
        return context_idxs, target_idx


In [24]:
def train_cbow_model(dataset, embedding_dim=128, batch_size=32, num_epochs=50, learning_rate=0.0001, val_split=0.2):
    # Split the dataset into training and validation sets
    train_size = int(len(dataset) * (1 - val_split))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model
    model = CBOWModel(len(dataset.vocab), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    wandb.init(project="cbow_training", entity="omareweis123")  # Initialize W&B
    
    wandb.config.update({
        "embedding_dim": embedding_dim,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "learning_rate": learning_rate,
        "val_split": val_split
    })

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        
        for batch_idx, (context, target) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            context, target = context.to(device), target.to(device)
            optimizer.zero_grad()
            log_probs = model(context)  # Get raw logits from the model
            loss = criterion(log_probs, target)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

            wandb.log({"batch_train_loss": loss.item(), "epoch": epoch + 1})  # Log batch loss
        
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Validation loop
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for context, target in val_dataloader:
                context, target = context.to(device), target.to(device)
                log_probs = model(context)
                loss = criterion(log_probs, target)
                total_val_loss += loss.item()
        
        avg_val_loss = total_val_loss / len(val_dataloader)

        # Log epoch-level losses
        wandb.log({
            "epoch_train_loss": avg_train_loss,
            "epoch_val_loss": avg_val_loss,
            "epoch": epoch + 1
        })

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    torch.save(model.state_dict(), "cbow_model_weights.pth")
    wandb.finish()  # Complete the W&B run

    return model


In [25]:
# Train the model with dataset split into training and validation sets
cbow_dataset = CBOWDataset(sentences, min_count=5, context_size=2)
model = train_cbow_model(cbow_dataset)

Generated 276575 samples from 25000 sentences


100%|██████████| 6915/6915 [00:11<00:00, 626.79it/s]


Epoch 1/50, Train Loss: 6.3153, Val Loss: 5.9562


100%|██████████| 6915/6915 [00:11<00:00, 581.56it/s]


Epoch 2/50, Train Loss: 4.9572, Val Loss: 5.9730


100%|██████████| 6915/6915 [00:12<00:00, 571.42it/s]


Epoch 3/50, Train Loss: 4.2072, Val Loss: 6.1804


100%|██████████| 6915/6915 [00:12<00:00, 573.55it/s]


Epoch 4/50, Train Loss: 3.7067, Val Loss: 6.4496


100%|██████████| 6915/6915 [00:11<00:00, 576.42it/s]


Epoch 5/50, Train Loss: 3.3705, Val Loss: 6.7398


100%|██████████| 6915/6915 [00:11<00:00, 583.62it/s]


Epoch 6/50, Train Loss: 3.1385, Val Loss: 7.0152


100%|██████████| 6915/6915 [00:11<00:00, 576.71it/s]


Epoch 7/50, Train Loss: 2.9711, Val Loss: 7.2708


100%|██████████| 6915/6915 [00:12<00:00, 563.95it/s]


Epoch 8/50, Train Loss: 2.8491, Val Loss: 7.4950


100%|██████████| 6915/6915 [00:12<00:00, 570.05it/s]


Epoch 9/50, Train Loss: 2.7519, Val Loss: 7.7207


100%|██████████| 6915/6915 [00:12<00:00, 575.26it/s]


Epoch 10/50, Train Loss: 2.6750, Val Loss: 7.9232


100%|██████████| 6915/6915 [00:11<00:00, 576.28it/s]


Epoch 11/50, Train Loss: 2.6124, Val Loss: 8.1160


100%|██████████| 6915/6915 [00:12<00:00, 565.48it/s]


Epoch 12/50, Train Loss: 2.5641, Val Loss: 8.3034


100%|██████████| 6915/6915 [00:12<00:00, 575.29it/s]


Epoch 13/50, Train Loss: 2.5200, Val Loss: 8.4706


100%|██████████| 6915/6915 [00:12<00:00, 575.73it/s]


Epoch 14/50, Train Loss: 2.4836, Val Loss: 8.6329


100%|██████████| 6915/6915 [00:11<00:00, 582.56it/s]


Epoch 15/50, Train Loss: 2.4493, Val Loss: 8.7868


100%|██████████| 6915/6915 [00:11<00:00, 577.03it/s]


Epoch 16/50, Train Loss: 2.4224, Val Loss: 8.9332


100%|██████████| 6915/6915 [00:12<00:00, 571.26it/s]


Epoch 17/50, Train Loss: 2.3980, Val Loss: 9.0707


100%|██████████| 6915/6915 [00:12<00:00, 574.57it/s]


Epoch 18/50, Train Loss: 2.3763, Val Loss: 9.2064


100%|██████████| 6915/6915 [00:12<00:00, 574.23it/s]


Epoch 19/50, Train Loss: 2.3538, Val Loss: 9.3510


100%|██████████| 6915/6915 [00:12<00:00, 569.51it/s]


Epoch 20/50, Train Loss: 2.3383, Val Loss: 9.4601


100%|██████████| 6915/6915 [00:12<00:00, 573.86it/s]


Epoch 21/50, Train Loss: 2.3215, Val Loss: 9.5965


100%|██████████| 6915/6915 [00:12<00:00, 566.37it/s]


Epoch 22/50, Train Loss: 2.3046, Val Loss: 9.7180


100%|██████████| 6915/6915 [00:11<00:00, 581.71it/s]


Epoch 23/50, Train Loss: 2.2907, Val Loss: 9.8352


100%|██████████| 6915/6915 [00:11<00:00, 577.25it/s]


Epoch 24/50, Train Loss: 2.2781, Val Loss: 9.9471


100%|██████████| 6915/6915 [00:12<00:00, 570.97it/s]


Epoch 25/50, Train Loss: 2.2676, Val Loss: 10.0611


100%|██████████| 6915/6915 [00:12<00:00, 565.98it/s]


Epoch 26/50, Train Loss: 2.2536, Val Loss: 10.1679


100%|██████████| 6915/6915 [00:12<00:00, 573.11it/s]


Epoch 27/50, Train Loss: 2.2461, Val Loss: 10.2743


100%|██████████| 6915/6915 [00:12<00:00, 573.24it/s]


Epoch 28/50, Train Loss: 2.2383, Val Loss: 10.3647


100%|██████████| 6915/6915 [00:12<00:00, 573.94it/s]


Epoch 29/50, Train Loss: 2.2292, Val Loss: 10.4622


100%|██████████| 6915/6915 [00:12<00:00, 575.75it/s]


Epoch 30/50, Train Loss: 2.2206, Val Loss: 10.5621


100%|██████████| 6915/6915 [00:12<00:00, 569.11it/s]


Epoch 31/50, Train Loss: 2.2142, Val Loss: 10.6721


100%|██████████| 6915/6915 [00:12<00:00, 575.42it/s]


Epoch 32/50, Train Loss: 2.2034, Val Loss: 10.7499


100%|██████████| 6915/6915 [00:12<00:00, 575.78it/s]


Epoch 33/50, Train Loss: 2.1976, Val Loss: 10.8379


100%|██████████| 6915/6915 [00:12<00:00, 567.80it/s]


Epoch 34/50, Train Loss: 2.1903, Val Loss: 10.9344


100%|██████████| 6915/6915 [00:12<00:00, 568.02it/s]


Epoch 35/50, Train Loss: 2.1832, Val Loss: 11.0538


100%|██████████| 6915/6915 [00:12<00:00, 564.22it/s]


Epoch 36/50, Train Loss: 2.1767, Val Loss: 11.1619


100%|██████████| 6915/6915 [00:12<00:00, 572.54it/s]


Epoch 37/50, Train Loss: 2.1762, Val Loss: 11.2078


100%|██████████| 6915/6915 [00:12<00:00, 570.02it/s]


Epoch 38/50, Train Loss: 2.1668, Val Loss: 11.3175


100%|██████████| 6915/6915 [00:12<00:00, 569.77it/s]


Epoch 39/50, Train Loss: 2.1612, Val Loss: 11.4016


100%|██████████| 6915/6915 [00:12<00:00, 564.69it/s]


Epoch 40/50, Train Loss: 2.1550, Val Loss: 11.4846


100%|██████████| 6915/6915 [00:11<00:00, 577.24it/s]


Epoch 41/50, Train Loss: 2.1503, Val Loss: 11.5787


100%|██████████| 6915/6915 [00:12<00:00, 573.74it/s]


Epoch 42/50, Train Loss: 2.1490, Val Loss: 11.6373


100%|██████████| 6915/6915 [00:12<00:00, 566.50it/s]


Epoch 43/50, Train Loss: 2.1447, Val Loss: 11.7084


100%|██████████| 6915/6915 [00:12<00:00, 562.55it/s]


Epoch 44/50, Train Loss: 2.1373, Val Loss: 11.8070


100%|██████████| 6915/6915 [00:12<00:00, 572.55it/s]


Epoch 45/50, Train Loss: 2.1343, Val Loss: 11.8996


100%|██████████| 6915/6915 [00:12<00:00, 574.21it/s]


Epoch 46/50, Train Loss: 2.1320, Val Loss: 11.9717


100%|██████████| 6915/6915 [00:12<00:00, 570.36it/s]


Epoch 47/50, Train Loss: 2.1281, Val Loss: 12.0550


100%|██████████| 6915/6915 [00:12<00:00, 575.60it/s]


Epoch 48/50, Train Loss: 2.1207, Val Loss: 12.1342


100%|██████████| 6915/6915 [00:12<00:00, 572.59it/s]


Epoch 49/50, Train Loss: 2.1199, Val Loss: 12.2120


100%|██████████| 6915/6915 [00:12<00:00, 571.03it/s]


Epoch 50/50, Train Loss: 2.1153, Val Loss: 12.2943


0,1
batch_train_loss,█▇▄▅▃▄▂▂▂▄▂▁▂▂▃▂▂▂▃▃▁▂▃▂▃▂▂▃▁▂▂▂▂▂▂▃▁▂▃▂
epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
epoch_train_loss,█▆▄▄▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_val_loss,▁▁▁▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇█████

0,1
batch_train_loss,1.25188
epoch,50.0
epoch_train_loss,2.11531
epoch_val_loss,12.29427


In [6]:
def train_model(dataset, embedding_dim=512, batch_size=32, num_epochs=25, learning_rate=0.005):
    model = CBOWModel(len(dataset.vocab), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, (context_words, target_word) in tqdm(enumerate(dataloader), total=len(dataloader)):
            context_words, target_word = context_words.to(device), target_word.to(device)
            optimizer.zero_grad()
            log_probs = model(context_words)
            loss = criterion(log_probs, target_word)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")
    
    return model


In [17]:
from sklearn.model_selection import train_test_split

def train_cbow_model(dataset, embedding_dim=128, batch_size=32, num_epochs=5, learning_rate=0.005, val_split=0.2):
    # Split the dataset into training and validation sets
    train_size = int(len(dataset) * (1 - val_split))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model setup
    model = CBOWModel(len(dataset.vocab), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        total_train_loss = 0
        
        # Training loop
        for batch_idx, (context, target) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            context, target = context.to(device), target.to(device)
            optimizer.zero_grad()
            log_probs = model(context)
            loss = criterion(log_probs, target)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Validation loop
        model.eval()  # Set the model to evaluation mode
        total_val_loss = 0
        with torch.no_grad():  # Disable gradient calculation for validation
            for context, target in val_dataloader:
                context, target = context.to(device), target.to(device)
                log_probs = model(context)
                loss = criterion(log_probs, target)
                total_val_loss += loss.item()
        
        avg_val_loss = total_val_loss / len(val_dataloader)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    return model

# Train the model with dataset split into training and validation sets
cbow_dataset = CBOWDataset(sentences, min_count=5, context_size=2)
model = train_cbow_model(cbow_dataset)


Generated 276575 samples from 25000 sentences


TypeError: CBOWModel.__init__() takes 2 positional arguments but 3 were given

In [19]:
import wandb
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Initialize W&B project
wandb.init(project="cbow_training", entity="omareweis123")  # Replace with your W&B username

def train_cbow_model(dataset, embedding_dim=128, batch_size=32, num_epochs=5, learning_rate=0.005, val_split=0.2):
    # Split the dataset into training and validation sets
    train_size = int(len(dataset) * (1 - val_split))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model setup
    model = CBOWModel(len(dataset.vocab), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Track hyperparameters and model architecture in W&B
    wandb.config.update({
        "embedding_dim": embedding_dim,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "learning_rate": learning_rate,
        "val_split": val_split
    })

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        total_train_loss = 0
        
        # Training loop
        for batch_idx, (context, target) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            context, target = context.to(device), target.to(device)
            optimizer.zero_grad()
            log_probs = model(context)
            loss = criterion(log_probs, target)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

            # Log batch loss to W&B
            wandb.log({"batch_train_loss": loss.item(), "epoch": epoch+1})
        
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Validation loop
        model.eval()  # Set the model to evaluation mode
        total_val_loss = 0
        with torch.no_grad():  # Disable gradient calculation for validation
            for context, target in val_dataloader:
                context, target = context.to(device), target.to(device)
                log_probs = model(context)
                loss = criterion(log_probs, target)
                total_val_loss += loss.item()
        
        avg_val_loss = total_val_loss / len(val_dataloader)

        # Log epoch-level train/val losses to W&B
        wandb.log({
            "epoch_train_loss": avg_train_loss,
            "epoch_val_loss": avg_val_loss,
            "epoch": epoch+1
        })

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Save the model weights
    torch.save(model.state_dict(), "cbow_model_weights.pth")

    WANDB_HTTP_TIMEOUT=500
    
    # Finish the W&B run
    wandb.finish()

    return model

# Train the model with dataset split into training and validation sets
cbow_dataset = CBOWDataset(sentences, min_count=5, context_size=2)
model = train_cbow_model(cbow_dataset)


Generated 276575 samples from 25000 sentences


TypeError: super(type, obj): obj must be an instance or subtype of type

In [16]:
def train_cbow_model(dataset, embedding_dim=128, batch_size=32, num_epochs=10, learning_rate=0.05):
    model = CBOWModel(len(dataset.vocab), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, (context, target) in tqdm(enumerate(dataloader), total=len(dataloader)):
            context, target = context.to(device), target.to(device)
            optimizer.zero_grad()
            log_probs = model(context)
            loss = criterion(log_probs, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

    return model

# Train the model
cbow_dataset = CBOWDataset(sentences, min_count=5, context_size=2)
model = train_cbow_model(cbow_dataset)


Generated 3525551 samples from 250000 sentences


100%|██████████| 110174/110174 [03:29<00:00, 526.74it/s]


Epoch 1/10, Loss: 6.5378


100%|██████████| 110174/110174 [03:31<00:00, 520.59it/s]


Epoch 2/10, Loss: 6.1894


100%|██████████| 110174/110174 [03:32<00:00, 518.73it/s]


Epoch 3/10, Loss: 6.0948


100%|██████████| 110174/110174 [03:32<00:00, 518.59it/s]


Epoch 4/10, Loss: 6.0515


100%|██████████| 110174/110174 [03:27<00:00, 530.37it/s]


Epoch 5/10, Loss: 6.0272


100%|██████████| 110174/110174 [03:29<00:00, 527.03it/s]


Epoch 6/10, Loss: 6.0135


100%|██████████| 110174/110174 [03:30<00:00, 522.66it/s]


Epoch 7/10, Loss: 6.0040


100%|██████████| 110174/110174 [03:25<00:00, 535.74it/s]


Epoch 8/10, Loss: 5.9980


100%|██████████| 110174/110174 [03:25<00:00, 534.99it/s]


Epoch 9/10, Loss: 5.9935


100%|██████████| 110174/110174 [03:32<00:00, 519.11it/s]

Epoch 10/10, Loss: 5.9910





In [14]:
cbow_dataset = CBOWDataset(sentences, min_count=5, context_size=5)
# Create an inverted dictionary for mapping indices back to words
idx_to_word = {idx: word for word, idx in cbow_dataset.word_to_idx.items()}

# Print out the context and target words for the first few samples
for i in range(9):  # Change 5 to however many samples you'd like to see
    context_indices, target_index = cbow_dataset[i]
    context_words = [idx_to_word[idx.item()] for idx in context_indices]
    target_word = idx_to_word[target_index.item()]
    print(f"Context: {context_words}, Target: {target_word}")



Generated 117866 samples from 25000 sentences
Context: ['anarchism', 'originated', 'as', 'a', 'term', 'abuse', 'first', 'used', 'against', 'early'], Target: of
Context: ['originated', 'as', 'a', 'term', 'of', 'first', 'used', 'against', 'early', 'working'], Target: abuse
Context: ['as', 'a', 'term', 'of', 'abuse', 'used', 'against', 'early', 'working', 'class'], Target: first
Context: ['a', 'term', 'of', 'abuse', 'first', 'against', 'early', 'working', 'class', 'radicals'], Target: used
Context: ['term', 'of', 'abuse', 'first', 'used', 'early', 'working', 'class', 'radicals', 'including'], Target: against
Context: ['of', 'abuse', 'first', 'used', 'against', 'working', 'class', 'radicals', 'including', 'the'], Target: early
Context: ['of', 'the', 'french', 'revolution', 'whilst', 'term', 'is', 'still', 'used', 'in'], Target: the
Context: ['the', 'french', 'revolution', 'whilst', 'the', 'is', 'still', 'used', 'in', 'a'], Target: term
Context: ['describe', 'any', 'act', 'that', 'used', 'm

In [14]:
cbow_dataset = CBOWDataset(sentences,min_count=5, context_size=5)
print(f"Vocabulary size: {len(cbow_dataset.vocab)}")
print(f"Number of samples in the dataset: {len(cbow_dataset)}")


Generated 1934891 samples from 250000 sentences
Vocabulary size: 36280
Number of samples in the dataset: 1934891


In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Move model to the appropriate device


CBOWModel(
  (embeddings): Embedding(36280, 128)
  (linear): Linear(in_features=128, out_features=36280, bias=True)
)

In [18]:
def evaluate_model(model, test_sentences, dataset):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        for sentence in test_sentences:
            # Get context words' indices
            context_indices = [dataset.word_to_idx[word] for word in sentence]
            context_tensor = torch.tensor(context_indices).unsqueeze(0).to(device)  # Move tensor to GPU

            # Get the model's prediction
            output = model(context_tensor)
            predicted_idx = torch.argmax(output, dim=1).item()

            # Convert the predicted index back to the word
            predicted_word = dataset.idx_to_word[predicted_idx]
            print(f"Context: {sentence}, Predicted word: {predicted_word}")


In [20]:


# Example sentences to evaluate the model
test_sentences = [
    ['describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as'],
    ['term', 'of', 'abuse', 'first', 'used', 'early', 'working', 'class', 'radicals', 'including'],
    ['anarchism', 'originated', 'as', 'a', 'term', 'abuse', 'first', 'used', 'against'],
]

# Evaluate the model using the test sentences
evaluate_model(model, test_sentences, cbow_dataset)


Context: ['describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as'], Predicted word: the
Context: ['term', 'of', 'abuse', 'first', 'used', 'early', 'working', 'class', 'radicals', 'including'], Predicted word: the
Context: ['anarchism', 'originated', 'as', 'a', 'term', 'abuse', 'first', 'used', 'against'], Predicted word: the


In [41]:
cbow_dataset = CBOWDataset(sentences, min_count=1)  # Lower min_count to include more words


In [43]:
for i, sentence in enumerate(sentences[:5]):
    print(f"Sentence {i+1}: {sentence}")



Sentence 1: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english']
Sentence 2: ['revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to']
Sentence 3: ['describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as']
Sentence 4: ['a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king']
Sentence 5: ['anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing']
