In [3]:
def preprocess_corpus(file_path, window_size=2, max_words=200000):
    """Read a whitespace-separated text file and build CBOW training pairs.

    Args:
        file_path: Path of a UTF-8 encoded text file.
        window_size: Number of words taken on each side of the target word.
        max_words: Only the first `max_words` words of the file are used.
            Pass None to use every word. The default keeps the 200000-word
            cap that used to be hard-coded here.

    Returns:
        A list of `(context, target)` tuples. `context` is the list of the
        `window_size` words before the target followed by the `window_size`
        words after it; `target` is the word in the middle. Words that do not
        have a full window on both sides are not used as targets.

    Raises:
        ValueError: If `window_size` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    with open(file_path, 'r', encoding='utf-8') as f:
        words = f.read().split()  # split() already ignores leading/trailing whitespace
    if max_words is not None:
        words = words[:max_words]

    return [
        (words[i - window_size:i] + words[i + 1:i + window_size + 1], words[i])
        for i in range(window_size, len(words) - window_size)
    ]

# Build (context, target) pairs from text8 using three words of context on
# each side, then preview the first five pairs.
file_path = "text8.txt"
window_size = 3
data = preprocess_corpus(file_path, window_size=window_size)
print(f"Sample data: {data[:5]}")

Sample data: [(['anarchism', 'originated', 'as', 'term', 'of', 'abuse'], 'a'), (['originated', 'as', 'a', 'of', 'abuse', 'first'], 'term'), (['as', 'a', 'term', 'abuse', 'first', 'used'], 'of'), (['a', 'term', 'of', 'first', 'used', 'against'], 'abuse'), (['term', 'of', 'abuse', 'used', 'against', 'early'], 'first')]


In [None]:
class Vocabulary:
    """Two-way lookup between words and consecutive integer indices."""

    def __init__(self):
        # word2idx maps word -> index; idx2word[index] gives the word back.
        self.idx2word = []
        self.word2idx = {}

    def build_vocab(self, corpus):
        """Register every unseen word of `corpus`, in first-occurrence order.

        Each new word receives the next free index (the current length of
        `idx2word`). Words that are already known keep their index, so the
        method can be called repeatedly to extend the vocabulary.
        """
        for token in corpus:
            if token in self.word2idx:
                continue
            self.word2idx[token] = len(self.idx2word)
            self.idx2word.append(token)

# Index every word that appears in a training pair, as context or as target.
vocab = Vocabulary()
vocab.build_vocab(
    token
    for context, target in data
    for token in context + [target]
)
print(f"Vocabulary size: {len(vocab.word2idx)}")

In [None]:
import torch
import torch.nn.functional as F

# Toy word -> index table for the small hard-coded example in this cell.
# Every index is below 128, the number of rows of the CBOW embedding defined below.
# NOTE: this rebinds `vocab`, which the previous cell set to a `Vocabulary`
# instance; cells that run afterwards see this plain dict instead.
vocab = {
    "Hello": 72,
    "my": 44,
    "name": 21,
    "is": 93,
    "Bes": 11
}

# Example sentence used for training; "name" is in `vocab` but not in the sentence.
sentence = ["Hello", "my", "is", "Bes"]

class CBOW(torch.nn.Module):
    """Continuous bag-of-words model: averaged context embeddings -> word scores.

    Args:
        vocab_size: Number of rows of the embedding table and number of output
            classes. Defaults to 128, the size that used to be hard-coded.
        embedding_dim: Width of each embedding vector. Defaults to 9, the
            width that used to be hard-coded.
    """

    def __init__(self, vocab_size=128, embedding_dim=9):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each context.

        `inputs` holds word indices; the call sites in this notebook pass a
        (batch, context_len) long tensor. The embeddings are averaged over
        dim 1 (the context words) and log-softmax is taken over dim 1 of the
        linear layer's output.
        """
        embs = self.emb(inputs).mean(dim=1)
        out = self.linear(embs)
        return F.log_softmax(out, dim=1)


In [2]:
def train(model, sentence, vocab, optimizer, epochs=10):
    """Train `model` on one sentence, one leave-one-out example per word.

    For every position in `sentence`, the word at that position is the target
    and all remaining words form the context. `vocab` is indexed as
    `vocab[word]` to obtain integer indices. After each epoch the summed loss
    of all examples is printed. Returns None.
    """
    model.train()
    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for position, center_word in enumerate(sentence):
            neighbours = sentence[:position] + sentence[position + 1:]

            context_ids = torch.tensor([vocab[w] for w in neighbours], dtype=torch.long)
            target_id = torch.tensor([vocab[center_word]], dtype=torch.long)

            model.zero_grad()
            # unsqueeze(0) adds the batch dimension the model averages over dim 1 of.
            step_loss = F.nll_loss(model(context_ids.unsqueeze(0)), target_id)
            step_loss.backward()
            optimizer.step()

            running_loss += step_loss.item()

        print(f"Epoch: {epoch_no}, Loss: {running_loss}")

# Fit the toy CBOW model on the single example sentence with plain SGD.
model = CBOW()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)
train(model, sentence, vocab, optimizer, epochs=10)

Epoch: 1, Loss: 19.655333042144775
Epoch: 2, Loss: 19.640313148498535
Epoch: 3, Loss: 19.62529754638672
Epoch: 4, Loss: 19.61028528213501
Epoch: 5, Loss: 19.595277309417725
Epoch: 6, Loss: 19.58027219772339
Epoch: 7, Loss: 19.565271377563477
Epoch: 8, Loss: 19.550273895263672
Epoch: 9, Loss: 19.53528118133545
Epoch: 10, Loss: 19.520292282104492


In [4]:
import torch
import torch.nn.functional as F

class Vocabulary:
    """Word <-> index table that grows as new words are encountered."""

    def __init__(self):
        # word2idx: word -> integer index; idx2word: index -> word (by list position).
        self.word2idx, self.idx2word = {}, []

    def build_vocab(self, corpus):
        """Add each word of `corpus` that is not yet known.

        A new word is assigned the current length of `idx2word` as its index
        and appended to `idx2word`; known words are skipped.
        """
        index_of = self.word2idx
        words_by_index = self.idx2word
        for token in corpus:
            if token not in index_of:
                index_of[token] = len(words_by_index)
                words_by_index.append(token)

class CBOW(torch.nn.Module):
    """Continuous bag-of-words model: averaged context embeddings -> word scores.

    Args:
        vocab_size: Number of rows of the embedding table and number of output
            classes.
        embedding_dim: Width of each embedding vector. Defaults to 9, the
            width that used to be hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim=9):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each context.

        `inputs` holds word indices; the call sites in this notebook pass a
        (batch, context_len) long tensor. The embeddings are averaged over
        dim 1 (the context words) and log-softmax is taken over dim 1 of the
        linear layer's output.
        """
        embs = self.emb(inputs).mean(dim=1)
        out = self.linear(embs)
        return F.log_softmax(out, dim=1)

def preprocess_corpus(file_path, window_size=2, max_words=200000):
    """Read a whitespace-separated text file and build CBOW training pairs.

    Args:
        file_path: Path of a UTF-8 encoded text file.
        window_size: Number of words taken on each side of the target word.
        max_words: Only the first `max_words` words of the file are used.
            Pass None to use every word. The default keeps the 200000-word
            cap that used to be hard-coded here.

    Returns:
        A list of `(context, target)` tuples. `context` is the list of the
        `window_size` words before the target followed by the `window_size`
        words after it; `target` is the word in the middle. Words that do not
        have a full window on both sides are not used as targets.

    Raises:
        ValueError: If `window_size` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    with open(file_path, 'r', encoding='utf-8') as f:
        words = f.read().split()  # split() already ignores leading/trailing whitespace
    if max_words is not None:
        words = words[:max_words]

    return [
        (words[i - window_size:i] + words[i + 1:i + window_size + 1], words[i])
        for i in range(window_size, len(words) - window_size)
    ]




In [5]:
def train(model, data, vocab, optimizer, epochs=10):
    """Train `model` with one optimizer step per `(context, target)` pair.

    `data` is iterated once per epoch and yields `(context_words, target_word)`
    pairs; words are converted to indices through `vocab.word2idx`. After each
    epoch the summed loss of all pairs is printed. Returns None.
    """
    model.train()
    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for neighbours, center_word in data:
            context_ids = [vocab.word2idx[w] for w in neighbours]
            # unsqueeze(0) turns the single example into a batch of one.
            context_batch = torch.tensor(context_ids, dtype=torch.long).unsqueeze(0)
            target_batch = torch.tensor([vocab.word2idx[center_word]], dtype=torch.long)

            model.zero_grad()
            step_loss = F.nll_loss(model(context_batch), target_batch)
            step_loss.backward()
            optimizer.step()

            running_loss += step_loss.item()

        print(f"Epoch: {epoch_no}, Loss: {running_loss}")

# Load training pairs from text8 with two words of context on each side.
file_path = "text8.txt"
window_size = 2
data = preprocess_corpus(file_path, window_size)

# Index every word that occurs in a training pair (context or target).
vocab = Vocabulary()
vocab.build_vocab(
    token
    for context, target in data
    for token in context + [target]
)
vocab_size = len(vocab.word2idx)
print(f"Vocabulary size: {vocab_size}")

# Size the model to the vocabulary and train it one pair at a time.
model = CBOW(vocab_size)
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)
train(model, data, vocab, optimizer, epochs=3)

Vocabulary size: 19062
Epoch: 1, Loss: 1810987.8712714612


KeyboardInterrupt: 

In [27]:
import torch
import torch.nn.functional as F

class Vocabulary:
    """Assigns consecutive integer indices to words and maps both ways."""

    def __init__(self):
        # idx2word is the inverse of word2idx: idx2word[word2idx[w]] == w.
        self.idx2word = []
        self.word2idx = {}

    def build_vocab(self, corpus):
        """Add the previously unseen words of `corpus` in order of appearance.

        The index given to a new word is the number of words known so far.
        Words already present are left untouched.
        """
        for token in corpus:
            already_known = token in self.word2idx
            if not already_known:
                self.word2idx[token] = len(self.idx2word)
                self.idx2word.append(token)

class CBOW(torch.nn.Module):
    """Continuous bag-of-words model with a configurable embedding width.

    Context word embeddings are averaged and projected back onto the
    vocabulary by a single linear layer.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.emb = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = torch.nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each row of `inputs`.

        The embeddings of `inputs` are averaged over dim 1 before the linear
        layer; log-softmax is applied over dim 1 of the result.
        """
        pooled = self.emb(inputs).mean(dim=1)
        logits = self.linear(pooled)
        return F.log_softmax(logits, dim=1)

def preprocess_corpus(file_path, window_size=2, max_words=1000000):
    """Read a whitespace-separated text file and build CBOW training pairs.

    Args:
        file_path: Path of a UTF-8 encoded text file.
        window_size: Number of words taken on each side of the target word.
        max_words: Only the first `max_words` words of the file are used.
            Pass None to use every word. The default keeps the 1000000-word
            cap that used to be hard-coded here.

    Returns:
        A list of `(context, target)` tuples. `context` is the list of the
        `window_size` words before the target followed by the `window_size`
        words after it; `target` is the word in the middle. Words that do not
        have a full window on both sides are not used as targets.

    Raises:
        ValueError: If `window_size` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    with open(file_path, 'r', encoding='utf-8') as f:
        words = f.read().split()  # split() already ignores leading/trailing whitespace
    if max_words is not None:
        words = words[:max_words]

    return [
        (words[i - window_size:i] + words[i + 1:i + window_size + 1], words[i])
        for i in range(window_size, len(words) - window_size)
    ]

def create_batches(data, vocab, batch_size=32):
    """Convert `(context, target)` word pairs into mini-batches of index tensors.

    Args:
        data: Iterable of `(context_words, target_word)` pairs. It is consumed
            in a single pass. All contexts must have the same length so they
            can be stacked into one tensor.
        vocab: Object whose `word2idx` mapping turns a word into its index.
        batch_size: Maximum number of pairs per batch; must be positive.

    Returns:
        A list of `(batch_contexts, batch_targets)` tuples of long tensors in
        the original data order. Every batch holds `batch_size` pairs except
        possibly the last, which holds the remainder. Empty `data` yields an
        empty list.

    Raises:
        ValueError: If `batch_size` is not positive.
        KeyError: If a word is missing from `vocab.word2idx`.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    contexts, targets = [], []
    for context, target in data:
        contexts.append([vocab.word2idx[w] for w in context])
        targets.append(vocab.word2idx[target])

    # Convert to tensors
    contexts = torch.tensor(contexts, dtype=torch.long)
    targets = torch.tensor(targets, dtype=torch.long)

    # Stepping by batch_size covers the full batches and the shorter final
    # batch alike, because slicing past the end simply truncates.
    return [
        (contexts[start:start + batch_size], targets[start:start + batch_size])
        for start in range(0, len(targets), batch_size)
    ]

In [26]:
def train(model, data, vocab, optimizer, epochs=10, batch_size=32):
    """Train `model` on mini-batches produced by `create_batches`.

    The batches are built once from `data` and `vocab` and reused, in the same
    order, for every epoch. One optimizer step is taken per batch, and the sum
    of the per-batch losses is printed after each epoch. Returns None.
    """
    model.train()
    batches = create_batches(data, vocab, batch_size)
    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for context_batch, target_batch in batches:
            model.zero_grad()
            batch_loss = F.nll_loss(model(context_batch), target_batch)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        print(f"Epoch: {epoch_no}, Loss: {running_loss}")

# Load training pairs from text8 with two words of context on each side.
file_path = "text8.txt"
window_size = 2
data = preprocess_corpus(file_path, window_size)

# Index every word that occurs in a training pair (context or target).
vocab = Vocabulary()
vocab.build_vocab(
    token
    for context, target in data
    for token in context + [target]
)
vocab_size = len(vocab.word2idx)
print(f"Vocabulary size: {vocab_size}")

# Size the model to the vocabulary and train it on mini-batches of 256 pairs.
embedding_dim = 10  # Choose an appropriate embedding dimension
model = CBOW(vocab_size, embedding_dim)
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)
train(model, data, vocab, optimizer, epochs=10, batch_size=256)


Vocabulary size: 78382


KeyboardInterrupt: 

In [20]:
import torch
import torch.nn as nn

# Query words for the similarity lookup at the end of this cell. The same list
# is also passed there as `excluded_words`, so these words are never printed
# as neighbours of one another.
target_words = ["finding", "sample", "most"]

# Function to get word embeddings
def get_embedding(word):
    """Return the embedding row of `word` from the global `model`, or None.

    Reads the notebook-level globals `vocab` and `model`. `vocab` is rebound
    across cells, sometimes to a plain word -> index dict and sometimes to a
    `Vocabulary` instance, so both forms are accepted here; calling `.get`
    directly on a `Vocabulary` instance would raise AttributeError.

    Args:
        word: The word to look up.

    Returns:
        `model.emb.weight[idx]` for the word's index, or None (after printing
        a message) when the word is not in the vocabulary.
    """
    # Use the `word2idx` mapping when `vocab` has one, otherwise treat `vocab`
    # itself as the mapping.
    word2idx = getattr(vocab, "word2idx", vocab)
    idx = word2idx.get(word)
    if idx is None:
        print(f"'{word}' is not in the vocabulary.")
        return None
    return model.emb.weight[idx]

# Function to find the most similar words
def find_most_similar(word, top_n=5, excluded_words=None):
    """Print the `top_n` vocabulary words most similar to `word`.

    Similarity is the cosine similarity between rows of `model.emb.weight`.
    Reads the notebook-level globals `vocab` and `model` and calls
    `get_embedding`. `vocab` may be a plain word -> index dict or a
    `Vocabulary` instance; calling `.items()` directly on a `Vocabulary`
    instance would raise AttributeError.

    Args:
        word: Query word. It is never listed as its own neighbour.
        top_n: Maximum number of neighbours to print.
        excluded_words: Optional collection of words to leave out of the
            printed results.

    Returns:
        None. Nothing is printed here when `get_embedding` returns None for
        the query word.
    """
    embedding = get_embedding(word)
    if embedding is None:
        return

    # Use the `word2idx` mapping when `vocab` has one, otherwise treat `vocab`
    # itself as the mapping.
    word2idx = getattr(vocab, "word2idx", vocab)

    cos_sim = nn.CosineSimilarity(dim=0)
    similarities = {}
    # Only the numeric scores are needed, so no autograd graph has to be built.
    with torch.no_grad():
        for other_word, other_idx in word2idx.items():
            if other_word != word:
                other_embedding = model.emb.weight[other_idx]
                similarities[other_word] = cos_sim(embedding, other_embedding).item()

    # Highest similarity first; the sort is stable, so ties keep vocabulary order.
    sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)

    print(f"Top {top_n} words similar to '{word}':")
    count = 0
    for similar_word, similarity_score in sorted_similarities:
        if excluded_words is None or similar_word not in excluded_words:
            print(f"  {similar_word}: {similarity_score:.4f}")
            count += 1
            if count >= top_n:
                break

# Print the five nearest neighbours of each query word, leaving the query
# words themselves out of every result list.
for query_word in target_words:
    find_most_similar(query_word, top_n=5, excluded_words=target_words)



Top 5 words similar to 'finding':
  this: 1.0000
  is: 1.0000
  a: 1.0000
  text: 1.0000
  corpus: 1.0000
Top 5 words similar to 'sample':
  this: 1.0000
  is: 1.0000
  a: 1.0000
  text: 1.0000
  corpus: 1.0000
Top 5 words similar to 'most':
  this: 1.0000
  is: 1.0000
  a: 1.0000
  text: 1.0000
  corpus: 1.0000


In [18]:
import random

# The vocab variable is assumed to be defined in previous cells.
# It may be either a plain dictionary with words as keys
# (vocab = {"word1": index1, "word2": index2, ...}) or an object that exposes
# such a dictionary as `word2idx`, like the Vocabulary class in this notebook.

def print_random_words(vocab, num_words=10):
    """Prints a specified number of random words from the vocabulary.

    Args:
        vocab: A word -> index dictionary, or an object with a `word2idx`
            dictionary. Calling `.keys()` directly on the latter would raise
            AttributeError, so the dictionary is unwrapped first.
        num_words: How many distinct words to print. Values larger than the
            vocabulary are reduced to the vocabulary size.
    """
    word2idx = getattr(vocab, "word2idx", vocab)
    words = list(word2idx.keys())  # Get a list of words from the vocab dictionary
    num_words = min(num_words, len(words))  # Adjust if requesting more words than available
    random_words = random.sample(words, num_words)  # Select random words
    print("Random words from vocabulary:")
    for word in random_words:
        print(word)

# Example usage: Print 5 random words from the vocabulary
print_random_words(vocab, num_words=5)


Random words from vocabulary:
finding
sample
most
find
their
