In [7]:
from collections import Counter
import re
import string
def preprocess_sentences(corpus, min_word_freq=10, min_sentence_len=8):
    # Remove punctuation and numbers (Roman numerals and others)
    corpus = re.sub(r'\b[MDCLXVI]+\b|\d+', '', corpus)
    corpus = corpus.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the corpus
    sentences = [sent.strip().split() for sent in corpus.lower().split('.')]
    
    # Filter out short sentences
    sentences = [sent for sent in sentences if len(sent) >= min_sentence_len]
    
    # Filter out rare words and short sentences
    word_freq = Counter([word for sent in sentences for word in sent])
    sentences = [[word for word in sent if word_freq[word] >= min_word_freq] for sent in sentences]
    
    return [sent for sent in sentences if len(sent) >= min_sentence_len]

In [8]:
from datasets import load_dataset


dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

In [9]:
corpus = ""
for i in range(5000):
  corpus= corpus +"."+ train_data[i]["text"]
  # print(i)
# print(corpus)


In [10]:
corpus_Set = preprocess_sentences(corpus)

In [16]:
corpus = ""
for x in corpus_Set:
    for i in x:
        corpus = corpus + " " + i

In [17]:
corpus

' valkyria chronicles no valkyria chronicles japanese valkyria of the commonly referred to as valkyria chronicles outside japan is a role playing video game developed by and for the released in january in japan it is the third game in the valkyria series the same of and real time gameplay as its the story runs to the first game and follows the nameless a military unit serving the nation of during the second war who perform secret black operations and are against the imperial unit the game began development in carrying over a large portion of the work done on valkyria chronicles while it the standard features of the series it also multiple such as making the game more for series character and both returned from previous along with valkyria chronicles director a large team of writers the script the game s opening theme was by may n it met with positive sales in japan and was praised by both japanese and western critics after release it received content along with an expanded edition in n

In [20]:
train_text = train_data["text"][:1000]

In [24]:
import numpy as np
from collections import Counter
import random
import re

def preprocess_corpus(corpus, min_word_freq=5):
    """Preprocess the corpus by removing non-English words, Roman numerals, and infrequent words."""
    tokenized_corpus = []
    word_freq = Counter()

    roman_numerals = re.compile(r"\bM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\b")

    non_english_words = re.compile(r"[^a-zA-Z]")

    for sentence in corpus:
        words = re.findall(r'\b\w+\b', sentence.lower())

        words = [word for word in words if not non_english_words.search(word) and not roman_numerals.match(word)]

        word_freq.update(words)

        tokenized_corpus.append(words)

    vocab = {word for word, count in word_freq.items() if count >= min_word_freq}

    cleaned_corpus = [[word for word in sentence if word in vocab] for sentence in tokenized_corpus]

    return cleaned_corpus, vocab

def build_vocab(cleaned_corpus):
    """Build vocabulary from the cleaned corpus."""
    word_counts = Counter([word for sentence in cleaned_corpus for word in sentence])
    vocab = {word: i for i, word in enumerate(word_counts.keys())}
    vocab_size = len(vocab)
    return vocab, vocab_size

def generate_batch(tokenized_corpus, vocab, window_size=2, batch_size=64):
    """Generate context-target pairs in batches dynamically."""
    context_target_pairs = []
    for sentence in tokenized_corpus:
        if len(sentence) < window_size * 2 + 1:
            continue
        for i in range(window_size, len(sentence) - window_size):
            target = sentence[i]
            context = sentence[i - window_size:i] + sentence[i + 1:i + 1 + window_size]
            context_target_pairs.append((context, target))
    
    random.shuffle(context_target_pairs)

    for i in range(0, len(context_target_pairs), batch_size):
        batch_contexts, batch_targets = [], []
        batch = context_target_pairs[i:i + batch_size]
        for context_words, target_word in batch:
            batch_contexts.append([one_hot_encode(word, vocab) for word in context_words])
            batch_targets.append(one_hot_encode(target_word, vocab))
        yield np.array(batch_contexts), np.array(batch_targets)

def one_hot_encode(word, vocab):
    """Convert a word into a one-hot encoded vector."""
    one_hot_vector = np.zeros(len(vocab))
    one_hot_vector[vocab[word]] = 1
    return one_hot_vector

class CBOWModel:
    def __init__(self, vocab_size, hidden_size, learning_rate=0.05):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        
        # Initialize weights
        self.W = np.random.uniform(-0.8, 0.8, (vocab_size, hidden_size))  # Input -> Hidden
        self.W1 = np.random.uniform(-0.8, 0.8, (hidden_size, vocab_size))  # Hidden -> Output

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x)

    def forward(self, context_vectors):
        h = np.sum(np.dot(context_vectors, self.W), axis=0)  # Sum of context word vectors
        u = np.dot(h, self.W1)  # Hidden layer to output layer
        y_pred = self.softmax(u)  # Softmax activation
        return y_pred, h

    def backward(self, y_pred, target_vector, context_vectors, h):
        error = y_pred - target_vector  # Error at output layer
        dW1 = np.outer(h, error)  # Gradient for W1
        dW = np.sum([np.outer(context, np.dot(self.W1, error)) for context in context_vectors], axis=0)  # Gradient for W
        
        # Update weights
        self.W1 -= self.learning_rate * dW1
        self.W -= self.learning_rate * dW

    def train(self, context_vectors, target_vector):
        # Forward pass
        y_pred, h = self.forward(context_vectors)
        
        # Backward pass
        self.backward(y_pred, target_vector, context_vectors, h)
        
        # Loss (cross-entropy)
        loss = -np.log(y_pred[np.argmax(target_vector)] + 1e-9)  # To avoid log(0)
        return loss

def train_cbow(corpus, hidden_size=50, window_size=2, batch_size=64, epochs=5):
    cleaned_corpus, vocab = preprocess_corpus(corpus)

    vocab, vocab_size = build_vocab(cleaned_corpus)

    model = CBOWModel(vocab_size, hidden_size)

    # Training loop
    for epoch in range(epochs):
        total_loss = 0
        batch_generator = generate_batch(cleaned_corpus, vocab, window_size, batch_size)
        
        for batch_contexts, batch_targets in batch_generator:
            for i in range(batch_size):
                context_vectors = batch_contexts[i]
                target_vector = batch_targets[i]
                loss = model.train(context_vectors, target_vector)
                total_loss += loss

        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

    return model, vocab


# Train the model with regex-based preprocessing
model, vocab = train_cbow(corpus, hidden_size=50, window_size=2, batch_size=64, epochs=5)


Epoch 0, Loss: 0.0000
Epoch 1, Loss: 0.0000
Epoch 2, Loss: 0.0000
Epoch 3, Loss: 0.0000
Epoch 4, Loss: 0.0000
