## Intro

In [8]:
!pip install evaluate
!pip install jiwer

import nltk
import re
import math
import random
import numpy as np
from nltk.corpus import reuters
from collections import defaultdict, Counter
from typing import List, Tuple, Set
from tqdm import tqdm
from nltk import sent_tokenize, word_tokenize
import heapq
import string
from nltk.tokenize import RegexpTokenizer
import evaluate


nltk.download('reuters')
nltk.download('brown')
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Part 1

We create a class that will contain all the methods necessary for training the model

In [9]:
# PART 1: N-GRAM LANGUAGE MODEL IMPLEMENTATION
# ===========================================

class NGramLanguageModel:
    def __init__(self, n: int, min_freq: int = 10, tokenizer: str = 'nltk'):
        """
        Initialize an n-gram language model.

        Args:
            n: The size of n-grams (2 for bigram, 3 for trigram)
            min_freq: Minimum frequency to include a word in vocabulary
        """
        self.n = n
        self.min_freq = min_freq
        self.tokenizer = tokenizer

        if tokenizer == 'regexp':
          self.regexp_tokenizer = RegexpTokenizer(pattern=r'\w+|\(|\)|\.|\,')

        # Main model components
        self.vocabulary = set()  # Words in the vocabulary
        self.word_counts = Counter()  # Counts of individual words
        self.ngram_counts = defaultdict(Counter)  # Counts of n-grams
        self.context_counts = defaultdict(int)  # Counts of (n-1)-grams (contexts)

        # Model constants
        self.UNK = "<UNK>"  # Out-of-vocabulary token
        self.END = "<end>"  # End of sentence token

        # Different start tokens for different n values
        if n == 2:
            self.START = ["<start>"]
        elif n == 3:
            self.START = ["<start1>", "<start2>"]
        else:
            self.START = [f"<start{i}>" for i in range(1, n)]

        # Statistics
        self.total_sentences = 0
        self.total_tokens = 0
        self.vocabulary_size = 0

    def custom_tokenize(self, text):
            """
            Custom tokenizer using regex to find word tokens.

            Args:
                text: Input text string

            Returns:
                List of tokens
            """
            return re.findall(r"\b\w+\b", text.lower())

    def preprocess_text(self, corpus: List[str]) -> List[List[str]]:
        """
        Preprocess raw text into tokenized sentences.

        Args:
            corpus: List of text passages (paragraphs, documents, etc.)

        Returns:
            List of tokenized sentences
        """
        tokenized_sentences = []

        for text in corpus:
            # First split the text into sentences
            sentences = sent_tokenize(text)

            for sentence in sentences:
                # Clean the sentence
                clean_sentence = sentence.lower().strip()

                # Apply selected tokenizer
                if self.tokenizer == "nltk":
                    tokens = word_tokenize(clean_sentence)
                elif self.tokenizer == "custom":
                    tokens = self.custom_tokenize(clean_sentence)
                elif self.tokenizer == "regexp":
                    tokens = self.regexp_tokenizer.tokenize(clean_sentence)
                else:
                    raise ValueError(f"Unknown tokenizer: {self.tokenizer}")

                # Only add non-empty sentences
                if tokens:
                    tokenized_sentences.append(tokens)

        return tokenized_sentences

    def build_vocabulary(self, tokenized_sentences: List[List[str]]) -> Set[str]:
        """
        Build vocabulary from tokenized sentences based on minimum frequency.

        Args:
            tokenized_sentences: List of tokenized sentences

        Returns:
            Set of vocabulary words
        """
        # Count word occurrences
        word_counter = Counter()
        for sentence in tokenized_sentences:
            word_counter.update(sentence)

        # Create vocabulary with words that meet minimum frequency
        vocabulary = {word for word, count in word_counter.items()
                     if count >= self.min_freq}

        # Always add special tokens to vocabulary
        vocabulary.add(self.UNK)
        vocabulary.add(self.END)
        for token in self.START:
            vocabulary.add(token)

        return vocabulary

    def replace_oov_words(self, tokenized_sentences: List[List[str]]) -> List[List[str]]:
        """
        Replace out-of-vocabulary words with UNK token.

        Args:
            tokenized_sentences: List of tokenized sentences

        Returns:
            List of tokenized sentences with OOV words replaced
        """
        processed_sentences = []

        for sentence in tokenized_sentences:
            processed_sentence = []
            for token in sentence:
                if token in self.vocabulary:
                    processed_sentence.append(token)
                else:
                    processed_sentence.append(self.UNK)
            processed_sentences.append(processed_sentence)

        return processed_sentences

    def extract_ngrams(self, tokenized_sentences: List[List[str]]) -> None:
        """
        Extract n-grams from tokenized sentences and count their occurrences.

        Args:
            tokenized_sentences: List of tokenized sentences with OOV words replaced
        """
        for sentence in tokenized_sentences:
            # Add start and end tokens
            augmented_sentence = self.START + sentence + [self.END]
            self.total_tokens += len(sentence) + 1  # +1 for END token

            # Count individual words (unigrams)
            self.word_counts.update(augmented_sentence)

            # Extract and count n-grams
            for i in range(len(augmented_sentence) - self.n + 1):
                ngram = tuple(augmented_sentence[i:i + self.n])
                prefix = ngram[:-1]  # Context (n-1 gram)
                word = ngram[-1]     # Word being predicted

                self.ngram_counts[prefix][word] += 1
                self.context_counts[prefix] += 1

    def train(self, corpus: List[str]) -> None:
        """
        Train the n-gram language model on the provided corpus.

        Args:
            corpus: List of text passages that may contain multiple sentences
        """
        print(f"Training {self.n}-gram model on corpus...")

        # Preprocess corpus
        tokenized_sentences = self.preprocess_text(corpus)
        self.total_sentences = len(tokenized_sentences)
        print(f"Extracted {self.total_sentences} sentences from corpus")

        # Build vocabulary
        self.vocabulary = self.build_vocabulary(tokenized_sentences)
        self.vocabulary_size = len(self.vocabulary)
        print(f"Vocabulary size: {self.vocabulary_size} words")

        # Replace OOV words
        processed_sentences = self.replace_oov_words(tokenized_sentences)

        # Extract n-grams
        self.extract_ngrams(processed_sentences)

        print(f"Extracted {sum(len(counts) for counts in self.ngram_counts.values())} unique {self.n}-grams")
        print(f"Total tokens in corpus: {self.total_tokens}")

    def get_laplace_probability(self, word: str, context: tuple, alpha: float = 0.01) -> float:
        """
        Calculate Laplace-smoothed probability P(word|context).

        Args:
            word: The word to calculate probability for
            context: The preceding (n-1) words

        Returns:
            The conditional probability P(word|context)
        """
        count_ngram = self.ngram_counts[context][word]
        count_context = self.context_counts[context]

        # Apply Laplace smoothing
        probability = (count_ngram + alpha) / (count_context + alpha * self.vocabulary_size)

        return probability

    def get_log_probability(self, word: str, context: tuple) -> float:
      """
      Calculate log probability log(P(word|context)).

      Args:
          word: The word to calculate probability for
          context: The preceding (n-1) words

      Returns:
          The log probability log(P(word|context))
      """
      probability = self.get_laplace_probability(word, context)
      return math.log2(probability)

    def get_sentence_log_probability(self, sentence: List[str]) -> float:
        """
        Calculate the log probability of a sentence.

        Args:
            sentence: List of tokens in the sentence

        Returns:
            The log probability of the sentence
        """
        # Replace OOV words with UNK
        processed_sentence = [token if token in self.vocabulary else self.UNK for token in sentence]

        # Add start and end tokens
        augmented_sentence = self.START + processed_sentence + [self.END]

        log_prob = 0.0

        # Calculate log probability for each word given its context
        for i in range(len(self.START), len(augmented_sentence)):
            word = augmented_sentence[i]
            context = tuple(augmented_sentence[i - self.n + 1:i])

            log_prob += self.get_log_probability(word, context)

        return log_prob

In [10]:
def load_and_split_corpus(corpus_name='reuters', min_sentences=100000):
    """
    Load and split a corpus from NLTK into train, validation, and test sets.

    Args:
        corpus_name: Name of the corpus to load
        min_sentences: Minimum number of sentences to include

    Returns:
        Tuple of (train_corpus, val_corpus, test_corpus)
    """
    print(f"Loading {corpus_name} corpus...")

    if corpus_name == 'reuters':
        from nltk.corpus import reuters
        sentences = [" ".join(reuters.words(fileid)) for fileid in reuters.fileids()]

        # Break into actual sentences
        all_sentences = []
        for text in sentences:
            all_sentences.extend(nltk.sent_tokenize(text))

    elif corpus_name == 'brown':
        from nltk.corpus import brown
        sentences = [" ".join(brown.words(fileid)) for fileid in brown.fileids()]

        # Break into actual sentences
        all_sentences = []
        for text in sentences:
            all_sentences.extend(nltk.sent_tokenize(text))

    else:
        raise ValueError(f"Unknown corpus: {corpus_name}")

    # Ensure we have enough sentences
    if len(all_sentences) < min_sentences:
        raise ValueError(f"Corpus {corpus_name} has only {len(all_sentences)} sentences, "
                         f"which is less than the required {min_sentences}.")

    # Shuffle sentences
    random.seed(42)
    random.shuffle(all_sentences)

    # Take a subset for faster processing if needed
    sentences_subset = all_sentences[:min_sentences]

    # Split into train, validation, and test sets (70%, 15%, 15%)
    train_size = int(0.7 * len(sentences_subset))
    val_size = int(0.15 * len(sentences_subset))

    train_corpus = sentences_subset[:train_size]
    val_corpus = sentences_subset[train_size:train_size + val_size]
    test_corpus = sentences_subset[train_size + val_size:]

    print(f"Corpus split: {len(train_corpus)} train, {len(val_corpus)} validation, {len(test_corpus)} test sentences")

    return train_corpus, val_corpus, test_corpus

## Part 2

Methods that will be used for calculating the cross entropy and perplexity

In [11]:
# PART 2: CROSS-ENTROPY AND PERPLEXITY EVALUATION
# ==============================================
def calculate_cross_entropy(model: NGramLanguageModel, test_corpus: List[str]) -> Tuple[float, float]:
    """
    Calculate cross-entropy of a language model on a test corpus.

    Args:
        model: Trained language model
        test_corpus: List of test sentences

    Returns:
        Cross-entropy value
    """
    # Preprocess and handle OOV words
    tokenized_sentences = model.preprocess_text(test_corpus)
    processed_sentences = model.replace_oov_words(tokenized_sentences)

    total_log_prob = 0.0
    total_words = 0  # N in cross-entropy formula (include <end> tokens, exclude <start> tokens)

    for sentence in processed_sentences:
        # Add start and end tokens
        augmented_sentence = model.START + sentence + [model.END]

        for i in range(len(model.START), len(augmented_sentence)):
            word = augmented_sentence[i]
            context = tuple(augmented_sentence[i - model.n + 1:i])

            # Get log probability
            log_prob = model.get_log_probability(word, context)

            total_log_prob += log_prob
            total_words += 1

    cross_entropy = - (total_log_prob) / total_words

    return cross_entropy

In [12]:
def calculate_perplexity(cross_entropy: float) -> float:
    """
    Calculate perplexity from cross-entropy.

    Args:
        cross_entropy: Cross-entropy value

    Returns:
        Perplexity value
    """
    return 2 ** cross_entropy

In [13]:
# Part 1
# Load and split corpus
train_corpus, val_corpus, test_corpus = load_and_split_corpus(corpus_name='brown', min_sentences=50000)

# Initialize and train models
bigram_model = NGramLanguageModel(n=2, min_freq=10, tokenizer='nltk')
bigram_model.train(train_corpus)

trigram_model = NGramLanguageModel(n=3, min_freq=10, tokenizer='nltk')
trigram_model.train(train_corpus)

# Ensure both models use the same vocabulary
common_vocab = bigram_model.vocabulary.intersection(trigram_model.vocabulary)
bigram_model.vocabulary = common_vocab
trigram_model.vocabulary = common_vocab
bigram_model.vocabulary_size = len(common_vocab)
trigram_model.vocabulary_size = len(common_vocab)

print(f"Common vocabulary size: {len(common_vocab)}")

# Part 2
# Calculate cross-entropy and perplexity on validation set
print("\nEvaluating on validation set...")

bigram_ce_val = calculate_cross_entropy(bigram_model, val_corpus)
bigram_ppl_val = calculate_perplexity(bigram_ce_val)

trigram_ce_val = calculate_cross_entropy(trigram_model, val_corpus)
trigram_ppl_val = calculate_perplexity(trigram_ce_val)

print(f"Bigram model - Cross-entropy: {bigram_ce_val:.4f}, Perplexity: {bigram_ppl_val:.4f}")
print(f"Trigram model - Cross-entropy: {trigram_ce_val:.4f}, Perplexity: {trigram_ppl_val:.4f}")

# Calculate cross-entropy and perplexity on test set
print("\nEvaluating on test set...")

bigram_ce_test = calculate_cross_entropy(bigram_model, test_corpus)
bigram_ppl_test = calculate_perplexity(bigram_ce_test)

trigram_ce_test = calculate_cross_entropy(trigram_model, test_corpus)
trigram_ppl_test = calculate_perplexity(trigram_ce_test)

print(f"Bigram model - Cross-entropy: {bigram_ce_test:.4f}, Perplexity: {bigram_ppl_test:.4f}")
print(f"Trigram model - Cross-entropy: {trigram_ce_test:.4f}, Perplexity: {trigram_ppl_test:.4f}")

Loading brown corpus...
Corpus split: 35000 train, 7500 validation, 7500 test sentences
Training 2-gram model on corpus...
Extracted 35000 sentences from corpus
Vocabulary size: 6091 words
Extracted 186764 unique 2-grams
Total tokens in corpus: 760914
Training 3-gram model on corpus...
Extracted 35000 sentences from corpus
Vocabulary size: 6092 words
Extracted 459477 unique 3-grams
Total tokens in corpus: 760914
Common vocabulary size: 6090

Evaluating on validation set...
Bigram model - Cross-entropy: 7.3274, Perplexity: 160.6073
Trigram model - Cross-entropy: 9.2053, Perplexity: 590.3171

Evaluating on test set...
Bigram model - Cross-entropy: 7.3027, Perplexity: 157.8820
Trigram model - Cross-entropy: 9.1658, Perplexity: 574.3561


## Part 3

In [14]:
def get_next_word_greedy(model: NGramLanguageModel, context: tuple) -> str:
    """
    Get the most probable next word given the context.

    Args:
        model: Trained language model
        context: Current context ((n-1) preceding words)

    Returns:
        Most probable next word
    """
    # Get probabilities for all words in the vocabulary
    candidates = {}

    for word in model.vocabulary:
        # Skip UNK token for generation
        if word == model.UNK:
            continue

        prob = model.get_laplace_probability(word, context)
        candidates[word] = prob

    # Return the word with the highest probability
    return max(candidates.items(), key=lambda x: x[1])[0]

In [15]:
def get_next_word_topk(model: NGramLanguageModel,
                      context: tuple,
                      k: int = 5,
                      temperature: float = 1.0) -> str:
    """
    Sample next word from top-k most probable words.

    Args:
        model: Trained language model
        context: Current context ((n-1) preceding words)
        k: Number of top candidates to consider
        temperature: Controls randomness (higher = more random)

    Returns:
        Sampled next word
    """
    # Get probabilities for all words in the vocabulary
    candidates = {}

    for word in model.vocabulary:
        # Skip UNK token for generation
        if word == model.UNK:
            continue

        prob = model.get_laplace_probability(word, context)
        candidates[word] = prob

    # Get top-k candidates
    top_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:k]

    # Apply temperature scaling
    if temperature != 1.0:
        probs = np.array([prob for _, prob in top_candidates])
        probs = np.power(probs, 1.0 / temperature)
        probs = probs / np.sum(probs)
    else:
        probs = np.array([prob for _, prob in top_candidates])
        probs = probs / np.sum(probs)

    # Sample from the distribution
    words = [word for word, _ in top_candidates]
    next_word = np.random.choice(words, p=probs)

    return next_word

In [16]:
def beam_search(model: NGramLanguageModel,
               prompt: List[str],
               beam_width: int = 5,
               max_length: int = 20) -> List[List[str]]:
    """
    Beam search for text generation.

    Args:
        model: Trained language model
        prompt: Initial words to continue from
        beam_width: Beam width
        max_length: Maximum length of the generated sequence

    Returns:
        List of generated sequences (beams)
    """
    # Process the prompt
    processed_prompt = [word if word in model.vocabulary else model.UNK for word in prompt]

    # Initialize beams with start tokens + prompt
    initial_sequence = model.START + processed_prompt
    beams = [(initial_sequence, 0.0)]  # (sequence, log_prob)

    # Generate for max_length steps
    for _ in range(max_length):
        new_beams = []

        # Expand each beam
        for sequence, score in beams:
            # If the sequence ended, keep it as is
            if sequence[-1] == model.END:
                new_beams.append((sequence, score))
                continue

            # Get context
            context = tuple(sequence[-(model.n - 1):])

            # Calculate probabilities for all possible next words
            candidates = {}
            for word in model.vocabulary:
                # Skip UNK token for generation
                if word == model.UNK:
                    continue

                log_prob = model.get_log_probability(word, context)
                candidates[word] = log_prob

            # Get top candidates
            top_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:beam_width]

            # Create new beams with expanded sequences
            for word, log_prob in top_candidates:
                new_sequence = sequence + [word]
                new_score = score + log_prob
                new_beams.append((new_sequence, new_score))

        # Select top beams
        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

        # Check if all beams have ended
        if all(sequence[-1] == model.END for sequence, _ in beams):
            break

    # Return only the newly generated parts (excluding start tokens and prompt)
    start_len = len(model.START) + len(processed_prompt)
    return [sequence[start_len:] for sequence, _ in beams]

In [17]:
def generate_text(model: NGramLanguageModel,
                 prompt: List[str],
                 max_length: int = 20,
                 method: str = "greedy",
                 top_k: int = 5,
                 temperature: float = 1.0) -> List[str]:
    """
    Generate text continuation based on the prompt.

    Args:
        model: Trained language model
        prompt: Initial words to continue from
        max_length: Maximum length of the generated sequence
        method: Generation method - "greedy", "topk", or "nucleus"
        top_k: Number of top candidates to consider for sampling
        temperature: Controls randomness (higher = more random)

    Returns:
        List of words completing the prompt
    """
    # Process the prompt
    processed_prompt = [word if word in model.vocabulary else model.UNK for word in prompt]

    # Initialize with start tokens + prompt
    generated_text = model.START + processed_prompt

    # Generate text until we reach max_length or end token
    for _ in range(max_length):
        # Get the most recent (n-1) words as context
        context = tuple(generated_text[-(model.n - 1):])

        # Get next word based on the specified method
        if method == "greedy":
            next_word = get_next_word_greedy(model, context)
        elif method == "topk":
            next_word = get_next_word_topk(model, context, top_k, temperature)
        else:
            raise ValueError(f"Unknown generation method: {method}")

        # Add the generated word to the sequence
        generated_text.append(next_word)

        # Stop if we generated the end token
        if next_word == model.END:
            break

    # Return only the newly generated part (excluding start tokens and prompt)
    return generated_text[len(model.START) + len(processed_prompt):]

In [19]:
# Part 3
# Generate text completions
print("\nGenerating text completions:")

prompts = [
"I would like to commend the",
"The president of",
"According to recent",
"In the last few",
"Experts say that"
]

print("\nBigram model completions:")
for prompt in prompts:
  if bigram_model.tokenizer == "nltk":
    prompt_tokens = nltk.word_tokenize(prompt.lower())
  elif bigram_model.tokenizer == "custom":
    prompt_tokens = bigram_model.custom_tokenize(prompt)
  elif bigram_model.tokenizer=="regexp":
    prompt_tokens = bigram_model.regexp_tokenizer.tokenize(prompt.lower())

  # Generate with greedy decoding
  completion_greedy = generate_text(bigram_model, prompt_tokens, method="greedy")
  completion_text_greedy = prompt + " " + " ".join([w for w in completion_greedy if w != bigram_model.END])
  print(f"[Greedy] {completion_text_greedy}")

  # Generate with top-k sampling
  completion_topk = generate_text(bigram_model, prompt_tokens, method="topk", top_k=5, temperature=1.5)
  completion_text_topk = prompt + " " + " ".join([w for w in completion_topk if w != bigram_model.END])
  print(f"[Top-K] {completion_text_topk}")

  # Generate with beam search
  beam_completions = beam_search(bigram_model, prompt_tokens, beam_width=3)
  top_beam = beam_completions[0]
  completion_text_beam = prompt + " " + " ".join([w for w in top_beam if w != bigram_model.END])
  print(f"[Beam] {completion_text_beam}")
  print()

print("\nTrigram model completions:")
for prompt in prompts:
  if trigram_model.tokenizer == "nltk":
      prompt_tokens = nltk.word_tokenize(prompt.lower())
  elif trigram_model.tokenizer == "custom":
      prompt_tokens = bigram_model.custom_tokenize(prompt)
  elif trigram_model.tokenizer=="regexp":
    prompt_tokens = trigram_model.regexp_tokenizer.tokenize(prompt.lower())

  # Generate with greedy decoding
  completion_greedy = generate_text(trigram_model, prompt_tokens, method="greedy")
  completion_text_greedy = prompt + " " + " ".join([w for w in completion_greedy if w != trigram_model.END])
  print(f"[Greedy] {completion_text_greedy}")

  # Generate with top-k sampling
  completion_topk = generate_text(trigram_model, prompt_tokens, method="topk", top_k=5, temperature=1.5)
  completion_text_topk = prompt + " " + " ".join([w for w in completion_topk if w != trigram_model.END])
  print(f"[Top-K] {completion_text_topk}")

  # Generate with beam search
  beam_completions = beam_search(trigram_model, prompt_tokens, beam_width=3)
  top_beam = beam_completions[0]
  completion_text_beam = prompt + " " + " ".join([w for w in top_beam if w != trigram_model.END])
  print(f"[Beam] {completion_text_beam}")
  print()


Generating text completions:

Bigram model completions:
[Greedy] I would like to commend the same time , and the same time , and the same time , and the same time , and the
[Top-K] I would like to commend the other side in the first , and i had a little more than a good .
[Beam] I would like to commend the `` .

[Greedy] The president of the same time , and the same time , and the same time , and the same time , and
[Top-K] The president of the other , the first time of the first two years ago .
[Beam] The president of the `` .

[Greedy] According to recent years , and the same time , and the same time , and the same time , and the same
[Top-K] According to recent history .
[Beam] According to recent years .

[Greedy] In the last few days , and the same time , and the same time , and the same time , and the same
[Top-K] In the last few days in the most of the first time .
[Beam] In the last few days .

[Greedy] Experts say that the same time , and the same time , and the same time , a

## Part 4

### _Context- Aware Spelling Correction using Beam Search_

The goal is to implement a context-aware spelling corrector for noisy sentences.
The corrector should:
  * Use an **N-gram Language Model** to evaluate how natural candidate corrections are given the context.
  * Use an **error model** based on **edit distance** to penalize corrections that are far from the original noisy token.
  * Use **beam search** to explore multiple possible corrections at each step, keeping only the most promising sequences.
  * Implement an option to **proserve tokens already in the vocabulary** (`skip_oov=True`), without generating unecessary candidates.
  * Allow **verbose control**:
    - If `verbose=True`, log detailed steps per token (candidates, scores, best selections).
    - If `verbose=False`, do not show any logging, return only the corrected tokens.

We firstly initialize our Context Aware Spelling Corrector class

In [None]:
class ContextAwareSpellingCorrector:
    """
    A context-aware spelling corrector using beam search,
    organized as a class without instance variables.
    """

    def log_prob(self, p: float) -> float:
        """
        Compute log-probability with safe handling for zero.
        """
        return math.log(p) if p > 0 else float('-inf')

    def calculate_lm_score(self, candidate: str, context: Tuple[str, ...], model) -> float:
        """
        Get the language model log-probability of a candidate given context.
        """
        return model.get_log_probability(candidate, context)

    def calculate_error_score(self, noisy_token: str, candidate: str) -> float:
        """
        Compute the log-probability of a candidate based on its edit distance.
        """
        edit_dist = nltk.edit_distance(noisy_token, candidate)
        return self.log_prob(1 / (edit_dist + 1))

    def combine_scores(self, lm_score: float, error_score: float, lambda_lm: float = 0.8, lambda_err: float = 0.2) -> float:
        """
        Combine language model and error model scores using weighted sum.
        """
        return lambda_lm * lm_score + lambda_err * error_score

    def generate_candidates(
        self,
        noisy_token: str,
        vocabulary: Set[str],
        max_edit_distance: int = 2,
        skip_oov: bool = True
    ) -> List[str]:
        """
        Generate candidate corrections within a max edit distance.
        """
        if skip_oov and noisy_token in vocabulary:
            return [noisy_token]

        candidates = []
        for word in vocabulary:
            if word == "<UNK>":
                continue
            if nltk.edit_distance(noisy_token, word) <= max_edit_distance:
                candidates.append(word)

        return candidates or [noisy_token]

    def beam_search_step(
        self,
        beams: List[Tuple[List[str], float]],
        noisy_token: str,
        model,
        vocabulary: Set[str],
        beam_width: int = 5,
        lambda_lm: float = 0.8,
        lambda_err: float = 0.2,
        max_edit_distance: int = 2,
        skip_oov: bool = True,
        verbose: bool = True
    ) -> List[Tuple[List[str], float]]:
        """
        Expand beam sequences with possible corrections for the next token.
        """
        new_beams = []
        candidate_info = []

        for sequence, score in beams:
            context = tuple(sequence[-(model.n - 1):])
            candidates = self.generate_candidates(noisy_token, vocabulary, max_edit_distance, skip_oov)

            for candidate in candidates:
                lm_score = self.calculate_lm_score(candidate, context, model)
                err_score = self.calculate_error_score(noisy_token, candidate)
                total_score = self.combine_scores(lm_score, err_score, lambda_lm, lambda_err)

                candidate_info.append((candidate, lm_score, err_score, total_score))

                new_sequence = sequence + [candidate]
                new_score = score + total_score
                new_beams.append((new_sequence, new_score))

        if not new_beams:
            if verbose:
                print(f"Warning: No valid candidates for '{noisy_token}'. Using fallback.")
            for sequence, score in beams:
                new_sequence = sequence + [noisy_token]
                new_beams.append((new_sequence, score))

        top_beams = heapq.nlargest(beam_width, new_beams, key=lambda x: x[1])
        top_tokens = [beam[0][-1] for beam in top_beams]

        if verbose:
            print(f"\nToken: '{noisy_token}' | Context: {context}")
            print("Top candidates:")
            print(f"{'Candidate':<15} {'LM Score':>10} {'Error Score':>12} {'Combined':>12}")
            print("-" * 55)

            for token in top_tokens:
                for cand, lm, err, combined in candidate_info:
                    if cand == token:
                        print(f"{cand:<15} {lm:>+10.4f} {err:>+12.4f} {combined:>+12.4f}")
                        break

            print(f"Best candidate selected: {top_tokens[0]}")
            print("-" * 55)

        return top_beams

    def correct(
        self,
        model,
        noisy_sentence: List[str],
        beam_width: int = 5,
        lambda_lm: float = 0.8,
        lambda_err: float = 0.2,
        max_edit_distance: int = 2,
        skip_oov: bool = True,
        verbose: bool = True
    ) -> List[str]:
        """
        Perform context-aware spelling correction on a noisy sentence using beam search.
        """
        if verbose:
            print(f"\nStarting correction for sentence: {' '.join(noisy_sentence)}\n")

        beams = [(model.START, 0.0)]

        for noisy_token in noisy_sentence:
            if skip_oov and noisy_token in model.vocabulary:
                if verbose:
                    print(f"\nToken: '{noisy_token}' (in vocabulary, skipping correction)")
                new_beams = []
                for sequence, score in beams:
                    new_sequence = sequence + [noisy_token]
                    new_beams.append((new_sequence, score))
                beams = new_beams
                continue

            beams = self.beam_search_step(
                beams, noisy_token, model, model.vocabulary,
                beam_width, lambda_lm, lambda_err, max_edit_distance, skip_oov, verbose
            )

        best_sequence = max(beams, key=lambda x: x[1])[0]
        corrected = best_sequence[len(model.START):]

        if verbose:
            print(f"\nFinal corrected sentence: {' '.join(corrected)}")

        return corrected

We initialize our bigram and trigram models using the brown corpus

Then we test our `context aware spelling corrector` for both models using some random test sentences:

In [None]:
test_sentences = [
    "let us sai we are freends",
    "in consequencaae of her sistero's marriange, been moistress of hois house from a vry early period",
    "Tomorrrow well bring somethiing new, so liv today as a memoory."
]

First, we are going to use the other two tokenizers, just to see the results, we start with the regexp

In [None]:
# Initialize and train models
bigram_model_regexp = NGramLanguageModel(n=2, min_freq=10, tokenizer='regexp')
bigram_model_regexp.train(train_corpus)

trigram_model_regexp = NGramLanguageModel(n=3, min_freq=10, tokenizer='regexp')
trigram_model_regexp.train(train_corpus)

Training 2-gram model on corpus...
Extracted 33081 sentences from corpus
Vocabulary size: 6163 words
Extracted 189598 unique 2-grams
Total tokens in corpus: 746345
Training 3-gram model on corpus...
Extracted 33081 sentences from corpus
Vocabulary size: 6164 words
Extracted 461488 unique 3-grams
Total tokens in corpus: 746345


In [None]:
corrector = ContextAwareSpellingCorrector()

for sentence in test_sentences:
    corrected = corrector.correct(
        bigram_model_regexp,
        nltk.word_tokenize(sentence),
        beam_width=5,
        lambda_lm=0.8,
        lambda_err=0.2,
        skip_oov=True
    )
    print()


Starting correction for sentence: let us sai we are freends


Token: 'let' (in vocabulary, skipping correction)

Token: 'us' (in vocabulary, skipping correction)

Token: 'sai' | Context: ('us',)
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
a                  -5.4177      -1.0986      -4.5539
as                 -6.2911      -1.0986      -5.2526
an                 -6.8748      -1.0986      -5.7196
say                -7.2887      -0.6931      -5.9696
was                -7.2887      -1.0986      -6.0507
Best candidate selected: a
-------------------------------------------------------

Token: 'we' (in vocabulary, skipping correction)

Token: 'are' (in vocabulary, skipping correction)

Token: 'freends' | Context: ('are',)
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
freed             -11.4242      -1.0986      -9.3591
freed         

In [None]:
corrector = ContextAwareSpellingCorrector()

for sentence in test_sentences:
    corrected = corrector.correct(
        trigram_model_regexp,
        nltk.word_tokenize(sentence),
        beam_width=5,
        lambda_lm=0.8,
        lambda_err=0.2,
        skip_oov=True
    )
    print()


Starting correction for sentence: let us sai we are freends


Token: 'let' (in vocabulary, skipping correction)

Token: 'us' (in vocabulary, skipping correction)

Token: 'sai' | Context: ('let', 'us')
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
say                -5.6315      -0.6931      -4.6438
see                -5.6315      -1.0986      -4.7249
saw               -13.2825      -0.6931     -10.7646
sat               -13.2825      -0.6931     -10.7646
said              -13.2825      -0.6931     -10.7646
Best candidate selected: say
-------------------------------------------------------

Token: 'we' (in vocabulary, skipping correction)

Token: 'are' (in vocabulary, skipping correction)

Token: 'freends' | Context: ('we', 'are')
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
friends           -14.2110      -0.6931     -11.5074
t

And now we also use the custom tokenizer (that skips punctuation)

In [None]:
# Initialize and train models
bigram_model_custom = NGramLanguageModel(n=2, min_freq=10, tokenizer='custom')
bigram_model_custom.train(train_corpus)

trigram_model_custom = NGramLanguageModel(n=3, min_freq=10, tokenizer='custom')
trigram_model_custom.train(train_corpus)

In [None]:
corrector = ContextAwareSpellingCorrector()

for sentence in test_sentences:
    corrected = corrector.correct(
        bigram_model_custom,
        nltk.word_tokenize(sentence),
        beam_width=5,
        lambda_lm=0.8,
        lambda_err=0.2,
        skip_oov=True
    )
    print()

In [None]:
corrector = ContextAwareSpellingCorrector()

for sentence in test_sentences:
    corrected = corrector.correct(
        trigram_model_custom,
        nltk.word_tokenize(sentence),
        beam_width=5,
        lambda_lm=0.8,
        lambda_err=0.2,
        skip_oov=True
    )
    print()


Starting correction for sentence: let us sai we are freends


Token: 'let' (in vocabulary, skipping correction)

Token: 'us' (in vocabulary, skipping correction)

Token: 'sai' | Context: ('let', 'us')
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
say                -5.6210      -0.6931      -4.6354
see                -5.6210      -1.0986      -4.7165
saw               -13.2720      -0.6931     -10.7563
sat               -13.2720      -0.6931     -10.7563
said              -13.2720      -0.6931     -10.7563
Best candidate selected: say
-------------------------------------------------------

Token: 'we' (in vocabulary, skipping correction)

Token: 'are' (in vocabulary, skipping correction)

Token: 'freends' | Context: ('we', 'are')
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
friends           -14.2207      -0.6931     -11.5152
t

In [None]:
corrector = ContextAwareSpellingCorrector()

for sentence in test_sentences:
    corrected = corrector.correct(
        bigram_model,
        nltk.word_tokenize(sentence),
        beam_width=5,
        lambda_lm=0.8,
        lambda_err=0.2,
        skip_oov=True
    )
    print()


Starting correction for sentence: let us sai we are freends


Token: 'let' (in vocabulary, skipping correction)

Token: 'us' (in vocabulary, skipping correction)

Token: 'sai' | Context: ('us',)
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
a                  -5.4155      -1.0986      -4.5521
as                 -6.2888      -1.0986      -5.2508
an                 -6.8726      -1.0986      -5.7178
say                -7.2864      -0.6931      -5.9678
was                -7.2864      -1.0986      -6.0489
Best candidate selected: a
-------------------------------------------------------

Token: 'we' (in vocabulary, skipping correction)

Token: 'are' (in vocabulary, skipping correction)

Token: 'freends' | Context: ('are',)
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
freed             -11.4363      -1.0986      -9.3688
freed         

In [None]:
corrector = ContextAwareSpellingCorrector()

for sentence in test_sentences:
    corrected = corrector.correct(
        trigram_model,
        nltk.word_tokenize(sentence),
        beam_width=5,
        lambda_lm=0.8,
        lambda_err=0.2,
        skip_oov=True
    )
    print()


Starting correction for sentence: let us sai we are freends


Token: 'let' (in vocabulary, skipping correction)

Token: 'us' (in vocabulary, skipping correction)

Token: 'sai' | Context: ('let', 'us')
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
say                -5.6207      -0.6931      -4.6352
see                -5.6207      -1.0986      -4.7163
sad               -13.2718      -0.6931     -10.7560
sat               -13.2718      -0.6931     -10.7560
san               -13.2718      -0.6931     -10.7560
Best candidate selected: say
-------------------------------------------------------

Token: 'we' (in vocabulary, skipping correction)

Token: 'are' (in vocabulary, skipping correction)

Token: 'freends' | Context: ('we', 'are')
Top candidates:
Candidate         LM Score  Error Score     Combined
-------------------------------------------------------
friends           -14.2205      -0.6931     -11.5151
f

### _Observations_

We tested the corrector using vocabularies from models using all possible tokenizers. We prefer the results produced by the nltk tokenizer, so we will continue the analysis using these bigram and trigram models.

1. **Correction Accuracy**
   * The **trigram model** generally selects better context-aware corrections compared to the **bigram model**.
   * In the bigram model, some corrections make sense locally but fail to match the overall sentence meaning.
   * With the trigram model, corrections like "sey" → "see" are more appropriate, because the model can consider two preceding words instead of only one, leading to more grammatically and semantically correct outputs.

2. **Role of Context**
   * In the bigram model, only the immediate previous word is available to predict the next word. This can cause the model to pick a word that fits the local pair but not the broader sentence.
   * The trigram model, by using two preceding words, captures a richer context, helping it disambiguate between candidates that might otherwise look equally likely based on only one previous word.

3. **Candidate Scoring**
   * When examining the top candidates:
     * The correct words often have better **combined scores** (language model + error model) in the trigram case.
     * Even if multiple candidates have close edit distances (error model scores), the **language model score** can now differentiate better because of the stronger context window.
   * This difference highlights how using a richer n-gram context improves the model's ability to rank correct candidates higher.

4. **Stability Across Runs**
   * Even though specific score values (LM, error, combined) can change slightly across different training runs or random seeds, the overall behavior remains consistent:
     * The trigram model is more **stable** in choosing the most logical correction.
     * The bigram model shows more **variability** and occasional incorrect corrections.

5. **Conclusion**
   * Moving from a bigram to a trigram model significantly improves the spelling correction performance by leveraging additional context, allowing the system to make more informed and globally consistent decisions about candidate words. The improvement is mainly due to better language model probabilities rather than changes in edit distances.

## Part 5

### _Artificial Test Dataset Creation_

In this section we implement a class that generates an **artificially corrupted version** of the test corpus. This is necessary in order to stimulate real-world noisy inputs and evaluate the performance of the context-aware spelling corrector.

We firstly declare the class we will use

In [None]:
class ArtificialTestDataset:
    def __init__(self, sentences, error_prob=0.05, seed=None):
        """
        Initialize the artificial test dataset generator.

        Args:
            sentences (List[str]): List of clean sentences to corrupt.
            error_prob (float): Probability of replacing each non-space character.
            seed (int, optional): Random seed for reproducibility.
        """
        self.sentences = sentences
        self.error_prob = error_prob
        if seed is not None:
            random.seed(seed)
        # Character set for random replacements (excluding space)
        self.chars = list(string.ascii_letters + string.digits + string.punctuation)

    def _corrupt_char(self, c):
        # Do not corrupt whitespace; apply corruption with given probability
        if c.isspace() or random.random() > self.error_prob:
            return c
        # Choose a random replacement different from the original
        replacement = random.choice(self.chars)
        while replacement == c:
            replacement = random.choice(self.chars)
        return replacement

    def generate(self):
        """
        Generate the corrupted dataset.

        Returns:
            List[str]: Corrupted sentences.
        """
        corrupted = []
        for sentence in self.sentences:
            corrupted_sentence = ''.join(self._corrupt_char(c) for c in sentence)
            corrupted.append(corrupted_sentence)
        return corrupted

And then we test our generator

In [None]:
# Now, with test_corpus loaded, generate the corrupted dataset and display samples:
generator = ArtificialTestDataset(test_corpus, error_prob=0.05, seed=42)
corrupted_test_corpus = generator.generate()

print("Showing original vs. corrupted for first 5 sentences:")
print("=" * 60)
for orig, corrupt in zip(test_corpus[:5], corrupted_test_corpus[:5]):
    print(f"Original:  {orig}")
    print(f"Corrupted: {corrupt}")
    print("-" * 60)

Showing original vs. corrupted for first 5 sentences:
Original:  O beautiful for patriot dream that sees beyond the years thine alabaster cities gleam undimmed by human tears .
Corrupted: O Jeautifll fzr patriot dream that 6ees beyond Dhe years thine alabaster cities gleam undimmed by human tears .
------------------------------------------------------------
Original:  ( cf.
Corrupted: ( cf.
------------------------------------------------------------
Original:  The Village office of Western Union with George Towsley as manager and telegrapher continued in Hard's drugstore until 1905 .
Corrupted: The Villag} office of WesterP Union with George Towsley as manager )nd telegrapher continued in Hard's drugstore until 1905 .
------------------------------------------------------------
Original:  As if this was a signal , Poet abruptly began to thrash the water and the quick movement slowly made them sink through the water .
Corrupted: As if this was a signal , Poet abruptly began to thrash 

## Part 6
Evaluation of the context-aware spelling corrector in terms of Word Error Rate (WER) and character Error Rate (CER)

* **Character Error Rate (CER)** and **Word Error Rate (WER)** are metrics that measure the performance of the context aware spelling corrector by calculating the rate of erroneous characters produced by the system compared to the ground truth and its accuracy at the word level by measuring the proportion of incorrectly recognized words relative to the reference text.
* Both are derived from the Levenshtein (edit) distance with values typically ranging from 0 to 1, where 0 indicates perfect alignment of the system output to the ground truth anf 1 indicates total dissimilarity between the compared pieces of text. If the score is larger than 1 we assume that the prediction is worse than a complete mismatch, with more actions required (deletion, insertion, substitution) than reference words.
* WER is defined from the Levenshtein distance normalised by the sentence length: $$WER=\frac{S+D+I}{N}$$ where
  - S: number of substitutions
  - D: number of deletions
  - I: number of insertions
  - N: number of words in reference text
* CER is defined from the Levenshtein distance normalised by the sentence length: $$CER=\frac{S+D+I}{n}$$ where
  - n: number of characters in reference text

In [None]:
wer_metric = evaluate.load("wer")  # Load WER metric
cer_metric = evaluate.load("cer")  # Load CER metric

* For Bigram Language Model

In [None]:
generator = ArtificialTestDataset(test_corpus, error_prob=0.05, seed=42)
corrupted_test_corpus = generator.generate()

reference = test_corpus[:100] # Ground truth (list)
hypothesis = corrupted_test_corpus # Noisy sentences (list)

reference_sentences_tokenised = []
corrupted_sentences_tokenised = []
corrected_sentences_tokenised = []

corrector = ContextAwareSpellingCorrector()

for sentence in hypothesis[:100]:
    test_tokens = nltk.word_tokenize(sentence)
    corrupted_sentences_tokenised.append(sentence)
    corrected_sentences_tokenised.append(" ".join(
        corrector.correct(bigram_model,
            test_tokens,
            beam_width=5,
            lambda_lm=0.8,
            lambda_err=0.2,
            skip_oov=True,
            verbose=False)))

for sentence in reference:
    reference_sentences_tokenised.append(nltk.word_tokenize(sentence))

test_data = list(zip(reference_sentences_tokenised, corrupted_sentences_tokenised))

In [None]:
wer_score = wer_metric.compute(references=reference, predictions=corrected_sentences_tokenised)
cer_score = cer_metric.compute(references=reference, predictions=corrected_sentences_tokenised)

print(f"WER: {wer_score:.4f}")
print(f"CER: {cer_score:.4f}")

WER: 0.3369
CER: 0.1024


* For Trigram Language Model

In [None]:
generator = ArtificialTestDataset(test_corpus, error_prob=0.05, seed=42)
corrupted_test_corpus = generator.generate()

reference = test_corpus[:100] # Ground truth (list)
hypothesis = corrupted_test_corpus # Noisy (list)

reference_sentences_tokenised = []
corrupted_sentences_tokenised = []
corrected_sentences_tokenised = []

corrector = ContextAwareSpellingCorrector()

for sentence in hypothesis[:100]:
    test_tokens = nltk.word_tokenize(sentence)
    corrupted_sentences_tokenised.append(sentence)
    corrected_sentences_tokenised.append(" ".join(
        corrector.correct(bigram_model,
            test_tokens,
            beam_width=5,
            lambda_lm=0.8,
            lambda_err=0.2,
            skip_oov=True,
            verbose=False)))

for sentence in reference:
    reference_sentences_tokenised.append(nltk.word_tokenize(sentence))

test_data = list(zip(reference_sentences_tokenised, corrupted_sentences_tokenised))

In [None]:
wer_score = wer_metric.compute(references=reference, predictions=corrected_sentences_tokenised)
cer_score = cer_metric.compute(references=reference, predictions=corrected_sentences_tokenised)

print(f"WER: {wer_score:.4f}")
print(f"CER: {cer_score:.4f}")

WER: 0.3369
CER: 0.1024


### Implementation of the Evaluation Class

The SpellingCorrectionEvaluator class encapsulates the whole pipeline which performs the evaluation of a context-aware spelling corrector (passed as a callable object argument) on artificially corrupted test data (generator method from ArtificialTestDataset instance).

Steps:
1. Takes in a list of ground truth (clean) sentences to use as references.
2. Initializes an ArtificialTestDataset instance to generate noisy versions of the reference sentences by introducing random errors with a given probability (error_prob).
3. Applies a context-aware spelling corrector (e.g., beam search using a trained n-gram model) on each corrupted sentence to produce a corrected hypothesis.
4. Computes Word Error Rate (WER) and Character Error Rate (CER) by comparing the corrected hypotheses against the original clean references.
5. Returns the WER and CER scores as evaluation results

In [None]:
class SpellingCorrectionExperiment:
    """
        Evaluation of the spelling_corrector_function performance on artificially corrupted data.

         Args:
             test_corpus (list): List of ground truth (reference) sentences.
             spelling_corrector_function (callable): Function for context-aware spelling correction.
             language_model: Trained language model passed to the corrector.
             error_prob (float): Probability of introducing an error in the test sentences.
             seed (int): Random seed for reproducibility.
    """
    def __init__(self,
                 test_corpus: list[str],
                 spelling_corrector,
                 language_model,
                 error_prob: float =0.05,
                 seed:int = 42,
                 beam_width:int = 5):
        self.reference_sentences = test_corpus
        self.spelling_corrector = spelling_corrector
        self.language_model = language_model
        self.error_prob = error_prob
        self.seed = seed
        self.beam_width = beam_width

        self.wer_metric = evaluate.load("wer")
        self.cer_metric = evaluate.load("cer")

    def run(self):
        # First generate corrupted versions of the sentences
        generator = ArtificialTestDataset(self.reference_sentences, error_prob=self.error_prob, seed=self.seed)
        corrupted_sentences = generator.generate()

        # Second,  tokenize reference and corrupted sentences
        corrupted_tokenized = [nltk.word_tokenize(sentence) for sentence in corrupted_sentences]

        # Third, apply correction on the corrupted sentences and show progress bar
        corrected_sentences = []
        for tokens in tqdm(corrupted_tokenized, desc="Correcting Sentences"):
            corrected_tokens = self.spelling_corrector.correct(
                self.language_model,
                tokens,
                beam_width=self.beam_width,  # Beam width passed here
                lambda_lm=0.8,
                lambda_err=0.2,
                skip_oov=True,
                verbose=False
            )
            corrected_sentences.append(" ".join(corrected_tokens))

        # Fourth, evaluate the metrics
        wer_score = self.wer_metric.compute(references=self.reference_sentences, predictions=corrected_sentences)
        cer_score = self.cer_metric.compute(references=self.reference_sentences, predictions=corrected_sentences)

        return wer_score, cer_score

In [None]:
spelling_corrector = ContextAwareSpellingCorrector()

# Run the evaluation experiment on a sample of 2000 sentences from the test corpus using the Bigram Model
experiment = SpellingCorrectionExperiment(test_corpus[:2000], spelling_corrector, bigram_model)
wer, cer = experiment.run()
print(f"WER: {wer:.4f}")
print(f"CER: {cer:.4f}")

Correcting Sentences: 100%|██████████| 2000/2000 [4:36:46<00:00,  8.30s/it]    


WER: 0.3352
CER: 0.1002


In [None]:
# Run the evaluation experiment on a sample of 2000 sentences from the test corpus using Trigram Model
experiment = SpellingCorrectionExperiment(test_corpus[:2000], corrector, trigram_model)
wer, cer = experiment.run()
print(f"WER: {wer:.4f}")
print(f"CER: {cer:.4f}")

Correcting Sentences: 100%|██████████| 2000/2000 [4:25:58<00:00,  7.98s/it]    


WER: 0.3336
CER: 0.0933
