## Intro

In [166]:
import nltk
import re
import math
import random
import numpy as np
from nltk.corpus import reuters
from collections import defaultdict, Counter
from typing import List, Tuple, Dict, Set, Callable
from tqdm import tqdm
from nltk.metrics.distance import edit_distance
import heapq
import string

nltk.download('reuters')
nltk.download('brown')
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/michaeltheophanopoulos/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /Users/michaeltheophanopoulos/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/michaeltheophanopoulos/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaeltheophanopoulos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Part 1

In [116]:
# PART 1: N-GRAM LANGUAGE MODEL IMPLEMENTATION
# ===========================================

class NGramLanguageModel:
    def __init__(self, n: int, min_freq: int = 10):
        """
        Initialize an n-gram language model.

        Args:
            n: The size of n-grams (2 for bigram, 3 for trigram)
            min_freq: Minimum frequency to include a word in vocabulary
        """
        self.n = n
        self.min_freq = min_freq

        # Main model components
        self.vocabulary = set()  # Words in the vocabulary
        self.word_counts = Counter()  # Counts of individual words
        self.ngram_counts = defaultdict(Counter)  # Counts of n-grams
        self.context_counts = defaultdict(int)  # Counts of (n-1)-grams (contexts)

        # Model constants
        self.UNK = "<UNK>"  # Out-of-vocabulary token
        self.END = "<end>"  # End of sentence token

        # Different start tokens for different n values
        if n == 2:
            self.START = ["<start>"]
        elif n == 3:
            self.START = ["<start1>", "<start2>"]
        else:
            self.START = [f"<start{i}>" for i in range(1, n)]

        # Statistics
        self.total_sentences = 0
        self.total_tokens = 0
        self.vocabulary_size = 0

    def preprocess_text(self, sentences: List[str]) -> List[List[str]]:
        """
        Preprocess raw sentences into tokenized form.

        Args:
            sentences: List of raw text sentences

        Returns:
            List of tokenized sentences
        """
        tokenized_sentences = []

        for sentence in sentences:
            # Clean and tokenize the sentence
            clean_sentence = sentence.lower().strip()
            tokens = nltk.word_tokenize(clean_sentence)
            tokenized_sentences.append(tokens)

        return tokenized_sentences

    def build_vocabulary(self, tokenized_sentences: List[List[str]]) -> Set[str]:
        """
        Build vocabulary from tokenized sentences based on minimum frequency.

        Args:
            tokenized_sentences: List of tokenized sentences

        Returns:
            Set of vocabulary words
        """
        # Count word occurrences
        word_counter = Counter()
        for sentence in tokenized_sentences:
            word_counter.update(sentence)

        # Create vocabulary with words that meet minimum frequency
        vocabulary = {word for word, count in word_counter.items()
                     if count >= self.min_freq}

        # # Always add special tokens to vocabulary
        vocabulary.add(self.UNK)
        vocabulary.add(self.END)
        for token in self.START:
            vocabulary.add(token)

        return vocabulary

    def replace_oov_words(self, tokenized_sentences: List[List[str]]) -> List[List[str]]:
        """
        Replace out-of-vocabulary words with UNK token.

        Args:
            tokenized_sentences: List of tokenized sentences

        Returns:
            List of tokenized sentences with OOV words replaced
        """
        processed_sentences = []

        for sentence in tokenized_sentences:
            processed_sentence = []
            for token in sentence:
                if token in self.vocabulary:
                    processed_sentence.append(token)
                else:
                    processed_sentence.append(self.UNK)
            processed_sentences.append(processed_sentence)

        return processed_sentences

    def extract_ngrams(self, tokenized_sentences: List[List[str]]) -> None:
        """
        Extract n-grams from tokenized sentences and count their occurrences.

        Args:
            tokenized_sentences: List of tokenized sentences with OOV words replaced
        """
        for sentence in tokenized_sentences:
            # Add start and end tokens
            augmented_sentence = self.START + sentence + [self.END]
            self.total_tokens += len(sentence) + 1  # +1 for END token

            # Count individual words (unigrams)
            self.word_counts.update(augmented_sentence)

            # Extract and count n-grams
            for i in range(len(augmented_sentence) - self.n + 1):
                ngram = tuple(augmented_sentence[i:i + self.n])
                prefix = ngram[:-1]  # Context (n-1 gram)
                word = ngram[-1]     # Word being predicted

                self.ngram_counts[prefix][word] += 1
                self.context_counts[prefix] += 1

    def train(self, corpus: List[str]) -> None:
        """
        Train the n-gram language model on the provided corpus.

        Args:
            corpus: List of sentences
        """
        self.total_sentences = len(corpus)
        print(f"Training {self.n}-gram model on {self.total_sentences} sentences...")

        # Preprocess the corpus
        tokenized_sentences = self.preprocess_text(corpus)

        # Build vocabulary
        self.vocabulary = self.build_vocabulary(tokenized_sentences)
        self.vocabulary_size = len(self.vocabulary)
        print(f"Vocabulary size: {self.vocabulary_size} words")

        # Replace OOV words
        processed_sentences = self.replace_oov_words(tokenized_sentences)

        # Extract n-grams
        self.extract_ngrams(processed_sentences)

        print(f"Extracted {sum(len(counts) for counts in self.ngram_counts.values())} unique {self.n}-grams")
        print(f"Total tokens in corpus: {self.total_tokens}")

    def get_laplace_probability(self, word: str, context: tuple) -> float:
        """
        Calculate Laplace-smoothed probability P(word|context).

        Args:
            word: The word to calculate probability for
            context: The preceding (n-1) words

        Returns:
            The conditional probability P(word|context)
        """
        # Get counts with Laplace smoothing
        count_ngram = self.ngram_counts[context][word]
        count_context = self.context_counts[context]

        # Apply Laplace smoothing (+1 to numerator, +V to denominator)
        probability = (count_ngram + 1) / (count_context + self.vocabulary_size)

        return probability

    def get_log_probability(self, word: str, context: tuple) -> float:
        """
        Calculate log probability log(P(word|context)).

        Args:
            word: The word to calculate probability for
            context: The preceding (n-1) words

        Returns:
            The log probability log(P(word|context))
        """
        probability = self.get_laplace_probability(word, context)
        return math.log2(probability)

    def get_sentence_log_probability(self, sentence: List[str]) -> float:
        """
        Calculate the log probability of a sentence.

        Args:
            sentence: List of tokens in the sentence

        Returns:
            The log probability of the sentence
        """
        # Replace OOV words with UNK
        processed_sentence = [token if token in self.vocabulary else self.UNK for token in sentence]

        # Add start and end tokens
        augmented_sentence = self.START + processed_sentence + [self.END]

        log_prob = 0.0

        # Calculate log probability for each word given its context
        for i in range(len(self.START), len(augmented_sentence)):
            word = augmented_sentence[i]
            context = tuple(augmented_sentence[i - self.n + 1:i])

            log_prob += self.get_log_probability(word, context)

        return log_prob

## Part 2

In [117]:
# PART 2: CROSS-ENTROPY AND PERPLEXITY EVALUATION
# ==============================================

def calculate_cross_entropy(model: NGramLanguageModel, test_corpus: List[str]) -> float:
    """
    Calculate cross-entropy of a language model on a test corpus.

    Args:
        model: Trained language model
        test_corpus: List of test sentences

    Returns:
        Cross-entropy value
    """
    # Preprocess test corpus
    tokenized_sentences = model.preprocess_text(test_corpus)

    # Replace OOV words
    processed_sentences = model.replace_oov_words(tokenized_sentences)

    total_log_prob = 0.0
    total_tokens = 0

    # Calculate log probability for each sentence
    for sentence in processed_sentences:
        # We count end tokens but not start tokens in the total length
        total_tokens += len(sentence) + 1  # +1 for END token

        # Add start and end tokens
        augmented_sentence = model.START + sentence + [model.END]

        # Sum log probabilities for each word given its context
        for i in range(len(model.START), len(augmented_sentence)):
            word = augmented_sentence[i]
            context = tuple(augmented_sentence[i - model.n + 1:i])

            # Get log probability
            log_prob = model.get_log_probability(word, context)
            total_log_prob += log_prob

    # Calculate cross-entropy
    cross_entropy = -total_log_prob / total_tokens

    return cross_entropy

In [118]:
def calculate_perplexity(cross_entropy: float) -> float:
    """
    Calculate perplexity from cross-entropy.

    Args:
        cross_entropy: Cross-entropy value

    Returns:
        Perplexity value
    """
    return 2 ** cross_entropy

## Part 3

In [119]:
def generate_text(model: NGramLanguageModel,
                 prompt: List[str],
                 max_length: int = 20,
                 method: str = "greedy",
                 top_k: int = 5,
                 temperature: float = 1.0) -> List[str]:
    """
    Generate text continuation based on the prompt.

    Args:
        model: Trained language model
        prompt: Initial words to continue from
        max_length: Maximum length of the generated sequence
        method: Generation method - "greedy", "topk", or "nucleus"
        top_k: Number of top candidates to consider for sampling
        temperature: Controls randomness (higher = more random)

    Returns:
        List of words completing the prompt
    """
    # Process the prompt
    processed_prompt = [word if word in model.vocabulary else model.UNK for word in prompt]

    # Initialize with start tokens + prompt
    generated_text = model.START + processed_prompt

    # Generate text until we reach max_length or end token
    for _ in range(max_length):
        # Get the most recent (n-1) words as context
        context = tuple(generated_text[-(model.n - 1):])

        # Get next word based on the specified method
        if method == "greedy":
            next_word = get_next_word_greedy(model, context)
        elif method == "topk":
            next_word = get_next_word_topk(model, context, top_k, temperature)
        elif method == "nucleus":
            next_word = get_next_word_nucleus(model, context, p=0.9, temperature=temperature)
        else:
            raise ValueError(f"Unknown generation method: {method}")

        # Add the generated word to the sequence
        generated_text.append(next_word)

        # Stop if we generated the end token
        if next_word == model.END:
            break

    # Return only the newly generated part (excluding start tokens and prompt)
    return generated_text[len(model.START) + len(processed_prompt):]

In [120]:
def get_next_word_greedy(model: NGramLanguageModel, context: tuple) -> str:
    """
    Get the most probable next word given the context.

    Args:
        model: Trained language model
        context: Current context ((n-1) preceding words)

    Returns:
        Most probable next word
    """
    # Get probabilities for all words in the vocabulary
    candidates = {}

    for word in model.vocabulary:
        # Skip UNK token for generation
        if word == model.UNK:
            continue

        prob = model.get_laplace_probability(word, context)
        candidates[word] = prob

    # Return the word with the highest probability
    return max(candidates.items(), key=lambda x: x[1])[0]

In [121]:
def get_next_word_topk(model: NGramLanguageModel,
                      context: tuple,
                      k: int = 5,
                      temperature: float = 1.0) -> str:
    """
    Sample next word from top-k most probable words.

    Args:
        model: Trained language model
        context: Current context ((n-1) preceding words)
        k: Number of top candidates to consider
        temperature: Controls randomness (higher = more random)

    Returns:
        Sampled next word
    """
    # Get probabilities for all words in the vocabulary
    candidates = {}

    for word in model.vocabulary:
        # Skip UNK token for generation
        if word == model.UNK:
            continue

        prob = model.get_laplace_probability(word, context)
        candidates[word] = prob

    # Get top-k candidates
    top_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:k]

    # Apply temperature scaling
    if temperature != 1.0:
        probs = np.array([prob for _, prob in top_candidates])
        probs = np.power(probs, 1.0 / temperature)
        probs = probs / np.sum(probs)
    else:
        probs = np.array([prob for _, prob in top_candidates])
        probs = probs / np.sum(probs)

    # Sample from the distribution
    words = [word for word, _ in top_candidates]
    next_word = np.random.choice(words, p=probs)

    return next_word

In [122]:
def get_next_word_nucleus(model: NGramLanguageModel,
                         context: tuple,
                         p: float = 0.9,
                         temperature: float = 1.0) -> str:
    """
    Nucleus (top-p) sampling for next word prediction.

    Args:
        model: Trained language model
        context: Current context ((n-1) preceding words)
        p: Cumulative probability threshold
        temperature: Controls randomness (higher = more random)

    Returns:
        Sampled next word
    """
    # Get probabilities for all words in the vocabulary
    candidates = {}

    for word in model.vocabulary:
        # Skip UNK token for generation
        if word == model.UNK:
            continue

        prob = model.get_laplace_probability(word, context)
        candidates[word] = prob

    # Sort candidates by probability
    sorted_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)

    # Apply temperature scaling
    if temperature != 1.0:
        probs = np.array([prob for _, prob in sorted_candidates])
        probs = np.power(probs, 1.0 / temperature)
        probs = probs / np.sum(probs)
    else:
        probs = np.array([prob for _, prob in sorted_candidates])
        probs = probs / np.sum(probs)

    # Calculate cumulative probabilities
    cumulative_probs = np.cumsum(probs)

    # Find smallest set of words with cumulative probability >= p
    cutoff_idx = np.where(cumulative_probs >= p)[0][0] + 1

    # Select only those candidates
    top_p_candidates = sorted_candidates[:cutoff_idx]

    # Re-normalize probabilities
    top_p_probs = np.array([prob for _, prob in top_p_candidates])
    top_p_probs = top_p_probs / np.sum(top_p_probs)

    # Sample from the distribution
    words = [word for word, _ in top_p_candidates]
    next_word = np.random.choice(words, p=top_p_probs)

    return next_word

In [123]:
def beam_search(model: NGramLanguageModel,
               prompt: List[str],
               beam_width: int = 5,
               max_length: int = 20) -> List[List[str]]:
    """
    Beam search for text generation.

    Args:
        model: Trained language model
        prompt: Initial words to continue from
        beam_width: Beam width
        max_length: Maximum length of the generated sequence

    Returns:
        List of generated sequences (beams)
    """
    # Process the prompt
    processed_prompt = [word if word in model.vocabulary else model.UNK for word in prompt]

    # Initialize beams with start tokens + prompt
    initial_sequence = model.START + processed_prompt
    beams = [(initial_sequence, 0.0)]  # (sequence, log_prob)

    # Generate for max_length steps
    for _ in range(max_length):
        new_beams = []

        # Expand each beam
        for sequence, score in beams:
            # If the sequence ended, keep it as is
            if sequence[-1] == model.END:
                new_beams.append((sequence, score))
                continue

            # Get context
            context = tuple(sequence[-(model.n - 1):])

            # Calculate probabilities for all possible next words
            candidates = {}
            for word in model.vocabulary:
                # Skip UNK token for generation
                if word == model.UNK:
                    continue

                log_prob = model.get_log_probability(word, context)
                candidates[word] = log_prob

            # Get top candidates
            top_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:beam_width]

            # Create new beams with expanded sequences
            for word, log_prob in top_candidates:
                new_sequence = sequence + [word]
                new_score = score + log_prob
                new_beams.append((new_sequence, new_score))

        # Select top beams
        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

        # Check if all beams have ended
        if all(sequence[-1] == model.END for sequence, _ in beams):
            break

    # Return only the newly generated parts (excluding start tokens and prompt)
    start_len = len(model.START) + len(processed_prompt)
    return [sequence[start_len:] for sequence, _ in beams]

## Part 4

In [None]:
def log_prob(p: float) -> float:
    """
    Compute log-probability with safe handling for zero.

    Args:
        p: A probability value

    Returns:
        Natural log of p, or -inf if p is 0
    """
    return math.log(p) if p > 0 else float('-inf')


def softmax(scores):
    """
    Compute softmax over a list of scores.

    Args:
        scores: List of float scores

    Returns:
        List of normalized probabilities
    """
    exp_scores = [math.exp(s - max(scores)) for s in scores]
    total = sum(exp_scores)
    return [e / total for e in exp_scores]


def calculate_lm_score(candidate: str, context: Tuple[str, ...], model: NGramLanguageModel) -> float:
    """
    Get the language model log-probability of a candidate given context.

    Args:
        candidate: Word to score
        context: Tuple of previous words
        model: Trained N-gram language model

    Returns:
        Log-probability of candidate given context
    """
    return model.get_log_probability(candidate, context)


def calculate_error_score(noisy_token: str, candidate: str) -> float:
    """
    Compute the log-probability of a candidate based on its edit distance.

    Args:
        noisy_token: Original word
        candidate: Possible correction

    Returns:
        Log-probability based on inverse edit distance
    """
    edit_dist = nltk.edit_distance(noisy_token, candidate)
    return log_prob(1 / (edit_dist + 1))


def combine_scores(lm_score: float, error_score: float, lambda_lm: float = 0.8, lambda_err: float = 0.2) -> float:
    """
    Combine language model and error model scores using weighted sum.

    Args:
        lm_score: Log-probability from language model
        error_score: Log-probability from error model
        lambda_lm: Weight for language model
        lambda_err: Weight for error model

    Returns:
        Combined score
    """
    return lambda_lm * lm_score + lambda_err * error_score


def generate_candidates(
    noisy_token: str,
    vocabulary: Set[str],
    max_edit_distance: int = 2,
    skip_oov: bool = True
) -> List[str]:
    """
    Generate candidate corrections within a max edit distance.

    Args:
        noisy_token: Token to correct
        vocabulary: Set of known words
        max_edit_distance: Max allowed edit distance
        skip_oov: If True, return token if it’s in vocabulary

    Returns:
        List of candidate words
    """
    if skip_oov and noisy_token in vocabulary:
        return [noisy_token]

    candidates = []
    for word in vocabulary:
        if word == "<UNK>":
            continue
        if nltk.edit_distance(noisy_token, word) <= max_edit_distance:
            candidates.append(word)
    return candidates or [noisy_token]

def beam_search_step(
    beams: List[Tuple[List[str], float]],
    noisy_token: str,
    model: NGramLanguageModel,
    vocabulary: Set[str],
    beam_width: int = 5,
    lambda_lm: float = 0.8,
    lambda_err: float = 0.2,
    max_edit_distance: int = 2,
    skip_oov: bool = True
) -> List[Tuple[List[str], float]]:
    """
    Expand beam sequences with possible corrections for the next token.

    Args:
        beams: List of (sequence, score) pairs
        noisy_token: Token to correct
        model: N-gram language model
        vocabulary: Set of valid words
        beam_width: Max beams to keep
        lambda_lm: LM score weight
        lambda_err: Error score weight
        max_edit_distance: Edit distance threshold
        skip_oov: If True, use original token if in vocab

    Returns:
        Updated list of top-k beams
    """
    new_beams = []

    for sequence, score in beams:
        context = tuple(sequence[-(model.n - 1):])
        candidates = generate_candidates(noisy_token, vocabulary, max_edit_distance, skip_oov)

        print(f"Token: '{noisy_token}' | Context: {context}")
        print("Candidate scores:")
        for candidate in candidates:
            lm_score = calculate_lm_score(candidate, context, model)
            err_score = calculate_error_score(noisy_token, candidate)
            total_score = combine_scores(lm_score, err_score, lambda_lm, lambda_err)
            print(f"  {candidate:<12} | LM: {lm_score:+8.4f} | Error: {err_score:+8.4f} | Combined: {total_score:+8.4f}")

            new_sequence = sequence + [candidate]
            new_score = score + total_score
            new_beams.append((new_sequence, new_score))

    if not new_beams:
        print(f"Warning: No valid candidates for '{noisy_token}'. Using fallback.")
        for sequence, score in beams:
            new_sequence = sequence + [noisy_token]
        new_beams.append((new_sequence, score))

    top_beams = heapq.nlargest(beam_width, new_beams, key=lambda x: x[1])
    top_tokens = [beam[0][-1] for beam in top_beams]
    print(f"Top selected tokens: {top_tokens}\n")

    return top_beams

In [199]:
def context_aware_spelling_corrector(
    model,
    noisy_sentence: List[str],
    beam_width: int = 5,
    lambda_lm: float = 0.8,
    lambda_err: float = 0.2,
    max_edit_distance: int = 2,
    skip_oov: bool = True,
) -> List[str]:
    """
    Perform context-aware spelling correction on a noisy sentence using beam search.

    Args:
        model: Trained N-gram language model with .START and .vocabulary
        noisy_sentence: List of potentially misspelled tokens
        beam_width: Number of sequences to keep per step
        lambda_lm: Weight for the language model score
        lambda_err: Weight for the error model score
        max_edit_distance: Max edit distance for generating candidates
        skip_oov: If True, preserve in-vocabulary words

    Returns:
        A list of corrected tokens
    """
    print(f"Starting correction for sentence: {' '.join(noisy_sentence)}")
    beams = [(model.START, 0.0)]
    for noisy_token in noisy_sentence:
        beams = beam_search_step(
            beams, noisy_token, model, model.vocabulary,
            beam_width, lambda_lm, lambda_err, max_edit_distance, skip_oov
        )
    best_sequence = max(beams, key=lambda x: x[1])[0]
    corrected = best_sequence[len(model.START):]
    print(f"Final corrected sentence: {' '.join(corrected)}")
    return corrected


In [188]:
train_corpus, val_corpus, test_corpus = load_and_split_corpus(corpus_name='brown')

# Initialize and train models
bigram_model = NGramLanguageModel(n=2, min_freq=10)
bigram_model.train(train_corpus)

trigram_model = NGramLanguageModel(n=3, min_freq=10)
trigram_model.train(train_corpus)

Loading brown corpus...
Corpus split: 39621 train, 8490 validation, 8491 test sentences
Training 2-gram model on 39621 sentences...
Vocabulary size: 6637 words
Extracted 209440 unique 2-grams
Total tokens in corpus: 859931
Training 3-gram model on 39621 sentences...
Vocabulary size: 6638 words
Extracted 519364 unique 3-grams
Total tokens in corpus: 859931


In [200]:
test_sentences = [
    "let us sey we are friendz",
    "in consequencaae of her sistero's marriange, been moistress of hois house from a vry early period",
    "Tomorrrow well bring somethiing new, so leav today as a memoory.",
    "He wento too the storr to by some bred and mlik.",
    "Ths is an exampel of a sentense with severl erors.",
    "Wee shuld definately do ths agan some day."
]

for sentence in test_sentences:
    print(f'\nTesting with sentence: "{sentence}"')
    corrected = context_aware_spelling_corrector(
        bigram_model,
        nltk.word_tokenize(sentence),
        beam_width=5,
        lambda_lm=0.95,
        lambda_err=0.05,
        skip_oov=True
    )
    print("Bigram model produced:")
    print(" ", " ".join(corrected))


Testing with sentence: "let us sey we are friendz"
Starting correction for sentence: let us sey we are friendz
Token: 'let' | Context: ('<start>',)
Candidate scores:
  let          | LM:  -9.6148 | Error:  +0.0000 | Combined:  -9.1340
Top selected tokens: ['let']

Token: 'us' | Context: ('let',)
Candidate scores:
  us           | LM:  -7.3349 | Error:  +0.0000 | Combined:  -6.9681
Top selected tokens: ['us']

Token: 'sey' | Context: ('us',)
Candidate scores:
  sad          | LM: -12.7922 | Error:  -1.0986 | Combined: -12.2075
  so           | LM: -11.7922 | Error:  -1.0986 | Combined: -11.2575
  keys         | LM: -12.7922 | Error:  -1.0986 | Combined: -12.2075
  sera         | LM: -12.7922 | Error:  -1.0986 | Combined: -12.2075
  why          | LM: -12.7922 | Error:  -1.0986 | Combined: -12.2075
  fee          | LM: -12.7922 | Error:  -1.0986 | Combined: -12.2075
  s            | LM: -12.7922 | Error:  -1.0986 | Combined: -12.2075
  ed           | LM: -12.7922 | Error:  -1.0986 | Com

In [176]:
test_sentences = [
    "let us sey we are friendz",
    "in consequencaae of her sistero's marriange, been moistress of hois house from a vry early period",
    "Tomorrrow well bring somethiing new, so leav today as a memoory.",
    "He wento too the storr to by some bred and mlik.",
    "Ths is an exampel of a sentense with severl erors.",
    "Wee shuld definately do ths agan some day."
]

for sentence in test_sentences:
    print(f'\nTesting with sentence: "{sentence}"')
    corrected = context_aware_spelling_corrector(
        trigram_model,
        nltk.word_tokenize(sentence),
        beam_width=5,
        lambda_lm=0.9,
        lambda_err=0.1,
        skip_oov=True
    )
    print("Bigram model produced:")
    print(" ", " ".join(corrected))


Testing with sentence: "let us sey we are friendz"
Bigram model produced:
  let us sey we are friend

Testing with sentence: "in consequencaae of her sistero's marriange, been moistress of hois house from a vry early period"
Bigram model produced:
  in consequence of her sister , marianne , been mistress of his house from a very early period

Testing with sentence: "Tomorrrow well bring somethiing new, so leav today as a memoory."
Bigram model produced:
  tomorrow well bring something new , so let today as a memory .

Testing with sentence: "He wento too the storr to by some bred and mlik."
Bigram model produced:
  he went too the store to by some bred and mink .

Testing with sentence: "Ths is an exampel of a sentense with severl erors."
Bigram model produced:
  he is an example of a sentence with severe errors .

Testing with sentence: "Wee shuld definately do ths agan some day."
Bigram model produced:
  he could definately do the aged some day .


## Part 5

In [43]:
# Part 5: Artificial Test Dataset (class + usage example)
# ------------------------------------------------------------------
# Before running this cell, make sure you have already defined `test_corpus` 
# (e.g., via:
#    train_corpus, val_corpus, test_corpus = load_and_split_corpus(
#        corpus_name='reuters', min_sentences=90000)
# ) so that `test_corpus` is a List[str] of clean sentences.
class ArtificialTestDataset:
    def __init__(self, sentences, error_prob=0.05, seed=None):
        """
        Initialize the artificial test dataset generator.

        Args:
            sentences (List[str]): List of clean sentences to corrupt.
            error_prob (float): Probability of replacing each non-space character.
            seed (int, optional): Random seed for reproducibility.
        """
        self.sentences = sentences
        self.error_prob = error_prob
        if seed is not None:
            random.seed(seed)
        # Character set for random replacements (excluding space)
        self.chars = list(string.ascii_letters + string.digits + string.punctuation)

    def _corrupt_char(self, c):
        # Do not corrupt whitespace; apply corruption with given probability
        if c.isspace() or random.random() > self.error_prob:
            return c
        # Choose a random replacement different from the original
        replacement = random.choice(self.chars)
        while replacement == c:
            replacement = random.choice(self.chars)
        return replacement

    def generate(self):
        """
        Generate the corrupted dataset.

        Returns:
            List[str]: Corrupted sentences.
        """
        corrupted = []
        for sentence in self.sentences:
            corrupted_sentence = ''.join(self._corrupt_char(c) for c in sentence)
            corrupted.append(corrupted_sentence)
        return corrupted

In [44]:
# Now, with test_corpus loaded, generate the corrupted dataset and display samples:
generator = ArtificialTestDataset(test_corpus, error_prob=0.05, seed=42)
corrupted_test_corpus = generator.generate()

print("Showing original vs. corrupted for first 5 sentences:")
print("=" * 60)
for orig, corrupt in zip(test_corpus[:5], corrupted_test_corpus[:5]):
    print(f"Original:  {orig}")
    print(f"Corrupted: {corrupt}")
    print("-" * 60)

Showing original vs. corrupted for first 5 sentences:
Original:  O beautiful for patriot dream that sees beyond the years thine alabaster cities gleam undimmed by human tears .
Corrupted: O Jeautifll fzr patriot dream that 6ees beyond Dhe years thine alabaster cities gleam undimmed by human tears .
------------------------------------------------------------
Original:  ( cf.
Corrupted: ( cf.
------------------------------------------------------------
Original:  The Village office of Western Union with George Towsley as manager and telegrapher continued in Hard's drugstore until 1905 .
Corrupted: The Villag} office of WesterP Union with George Towsley as manager )nd telegrapher continued in Hard's drugstore until 1905 .
------------------------------------------------------------
Original:  As if this was a signal , Poet abruptly began to thrash the water and the quick movement slowly made them sink through the water .
Corrupted: As if this was a signal , Poet abruptly began to thrash 

# MAIN

In [184]:
def load_and_split_corpus(corpus_name='reuters', min_sentences=-1):
    """
    Load and split a corpus from NLTK into train, validation, and test sets.

    Args:
        corpus_name: Name of the corpus to load
        min_sentences: Minimum number of sentences to include

    Returns:
        Tuple of (train_corpus, val_corpus, test_corpus)
    """
    print(f"Loading {corpus_name} corpus...")

    if corpus_name == 'reuters':
        from nltk.corpus import reuters
        sentences = [" ".join(reuters.words(fileid)) for fileid in reuters.fileids()]

    elif corpus_name == 'brown':
        from nltk.corpus import brown
        sentences = [" ".join(brown.words(fileid)) for fileid in brown.fileids()]

    elif corpus_name == 'gutenberg':
        from nltk.corpus import gutenberg
        sentences = [" ".join(gutenberg.words(fileid)) for fileid in gutenberg.fileids()]

    elif corpus_name == 'all':
        from nltk.corpus import reuters, brown, gutenberg
        sentences = []

        # Combine texts from all three corpora
        sentences += [" ".join(reuters.words(fileid)) for fileid in reuters.fileids()]
        sentences += [" ".join(brown.words(fileid)) for fileid in brown.fileids()]
        sentences += [" ".join(gutenberg.words(fileid)) for fileid in gutenberg.fileids()]

    else:
        raise ValueError(f"Unknown corpus: {corpus_name}")

    # Break into actual sentences
    all_sentences = []
    for text in sentences:
        all_sentences.extend(nltk.sent_tokenize(text))


    # Ensure we have enough sentences
    if len(all_sentences) < min_sentences:
        raise ValueError(f"Corpus {corpus_name} has only {len(all_sentences)} sentences, "
                         f"which is less than the required {min_sentences}.")

    # Shuffle sentences
    random.seed(42)
    random.shuffle(all_sentences)

    # Take a subset for faster processing if needed
    sentences_subset = all_sentences[:min_sentences]

    # Split into train, validation, and test sets (70%, 15%, 15%)
    train_size = int(0.7 * len(sentences_subset))
    val_size = int(0.15 * len(sentences_subset))

    train_corpus = sentences_subset[:train_size]
    val_corpus = sentences_subset[train_size:train_size + val_size]
    test_corpus = sentences_subset[train_size + val_size:]

    print(f"Corpus split: {len(train_corpus)} train, {len(val_corpus)} validation, {len(test_corpus)} test sentences")

    return train_corpus, val_corpus, test_corpus

In [None]:
"""
Run the full language modeling experiment.
"""
# Part 1
# Load and split corpus
train_corpus, val_corpus, test_corpus = load_and_split_corpus(corpus_name='reuters', min_sentences=90000)

# Initialize and train models
bigram_model = NGramLanguageModel(n=2, min_freq=10)
bigram_model.train(train_corpus)

trigram_model = NGramLanguageModel(n=3, min_freq=10)
trigram_model.train(train_corpus)

# Ensure both models use the same vocabulary
common_vocab = bigram_model.vocabulary.intersection(trigram_model.vocabulary)
bigram_model.vocabulary = common_vocab
trigram_model.vocabulary = common_vocab
bigram_model.vocabulary_size = len(common_vocab)
trigram_model.vocabulary_size = len(common_vocab)

print(f"Common vocabulary size: {len(common_vocab)}")

# Part 2
# Calculate cross-entropy and perplexity on validation set
print("\nEvaluating on validation set...")

bigram_ce_val = calculate_cross_entropy(bigram_model, val_corpus)
bigram_ppl_val = calculate_perplexity(bigram_ce_val)

trigram_ce_val = calculate_cross_entropy(trigram_model, val_corpus)
trigram_ppl_val = calculate_perplexity(trigram_ce_val)

print(f"Bigram model - Cross-entropy: {bigram_ce_val:.4f}, Perplexity: {bigram_ppl_val:.4f}")
print(f"Trigram model - Cross-entropy: {trigram_ce_val:.4f}, Perplexity: {trigram_ppl_val:.4f}")

# Calculate cross-entropy and perplexity on test set
print("\nEvaluating on test set...")

bigram_ce_test = calculate_cross_entropy(bigram_model, test_corpus)
bigram_ppl_test = calculate_perplexity(bigram_ce_test)

trigram_ce_test = calculate_cross_entropy(trigram_model, test_corpus)
trigram_ppl_test = calculate_perplexity(trigram_ce_test)

print(f"Bigram model - Cross-entropy: {bigram_ce_test:.4f}, Perplexity: {bigram_ppl_test:.4f}")
print(f"Trigram model - Cross-entropy: {trigram_ce_test:.4f}, Perplexity: {trigram_ppl_test:.4f}")

# Part 3
# Generate text completions
print("\nGenerating text completions:")

prompts = [
"I would like to",
"The president of",
"According to recent",
"In the last few",
"Experts say that"
]

print("\nBigram model completions:")
for prompt in prompts:
  prompt_tokens = nltk.word_tokenize(prompt.lower())

  # Generate with greedy decoding
  completion_greedy = generate_text(bigram_model, prompt_tokens, method="greedy")
  completion_text_greedy = prompt + " " + " ".join([w for w in completion_greedy if w != bigram_model.END])
  print(f"[Greedy] {completion_text_greedy}")

  # Generate with top-k sampling
  completion_topk = generate_text(bigram_model, prompt_tokens, method="topk", top_k=5, temperature=0.7)
  completion_text_topk = prompt + " " + " ".join([w for w in completion_topk if w != bigram_model.END])
  print(f"[Top-K] {completion_text_topk}")

  # Generate with beam search
  beam_completions = beam_search(bigram_model, prompt_tokens, beam_width=3)
  top_beam = beam_completions[0]
  completion_text_beam = prompt + " " + " ".join([w for w in top_beam if w != bigram_model.END])
  print(f"[Beam] {completion_text_beam}")
  print()

print("\nTrigram model completions:")
for prompt in prompts:
  prompt_tokens = nltk.word_tokenize(prompt.lower())

  # Generate with greedy decoding
  completion_greedy = generate_text(trigram_model, prompt_tokens, method="greedy")
  completion_text_greedy = prompt + " " + " ".join([w for w in completion_greedy if w != trigram_model.END])
  print(f"[Greedy] {completion_text_greedy}")

  # Generate with top-k sampling
  completion_topk = generate_text(trigram_model, prompt_tokens, method="topk", top_k=5, temperature=0.7)
  completion_text_topk = prompt + " " + " ".join([w for w in completion_topk if w != trigram_model.END])
  print(f"[Top-K] {completion_text_topk}")

  # Generate with beam search
  beam_completions = beam_search(trigram_model, prompt_tokens, beam_width=3)
  top_beam = beam_completions[0]
  completion_text_beam = prompt + " " + " ".join([w for w in top_beam if w != trigram_model.END])
  print(f"[Beam] {completion_text_beam}")
  print()