## Intro

In [5]:
import nltk
import re
import math
import random
import numpy as np
from nltk.corpus import reuters
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set
from tqdm import tqdm

# Download necessary resources
nltk.download('reuters')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/michaeltheophanopoulos/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaeltheophanopoulos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/michaeltheophanopoulos/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Part 1

In [6]:
# PART 1: N-GRAM LANGUAGE MODEL IMPLEMENTATION
# ===========================================

class NGramLanguageModel:
    def __init__(self, n: int, min_freq: int = 10):
        """
        Initialize an n-gram language model.

        Args:
            n: The size of n-grams (2 for bigram, 3 for trigram)
            min_freq: Minimum frequency to include a word in vocabulary
        """
        self.n = n
        self.min_freq = min_freq

        # Main model components
        self.vocabulary = set()  # Words in the vocabulary
        self.word_counts = Counter()  # Counts of individual words
        self.ngram_counts = defaultdict(Counter)  # Counts of n-grams
        self.context_counts = defaultdict(int)  # Counts of (n-1)-grams (contexts)

        # Model constants
        self.UNK = "<UNK>"  # Out-of-vocabulary token
        self.END = "<end>"  # End of sentence token

        # Different start tokens for different n values
        if n == 2:
            self.START = ["<start>"]
        elif n == 3:
            self.START = ["<start1>", "<start2>"]
        else:
            self.START = [f"<start{i}>" for i in range(1, n)]

        # Statistics
        self.total_sentences = 0
        self.total_tokens = 0
        self.vocabulary_size = 0

    def preprocess_text(self, sentences: List[str]) -> List[List[str]]:
        """
        Preprocess raw sentences into tokenized form.

        Args:
            sentences: List of raw text sentences

        Returns:
            List of tokenized sentences
        """
        tokenized_sentences = []

        for sentence in sentences:
            # Clean and tokenize the sentence
            clean_sentence = sentence.lower().strip()
            tokens = nltk.word_tokenize(clean_sentence)
            tokenized_sentences.append(tokens)

        return tokenized_sentences

    def build_vocabulary(self, tokenized_sentences: List[List[str]]) -> Set[str]:
        """
        Build vocabulary from tokenized sentences based on minimum frequency.

        Args:
            tokenized_sentences: List of tokenized sentences

        Returns:
            Set of vocabulary words
        """
        # Count word occurrences
        word_counter = Counter()
        for sentence in tokenized_sentences:
            word_counter.update(sentence)

        # Create vocabulary with words that meet minimum frequency
        vocabulary = {word for word, count in word_counter.items()
                     if count >= self.min_freq}

        # Always add special tokens to vocabulary
        vocabulary.add(self.UNK)
        vocabulary.add(self.END)
        for token in self.START:
            vocabulary.add(token)

        return vocabulary

    def replace_oov_words(self, tokenized_sentences: List[List[str]]) -> List[List[str]]:
        """
        Replace out-of-vocabulary words with UNK token.

        Args:
            tokenized_sentences: List of tokenized sentences

        Returns:
            List of tokenized sentences with OOV words replaced
        """
        processed_sentences = []

        for sentence in tokenized_sentences:
            processed_sentence = []
            for token in sentence:
                if token in self.vocabulary:
                    processed_sentence.append(token)
                else:
                    processed_sentence.append(self.UNK)
            processed_sentences.append(processed_sentence)

        return processed_sentences

    def extract_ngrams(self, tokenized_sentences: List[List[str]]) -> None:
        """
        Extract n-grams from tokenized sentences and count their occurrences.

        Args:
            tokenized_sentences: List of tokenized sentences with OOV words replaced
        """
        for sentence in tokenized_sentences:
            # Add start and end tokens
            augmented_sentence = self.START + sentence + [self.END]
            self.total_tokens += len(sentence) + 1  # +1 for END token

            # Count individual words (unigrams)
            self.word_counts.update(augmented_sentence)

            # Extract and count n-grams
            for i in range(len(augmented_sentence) - self.n + 1):
                ngram = tuple(augmented_sentence[i:i + self.n])
                prefix = ngram[:-1]  # Context (n-1 gram)
                word = ngram[-1]     # Word being predicted

                self.ngram_counts[prefix][word] += 1
                self.context_counts[prefix] += 1

    def train(self, corpus: List[str]) -> None:
        """
        Train the n-gram language model on the provided corpus.

        Args:
            corpus: List of sentences
        """
        self.total_sentences = len(corpus)
        print(f"Training {self.n}-gram model on {self.total_sentences} sentences...")

        # Preprocess the corpus
        tokenized_sentences = self.preprocess_text(corpus)

        # Build vocabulary
        self.vocabulary = self.build_vocabulary(tokenized_sentences)
        self.vocabulary_size = len(self.vocabulary)
        print(f"Vocabulary size: {self.vocabulary_size} words")

        # Replace OOV words
        processed_sentences = self.replace_oov_words(tokenized_sentences)

        # Extract n-grams
        self.extract_ngrams(processed_sentences)

        print(f"Extracted {sum(len(counts) for counts in self.ngram_counts.values())} unique {self.n}-grams")
        print(f"Total tokens in corpus: {self.total_tokens}")

    def get_laplace_probability(self, word: str, context: tuple) -> float:
        """
        Calculate Laplace-smoothed probability P(word|context).

        Args:
            word: The word to calculate probability for
            context: The preceding (n-1) words

        Returns:
            The conditional probability P(word|context)
        """
        # Get counts with Laplace smoothing
        count_ngram = self.ngram_counts[context][word]
        count_context = self.context_counts[context]

        # Apply Laplace smoothing (+1 to numerator, +V to denominator)
        probability = (count_ngram + 1) / (count_context + self.vocabulary_size)

        return probability

    def get_log_probability(self, word: str, context: tuple) -> float:
        """
        Calculate log probability log(P(word|context)).

        Args:
            word: The word to calculate probability for
            context: The preceding (n-1) words

        Returns:
            The log probability log(P(word|context))
        """
        probability = self.get_laplace_probability(word, context)
        return math.log2(probability)

    def get_sentence_log_probability(self, sentence: List[str]) -> float:
        """
        Calculate the log probability of a sentence.

        Args:
            sentence: List of tokens in the sentence

        Returns:
            The log probability of the sentence
        """
        # Replace OOV words with UNK
        processed_sentence = [token if token in self.vocabulary else self.UNK for token in sentence]

        # Add start and end tokens
        augmented_sentence = self.START + processed_sentence + [self.END]

        log_prob = 0.0

        # Calculate log probability for each word given its context
        for i in range(len(self.START), len(augmented_sentence)):
            word = augmented_sentence[i]
            context = tuple(augmented_sentence[i - self.n + 1:i])

            log_prob += self.get_log_probability(word, context)

        return log_prob

## Part 2

In [7]:
# PART 2: CROSS-ENTROPY AND PERPLEXITY EVALUATION
# ==============================================

def calculate_cross_entropy(model: NGramLanguageModel, test_corpus: List[str]) -> float:
    """
    Calculate cross-entropy of a language model on a test corpus.

    Args:
        model: Trained language model
        test_corpus: List of test sentences

    Returns:
        Cross-entropy value
    """
    # Preprocess test corpus
    tokenized_sentences = model.preprocess_text(test_corpus)

    # Replace OOV words
    processed_sentences = model.replace_oov_words(tokenized_sentences)

    total_log_prob = 0.0
    total_tokens = 0

    # Calculate log probability for each sentence
    for sentence in processed_sentences:
        # We count end tokens but not start tokens in the total length
        total_tokens += len(sentence) + 1  # +1 for END token

        # Add start and end tokens
        augmented_sentence = model.START + sentence + [model.END]

        # Sum log probabilities for each word given its context
        for i in range(len(model.START), len(augmented_sentence)):
            word = augmented_sentence[i]
            context = tuple(augmented_sentence[i - model.n + 1:i])

            # Get log probability
            log_prob = model.get_log_probability(word, context)
            total_log_prob += log_prob

    # Calculate cross-entropy
    cross_entropy = -total_log_prob / total_tokens

    return cross_entropy

In [8]:
def calculate_perplexity(cross_entropy: float) -> float:
    """
    Calculate perplexity from cross-entropy.

    Args:
        cross_entropy: Cross-entropy value

    Returns:
        Perplexity value
    """
    return 2 ** cross_entropy

## Part 3

In [9]:
def generate_text(model: NGramLanguageModel,
                 prompt: List[str],
                 max_length: int = 20,
                 method: str = "greedy",
                 top_k: int = 5,
                 temperature: float = 1.0) -> List[str]:
    """
    Generate text continuation based on the prompt.

    Args:
        model: Trained language model
        prompt: Initial words to continue from
        max_length: Maximum length of the generated sequence
        method: Generation method - "greedy", "topk", or "nucleus"
        top_k: Number of top candidates to consider for sampling
        temperature: Controls randomness (higher = more random)

    Returns:
        List of words completing the prompt
    """
    # Process the prompt
    processed_prompt = [word if word in model.vocabulary else model.UNK for word in prompt]

    # Initialize with start tokens + prompt
    generated_text = model.START + processed_prompt

    # Generate text until we reach max_length or end token
    for _ in range(max_length):
        # Get the most recent (n-1) words as context
        context = tuple(generated_text[-(model.n - 1):])

        # Get next word based on the specified method
        if method == "greedy":
            next_word = get_next_word_greedy(model, context)
        elif method == "topk":
            next_word = get_next_word_topk(model, context, top_k, temperature)
        elif method == "nucleus":
            next_word = get_next_word_nucleus(model, context, p=0.9, temperature=temperature)
        else:
            raise ValueError(f"Unknown generation method: {method}")

        # Add the generated word to the sequence
        generated_text.append(next_word)

        # Stop if we generated the end token
        if next_word == model.END:
            break

    # Return only the newly generated part (excluding start tokens and prompt)
    return generated_text[len(model.START) + len(processed_prompt):]

In [10]:
def get_next_word_greedy(model: NGramLanguageModel, context: tuple) -> str:
    """
    Get the most probable next word given the context.

    Args:
        model: Trained language model
        context: Current context ((n-1) preceding words)

    Returns:
        Most probable next word
    """
    # Get probabilities for all words in the vocabulary
    candidates = {}

    for word in model.vocabulary:
        # Skip UNK token for generation
        if word == model.UNK:
            continue

        prob = model.get_laplace_probability(word, context)
        candidates[word] = prob

    # Return the word with the highest probability
    return max(candidates.items(), key=lambda x: x[1])[0]

In [12]:
def get_next_word_topk(model: NGramLanguageModel,
                      context: tuple,
                      k: int = 5,
                      temperature: float = 1.0) -> str:
    """
    Sample next word from top-k most probable words.

    Args:
        model: Trained language model
        context: Current context ((n-1) preceding words)
        k: Number of top candidates to consider
        temperature: Controls randomness (higher = more random)

    Returns:
        Sampled next word
    """
    # Get probabilities for all words in the vocabulary
    candidates = {}

    for word in model.vocabulary:
        # Skip UNK token for generation
        if word == model.UNK:
            continue

        prob = model.get_laplace_probability(word, context)
        candidates[word] = prob

    # Get top-k candidates
    top_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:k]

    # Apply temperature scaling
    if temperature != 1.0:
        probs = np.array([prob for _, prob in top_candidates])
        probs = np.power(probs, 1.0 / temperature)
        probs = probs / np.sum(probs)
    else:
        probs = np.array([prob for _, prob in top_candidates])
        probs = probs / np.sum(probs)

    # Sample from the distribution
    words = [word for word, _ in top_candidates]
    next_word = np.random.choice(words, p=probs)

    return next_word

In [13]:
def get_next_word_nucleus(model: NGramLanguageModel,
                         context: tuple,
                         p: float = 0.9,
                         temperature: float = 1.0) -> str:
    """
    Nucleus (top-p) sampling for next word prediction.

    Args:
        model: Trained language model
        context: Current context ((n-1) preceding words)
        p: Cumulative probability threshold
        temperature: Controls randomness (higher = more random)

    Returns:
        Sampled next word
    """
    # Get probabilities for all words in the vocabulary
    candidates = {}

    for word in model.vocabulary:
        # Skip UNK token for generation
        if word == model.UNK:
            continue

        prob = model.get_laplace_probability(word, context)
        candidates[word] = prob

    # Sort candidates by probability
    sorted_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)

    # Apply temperature scaling
    if temperature != 1.0:
        probs = np.array([prob for _, prob in sorted_candidates])
        probs = np.power(probs, 1.0 / temperature)
        probs = probs / np.sum(probs)
    else:
        probs = np.array([prob for _, prob in sorted_candidates])
        probs = probs / np.sum(probs)

    # Calculate cumulative probabilities
    cumulative_probs = np.cumsum(probs)

    # Find smallest set of words with cumulative probability >= p
    cutoff_idx = np.where(cumulative_probs >= p)[0][0] + 1

    # Select only those candidates
    top_p_candidates = sorted_candidates[:cutoff_idx]

    # Re-normalize probabilities
    top_p_probs = np.array([prob for _, prob in top_p_candidates])
    top_p_probs = top_p_probs / np.sum(top_p_probs)

    # Sample from the distribution
    words = [word for word, _ in top_p_candidates]
    next_word = np.random.choice(words, p=top_p_probs)

    return next_word

In [14]:
def beam_search(model: NGramLanguageModel,
               prompt: List[str],
               beam_width: int = 5,
               max_length: int = 20) -> List[List[str]]:
    """
    Beam search for text generation.

    Args:
        model: Trained language model
        prompt: Initial words to continue from
        beam_width: Beam width
        max_length: Maximum length of the generated sequence

    Returns:
        List of generated sequences (beams)
    """
    # Process the prompt
    processed_prompt = [word if word in model.vocabulary else model.UNK for word in prompt]

    # Initialize beams with start tokens + prompt
    initial_sequence = model.START + processed_prompt
    beams = [(initial_sequence, 0.0)]  # (sequence, log_prob)

    # Generate for max_length steps
    for _ in range(max_length):
        new_beams = []

        # Expand each beam
        for sequence, score in beams:
            # If the sequence ended, keep it as is
            if sequence[-1] == model.END:
                new_beams.append((sequence, score))
                continue

            # Get context
            context = tuple(sequence[-(model.n - 1):])

            # Calculate probabilities for all possible next words
            candidates = {}
            for word in model.vocabulary:
                # Skip UNK token for generation
                if word == model.UNK:
                    continue

                log_prob = model.get_log_probability(word, context)
                candidates[word] = log_prob

            # Get top candidates
            top_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:beam_width]

            # Create new beams with expanded sequences
            for word, log_prob in top_candidates:
                new_sequence = sequence + [word]
                new_score = score + log_prob
                new_beams.append((new_sequence, new_score))

        # Select top beams
        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

        # Check if all beams have ended
        if all(sequence[-1] == model.END for sequence, _ in beams):
            break

    # Return only the newly generated parts (excluding start tokens and prompt)
    start_len = len(model.START) + len(processed_prompt)
    return [sequence[start_len:] for sequence, _ in beams]

## Part 4

In [None]:
def log_prob(p: float) -> float:
    return math.log(p) if p > 0 else float('-inf')

def context_aware_spelling_corrector(model: NGramLanguageModel,
                                    noisy_sentence: List[str],
                                    beam_width: int = 5, lambda_lm: float = 0.8, 
                                    lambda_err: float = 0.2) -> List[Tuple[str, float]]:
    """
    Context-aware spelling correction using noisy channel model and beam search.

    Args:
        model: Trained n-gram language model
        noisy_sentence: List of tokens with possible typos
        beam_width: Beam width for beam search
        lambda_lm: Weight for language model score
        lambda_err: Weight for error model score

    Returns:
        Best corrected sentence as a list of tokens
    """
    input_tokens = model.replace_oov_words(noisy_sentence)
    # Initialize beams that holds (sequence_so_far, total_log_score)
    beams = [(model.START, 0.0)]

    for i, noisy_token in enumerate(input_tokens):
        new_beams = []

        for sequence, score in beams:
            context = tuple(sequence[-(model.n - 1):])

            for candidate in model.vocabulary:
                if candidate == model.UNK:
                    continue
              
                lm_logp = model.get_log_probability(candidate, context)

                # Calculate the error model score (e.g., Levenshtein distance)
                from nltk.metrics.distance import edit_distance
                ld = edit_distance(noisy_token, candidate)
                err_logp = log_prob(1 / (ld + 1))

                total_score = score + lambda_lm * lm_logp + lambda_err * err_logp

                new_sequence = sequence + [candidate]
                new_beams.append((new_sequence, total_score))

        import heapq
        beams = heapq.nlargest(beam_width, new_beams, key=lambda x: x[1])

    best_sequence = max(beams, key=lambda x: x[1])[0]
    # Exclude start tokens
    return best_sequence[len(model.START):]

train_corpus, val_corpus, test_corpus = load_and_split_corpus(corpus_name='reuters', min_sentences=90000)

# Initialize and train models
bigram_model = NGramLanguageModel(n=2, min_freq=10)
bigram_model.train(train_corpus)

trigram_model = NGramLanguageModel(n=3, min_freq=10)
trigram_model.train(train_corpus)

Loading reuters corpus...
Corpus split: 62999 train, 13500 validation, 13501 test sentences
Training 2-gram model on 62999 sentences...
Vocabulary size: 6273 words
Extracted 204067 unique 2-grams
Total tokens in corpus: 1156241
['the', 'u', '.', '<end>']
Training 3-gram model on 62999 sentences...
Vocabulary size: 6274 words
Extracted 530702 unique 3-grams
Total tokens in corpus: 1156241
['the', 'company', "'", 's']


In [20]:
print(context_aware_spelling_corrector(bigram_model, ["thsi", "is", "a", "test"], beam_width=5))
print(context_aware_spelling_corrector(trigram_model, ["thsi", "is", "a", "test"], beam_width=5))


['the', 'u', '.', '<end>']
['the', 'company', "'", 's']


# MAIN

In [15]:
def load_and_split_corpus(corpus_name='reuters', min_sentences=100000):
    """
    Load and split a corpus from NLTK into train, validation, and test sets.

    Args:
        corpus_name: Name of the corpus to load
        min_sentences: Minimum number of sentences to include

    Returns:
        Tuple of (train_corpus, val_corpus, test_corpus)
    """
    print(f"Loading {corpus_name} corpus...")

    if corpus_name == 'reuters':
        from nltk.corpus import reuters
        sentences = [" ".join(reuters.words(fileid)) for fileid in reuters.fileids()]

        # Break into actual sentences
        all_sentences = []
        for text in sentences:
            all_sentences.extend(nltk.sent_tokenize(text))

    elif corpus_name == 'brown':
        from nltk.corpus import brown
        sentences = [" ".join(brown.words(fileid)) for fileid in brown.fileids()]

        # Break into actual sentences
        all_sentences = []
        for text in sentences:
            all_sentences.extend(nltk.sent_tokenize(text))

    else:
        raise ValueError(f"Unknown corpus: {corpus_name}")

    # Ensure we have enough sentences
    if len(all_sentences) < min_sentences:
        raise ValueError(f"Corpus {corpus_name} has only {len(all_sentences)} sentences, "
                         f"which is less than the required {min_sentences}.")

    # Shuffle sentences
    random.seed(42)
    random.shuffle(all_sentences)

    # Take a subset for faster processing if needed
    sentences_subset = all_sentences[:min_sentences]

    # Split into train, validation, and test sets (70%, 15%, 15%)
    train_size = int(0.7 * len(sentences_subset))
    val_size = int(0.15 * len(sentences_subset))

    train_corpus = sentences_subset[:train_size]
    val_corpus = sentences_subset[train_size:train_size + val_size]
    test_corpus = sentences_subset[train_size + val_size:]

    print(f"Corpus split: {len(train_corpus)} train, {len(val_corpus)} validation, {len(test_corpus)} test sentences")

    return train_corpus, val_corpus, test_corpus

In [None]:
"""
Run the full language modeling experiment.
"""
# Part 1
# Load and split corpus
train_corpus, val_corpus, test_corpus = load_and_split_corpus(corpus_name='reuters', min_sentences=90000)

# Initialize and train models
bigram_model = NGramLanguageModel(n=2, min_freq=10)
bigram_model.train(train_corpus)

trigram_model = NGramLanguageModel(n=3, min_freq=10)
trigram_model.train(train_corpus)

# Ensure both models use the same vocabulary
common_vocab = bigram_model.vocabulary.intersection(trigram_model.vocabulary)
bigram_model.vocabulary = common_vocab
trigram_model.vocabulary = common_vocab
bigram_model.vocabulary_size = len(common_vocab)
trigram_model.vocabulary_size = len(common_vocab)

print(f"Common vocabulary size: {len(common_vocab)}")

# Part 2
# Calculate cross-entropy and perplexity on validation set
print("\nEvaluating on validation set...")

bigram_ce_val = calculate_cross_entropy(bigram_model, val_corpus)
bigram_ppl_val = calculate_perplexity(bigram_ce_val)

trigram_ce_val = calculate_cross_entropy(trigram_model, val_corpus)
trigram_ppl_val = calculate_perplexity(trigram_ce_val)

print(f"Bigram model - Cross-entropy: {bigram_ce_val:.4f}, Perplexity: {bigram_ppl_val:.4f}")
print(f"Trigram model - Cross-entropy: {trigram_ce_val:.4f}, Perplexity: {trigram_ppl_val:.4f}")

# Calculate cross-entropy and perplexity on test set
print("\nEvaluating on test set...")

bigram_ce_test = calculate_cross_entropy(bigram_model, test_corpus)
bigram_ppl_test = calculate_perplexity(bigram_ce_test)

trigram_ce_test = calculate_cross_entropy(trigram_model, test_corpus)
trigram_ppl_test = calculate_perplexity(trigram_ce_test)

print(f"Bigram model - Cross-entropy: {bigram_ce_test:.4f}, Perplexity: {bigram_ppl_test:.4f}")
print(f"Trigram model - Cross-entropy: {trigram_ce_test:.4f}, Perplexity: {trigram_ppl_test:.4f}")

# Part 3
# Generate text completions
print("\nGenerating text completions:")

prompts = [
"I would like to",
"The president of",
"According to recent",
"In the last few",
"Experts say that"
]

print("\nBigram model completions:")
for prompt in prompts:
  prompt_tokens = nltk.word_tokenize(prompt.lower())

  # Generate with greedy decoding
  completion_greedy = generate_text(bigram_model, prompt_tokens, method="greedy")
  completion_text_greedy = prompt + " " + " ".join([w for w in completion_greedy if w != bigram_model.END])
  print(f"[Greedy] {completion_text_greedy}")

  # Generate with top-k sampling
  completion_topk = generate_text(bigram_model, prompt_tokens, method="topk", top_k=5, temperature=0.7)
  completion_text_topk = prompt + " " + " ".join([w for w in completion_topk if w != bigram_model.END])
  print(f"[Top-K] {completion_text_topk}")

  # Generate with beam search
  beam_completions = beam_search(bigram_model, prompt_tokens, beam_width=3)
  top_beam = beam_completions[0]
  completion_text_beam = prompt + " " + " ".join([w for w in top_beam if w != bigram_model.END])
  print(f"[Beam] {completion_text_beam}")
  print()

print("\nTrigram model completions:")
for prompt in prompts:
  prompt_tokens = nltk.word_tokenize(prompt.lower())

  # Generate with greedy decoding
  completion_greedy = generate_text(trigram_model, prompt_tokens, method="greedy")
  completion_text_greedy = prompt + " " + " ".join([w for w in completion_greedy if w != trigram_model.END])
  print(f"[Greedy] {completion_text_greedy}")

  # Generate with top-k sampling
  completion_topk = generate_text(trigram_model, prompt_tokens, method="topk", top_k=5, temperature=0.7)
  completion_text_topk = prompt + " " + " ".join([w for w in completion_topk if w != trigram_model.END])
  print(f"[Top-K] {completion_text_topk}")

  # Generate with beam search
  beam_completions = beam_search(trigram_model, prompt_tokens, beam_width=3)
  top_beam = beam_completions[0]
  completion_text_beam = prompt + " " + " ".join([w for w in top_beam if w != trigram_model.END])
  print(f"[Beam] {completion_text_beam}")
  print()

Loading reuters corpus...
Corpus split: 62999 train, 13500 validation, 13501 test sentences
Training 2-gram model on 62999 sentences...
Vocabulary size: 6273 words
Extracted 204067 unique 2-grams
Total tokens in corpus: 1156241
Training 3-gram model on 62999 sentences...
Vocabulary size: 6274 words
Extracted 530702 unique 3-grams
Total tokens in corpus: 1156241
Common vocabulary size: 6272

Evaluating on validation set...
Bigram model - Cross-entropy: 7.8639, Perplexity: 232.9501
Trigram model - Cross-entropy: 9.9089, Perplexity: 961.3537

Evaluating on test set...
Bigram model - Cross-entropy: 7.8611, Perplexity: 232.5016
Trigram model - Cross-entropy: 9.9101, Perplexity: 962.1620

Generating text completions:

Bigram model completions:
[Greedy] I would like to the company said .
[Top-K] I would like to be a share , 000 vs loss of the company said it is expected to be a share .
[Beam] I would like to the u .

[Greedy] The president of the company said .
[Top-K] The president of the co