**Home Exercise:**

a) Improve the model by using interpolation smoothing with the "Stupid Backoff" method (Brants et al., 2007).

b) Compare with the results from In Class Exercise.

c) Use the newly built model to generate the next words for a given word sequence.

d) Combine with a function that calculates the distance between words to predict the correct word for a misspelled word position. (from difflib import get_close_matches)

# Import

In [1]:
%pip install gdown matplotlib nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# import sys
# import os
# import platform

# # Python environment details
# print("Python executable being used:", sys.executable)
# print("Python version:", sys.version)

# # Operating System details
# print("Operating System:", platform.system())
# print("OS Version:", platform.version())
# print("OS Release:", platform.release())

# # Machine and architecture details
# print("Machine:", platform.machine())

# # Visual Studio Code details (based on environment variable)
# vscode_info = os.environ.get('VSCODE_PID', None)
# if vscode_info:
#     print("Running in Visual Studio Code")
# else:
#     print("Not running in Visual Studio Code")

Python executable being used: c:\Python312\python.exe
Python version: 3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]
Operating System: Windows
OS Version: 10.0.19045
OS Release: 10
Machine: AMD64
Running in Visual Studio Code


In [3]:
import os
from collections import defaultdict
import math
import gdown
import re
import matplotlib.pyplot as plt
from difflib import get_close_matches
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer model
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Function Definitions

## Load data

In [4]:
def clean_sentence(sentence):
    """
    Normalize one sentence before tokenization.

    Three regex passes run in order:
      1. drop parentheses and hyphens,
      2. drop everything that is not a word character, whitespace or one of `. , ! ?`,
      3. squeeze whitespace runs into a single space.
    The result is stripped of leading/trailing whitespace.

    Args:
        sentence (str): Raw sentence text.

    Returns:
        str: The cleaned sentence.
    """
    # (pattern, replacement) pairs; order matters because the last pass
    # tidies up the gaps left behind by the first two.
    passes = (
        (r'[\(\)-]', ''),
        (r'[^\w\s.,!?]', ''),
        (r'\s+', ' '),
    )

    text = sentence
    for pattern, replacement in passes:
        text = re.sub(pattern, replacement, text)

    return text.strip()


In [5]:
def load_data(filepath):
    """
    Read a UTF-8 text file and return its sentences, cleaned.

    The whole file is read into memory, split into sentences with nltk's
    `sent_tokenize`, and every sentence is passed through `clean_sentence`.

    Args:
        filepath (str): Path of the text file to read.

    Returns:
        list[str]: One cleaned string per detected sentence, in file order.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    # Sentence splitting happens on the raw text; cleaning is applied per sentence afterwards.
    return [clean_sentence(raw_sentence) for raw_sentence in sent_tokenize(raw_text)]


## Build n-gram model

In [6]:
def build_ngram_model(corpus, n):
    """
    Count n-grams and their (n-1)-gram contexts over a corpus.

    Sentences are tokenized on whitespace. For n > 1 every sentence is padded
    with (n - 1) "<s>" tokens in front and (n - 1) "</s>" tokens behind, so
    the padding symbols are part of the vocabulary for those orders.

    Args:
        corpus (iterable of str): Sentences to count over.
        n (int): Order of the model.

    Returns:
        tuple: (ngram_counts, n_minus_1_counts, vocab_size) where both count
        tables are defaultdict(int) keyed by token tuples and vocab_size is the
        number of distinct tokens seen (padding included when n > 1).
    """
    ngram_counts = defaultdict(int)
    n_minus_1_counts = defaultdict(int)
    vocabulary = set()

    # Unigram models are not padded.
    pad_width = n - 1 if n > 1 else 0

    for sentence in corpus:
        tokens = ["<s>"] * pad_width + sentence.split() + ["</s>"] * pad_width
        vocabulary.update(tokens)

        # Slide a window of width n; its first n-1 tokens form the context.
        for start in range(len(tokens) - n + 1):
            ngram_counts[tuple(tokens[start:start + n])] += 1
            n_minus_1_counts[tuple(tokens[start:start + n - 1])] += 1

    return ngram_counts, n_minus_1_counts, len(vocabulary)

## Compute Sentence Probabilities with laplace smoothed probabilities

In [7]:
def compute_laplace_probability(ngram, ngram_counts, n_minus_1_counts, vocab_size):
    """
    Compute the Laplace (add-one) smoothed probability of an n-gram.

    P = (count(ngram) + 1) / (count(context) + vocab_size)

    For a unigram the "context" count is the total number of counted unigrams.

    Args:
        ngram (tuple): The n-gram to score.
        ngram_counts (dict): Counts keyed by n-gram tuple.
        n_minus_1_counts (dict): Counts keyed by (n-1)-gram context tuple.
        vocab_size (int): Vocabulary size added to the denominator.

    Returns:
        float: The smoothed probability.
    """
    # Use .get() rather than indexing: the count tables are defaultdicts, and
    # indexing a missing key would insert it with count 0, silently growing
    # (and polluting) the model every time an unseen n-gram is scored.
    ngram_count = ngram_counts.get(ngram, 0)

    if len(ngram) > 1:
        context_count = n_minus_1_counts.get(ngram[:-1], 0)
    else:
        context_count = sum(ngram_counts.values())

    return (ngram_count + 1) / (context_count + vocab_size)

In [8]:
def sentence_probability(sentence, ngram_counts, n_minus_1_counts, vocab_size, n):
    """
    Score a sentence as the product of its Laplace-smoothed n-gram probabilities.

    For n > 1 the whitespace tokens are padded with (n - 1) "<s>" / "</s>"
    markers on each side. The padded tokens, every n-gram probability and the
    final product are printed as a trace.

    Args:
        sentence (str): Sentence to score.
        ngram_counts (dict): n-gram counts of the order-n model.
        n_minus_1_counts (dict): Context counts of the order-n model.
        vocab_size (int): Vocabulary size used for smoothing.
        n (int): Order of the model.

    Returns:
        float: Product of the per-n-gram probabilities (1.0 if there are none).
    """
    words = sentence.split()
    if n > 1:
        pad = n - 1
        tokens = ["<s>"] * pad + words + ["</s>"] * pad
    else:
        tokens = words

    print(f"Tokens with padding for n={n}: {tokens}")

    prob = 1.0
    last_start = len(tokens) - n
    start = 0
    while start <= last_start:
        ngram = tuple(tokens[start:start + n])
        ngram_prob = compute_laplace_probability(ngram, ngram_counts, n_minus_1_counts, vocab_size)
        print(f"N-gram: {ngram}, Probability: {ngram_prob}")
        prob *= ngram_prob
        start += 1

    print(f"Final sentence probability: {prob}")
    return prob


In [9]:
def compute_perplexity(sentence, ngram_counts, n_minus_1_counts, vocab_size, n):
    """
    Compute the perplexity of a sentence under a Laplace-smoothed n-gram model.

    perplexity = exp(-(1 / N) * sum(log P(ngram)))

    N is the number of n-grams actually scored. Padding tokens that only ever
    serve as context are not counted, which keeps this function consistent with
    `compute_perplexity_with_backoff`.

    Args:
        sentence (str): Sentence to evaluate.
        ngram_counts (dict): n-gram counts of the order-n model.
        n_minus_1_counts (dict): Context counts of the order-n model.
        vocab_size (int): Vocabulary size used for smoothing.
        n (int): Order of the model.

    Returns:
        float: The perplexity (lower means the model finds the sentence more likely).

    Raises:
        ValueError: If the sentence yields no n-gram to score
            (e.g. an empty sentence with a unigram model).
    """
    words = sentence.split()
    if n > 1:
        tokens = ["<s>"] * (n - 1) + words + ["</s>"] * (n - 1)
    else:
        tokens = words

    num_ngrams = len(tokens) - n + 1
    if num_ngrams <= 0:
        raise ValueError("Cannot compute perplexity: the sentence yields no n-grams to score.")

    log_prob = 0.0
    for i in range(num_ngrams):
        ngram = tuple(tokens[i:i + n])
        log_prob += math.log(compute_laplace_probability(ngram, ngram_counts, n_minus_1_counts, vocab_size))

    # Normalize by the number of scored events, not by the padded token count.
    return math.exp(-log_prob / num_ngrams)

## Compute Sentence Probabilities with Stupid Backoff method

In [10]:
def compute_stupid_backoff_probability(ngram, ngram_models, alpha=0.4):
    """
    Compute the Stupid Backoff score for a given n-gram.

    The score is the relative frequency count(ngram) / count(context) at the
    highest order where the n-gram was seen. Each time the n-gram is not found,
    the leftmost token is dropped and the lower-order score is multiplied by
    `alpha`. Scores are not normalized probabilities.

    Args:
        ngram (tuple): The n-gram to compute the score for.
        ngram_models (dict): A dictionary where keys are n (order of n-grams) and values are tuples of
                             (ngram_counts, n_minus_1_counts, vocab_size).
        alpha (float): The backoff factor (default is 0.4).

    Returns:
        float: The Stupid Backoff score of the n-gram. For an unseen single
        token the score is 1.0 for ('</s>',) when a unigram model exists, and
        `alpha` otherwise.

    Raises:
        ValueError: If `ngram` is empty.
    """
    if not ngram:
        raise ValueError("ngram must contain at least one token")

    order = len(ngram)
    model = ngram_models.get(order)

    if model:
        ngram_counts, n_minus_1_counts, _ = model
        # .get() keeps defaultdict-backed models from growing on lookups.
        ngram_count = ngram_counts.get(ngram, 0)

        if ngram_count > 0:
            if order > 1:
                context_count = n_minus_1_counts.get(ngram[:-1], 0)
            else:
                # Unigram: relative frequency over all counted tokens.
                context_count = sum(ngram_counts.values())

            if context_count > 0:
                return ngram_count / context_count

    # Not seen at this order: drop the oldest context token and discount.
    if order > 1:
        return alpha * compute_stupid_backoff_probability(ngram[1:], ngram_models, alpha)

    # From here on the token is an unseen unigram. (A seen unigram always
    # returns above, so no separate unigram relative-frequency branch is needed.)
    # The end marker is scored 1.0 when a unigram model exists; such a model
    # may not contain '</s>' at all if it was built without padding.
    if ngram == ('</s>',) and ngram_models.get(1):
        return 1.0

    # NOTE(review): returning alpha for an unknown word scores it far above any
    # seen word; Brants et al. (2007) define the unigram score as f(w)/N.
    # Kept as-is because the sentence-level scorers rely on a non-zero value.
    return alpha


In [None]:
def sentence_probability_with_backoff(sentence, ngram_models, alpha=0.4):
    """
    Score a sentence as the product of Stupid Backoff scores of its n-grams.

    The sentence is padded for the highest order present in `ngram_models`
    and scored with windows of that order. The padded tokens and the final
    product are printed.

    Args:
        sentence (str): Sentence to score.
        ngram_models (dict): Order -> (ngram_counts, n_minus_1_counts, vocab_size).
        alpha (float): Backoff factor passed to the scorer.

    Returns:
        float: Product of the per-window Stupid Backoff scores.
    """
    top_order = max(ngram_models.keys())
    pad = top_order - 1

    tokens = ["<s>"] * pad + sentence.split() + ["</s>"] * pad
    print(f"Tokens with padding for max_n={top_order}: {tokens}")

    prob = 1.0
    window_starts = range(len(tokens) - top_order + 1)
    for start in window_starts:
        window = tuple(tokens[start:start + top_order])
        prob *= compute_stupid_backoff_probability(window, ngram_models, alpha)

    print(f"Final sentence probability with backoff: {prob}")
    return prob


In [None]:
def compute_perplexity_with_backoff(sentence, ngram_models, alpha=0.4):
    """
    Compute a perplexity-style value for a sentence from Stupid Backoff scores.

    The sentence is padded for the highest order in `ngram_models`, each
    window of that order is scored, and the negative mean log score is
    exponentiated. A non-positive score contributes log(alpha) instead.
    The result is printed before it is returned.

    Args:
        sentence (str): Sentence to evaluate.
        ngram_models (dict): Order -> (ngram_counts, n_minus_1_counts, vocab_size).
        alpha (float): Backoff factor.

    Returns:
        float: exp(-mean log score) over the scored windows.
    """
    top_order = max(ngram_models.keys())
    pad = top_order - 1
    tokens = ["<s>"] * pad + sentence.split() + ["</s>"] * pad

    # Number of windows of width top_order.
    window_count = len(tokens) - pad

    total_log = 0.0
    for start in range(len(tokens) - top_order + 1):
        window = tuple(tokens[start:start + top_order])
        score = compute_stupid_backoff_probability(window, ngram_models, alpha)
        # log() is undefined for non-positive scores; substitute alpha there.
        total_log += math.log(score if score > 0 else alpha)

    perplexity = math.exp(-total_log / window_count)

    print(f"Final Perplexity with Backoff: {perplexity}")
    return perplexity


## Compare sentences by using smoothing with Laplace and with Stupid Backoff

In [13]:
def _more_likely_label(perplexity1, perplexity2):
    """Name the sentence with the lower perplexity (lower means more likely)."""
    if perplexity1 < perplexity2:
        return "Sentence 1 (Correct)"
    if perplexity2 < perplexity1:
        return "Sentence 2 (Incorrect)"
    return "Both sentences have equal probability"


def _comparison_entry(prob1, perplexity1, prob2, perplexity2):
    """Bundle both sentences' scores, the verdict and the absolute differences."""
    return {
        "sentence1": {
            "probability": prob1,
            "perplexity": perplexity1,
        },
        "sentence2": {
            "probability": prob2,
            "perplexity": perplexity2,
        },
        "higher_probability": _more_likely_label(perplexity1, perplexity2),
        "probability_difference": abs(prob1 - prob2),
        "perplexity_difference": abs(perplexity1 - perplexity2),
    }


def compare_sentences_with_smoothing(sentence1, sentence2, ngram_models, alpha=0.4):
    """
    Compare probabilities and perplexities of two sentences using both Laplace and Stupid Backoff.

    Laplace smoothing is evaluated once per model order; Stupid Backoff is
    evaluated once, starting from the highest order. In both cases the
    "higher_probability" verdict is decided by the lower perplexity.

    Args:
        sentence1 (str): First sentence (labelled "Correct" in the verdict).
        sentence2 (str): Second sentence (labelled "Incorrect" in the verdict).
        ngram_models (dict): Order -> (ngram_counts, n_minus_1_counts, vocab_size).
        alpha (float): Backoff factor for Stupid Backoff.

    Returns:
        dict: {"Laplace": {"<n>-gram": entry, ...}, "Stupid Backoff": entry}
        where each entry holds "sentence1", "sentence2", "higher_probability",
        "probability_difference" and "perplexity_difference".
    """
    results = {"Laplace": {}, "Stupid Backoff": {}}

    # Walk the orders that actually exist instead of assuming keys 1..len(models).
    for n in sorted(ngram_models):
        ngram_counts, n_minus_1_counts, vocab_size = ngram_models[n]

        print("-----------------------------------")
        print("DEBUG: Comparing sentences using Laplace smoothing for", n, "-gram model")

        sentence1_prob = sentence_probability(sentence1, ngram_counts, n_minus_1_counts, vocab_size, n)
        sentence2_prob = sentence_probability(sentence2, ngram_counts, n_minus_1_counts, vocab_size, n)

        sentence1_perplexity = compute_perplexity(sentence1, ngram_counts, n_minus_1_counts, vocab_size, n)
        sentence2_perplexity = compute_perplexity(sentence2, ngram_counts, n_minus_1_counts, vocab_size, n)

        results["Laplace"][f"{n}-gram"] = _comparison_entry(
            sentence1_prob, sentence1_perplexity, sentence2_prob, sentence2_perplexity
        )

    # Stupid Backoff always starts from the highest available order; report that
    # order explicitly rather than relying on the leftover loop variable.
    max_n = max(ngram_models)
    print("-----------------------------------")
    print("DEBUG: Comparing sentences using Stupid Backoff smoothing for", max_n, "-gram model")

    sentence1_prob_backoff = sentence_probability_with_backoff(sentence1, ngram_models, alpha)
    sentence1_perplexity_backoff = compute_perplexity_with_backoff(sentence1, ngram_models, alpha)

    print("\n")

    sentence2_prob_backoff = sentence_probability_with_backoff(sentence2, ngram_models, alpha)
    sentence2_perplexity_backoff = compute_perplexity_with_backoff(sentence2, ngram_models, alpha)

    results["Stupid Backoff"] = _comparison_entry(
        sentence1_prob_backoff,
        sentence1_perplexity_backoff,
        sentence2_prob_backoff,
        sentence2_perplexity_backoff,
    )

    return results


## Function for predicting the next top-k tokens

The Stupid Backoff method scores n-grams using relative frequencies but does **not normalize** probabilities to sum to 1. It uses a backoff mechanism to handle cases where an n-gram does not exist in the training data.

### Backoff Strategy
- Start with the highest-order n-gram (e.g., trigram).
- If the n-gram is **not found**, back off to the next lower-order n-gram (e.g., bigram).
- Continue until the unigram model is reached.
- If no match is found, return the **backoff factor** ($\alpha$).

### Backoff Factor ($\alpha$)
- $\alpha$ is a constant multiplier used for scoring lower-order n-grams.
- It is typically set to $0.4$, as per the original implementation of Stupid Backoff.

---

## 2. Combined Stupid Backoff with Optional Laplace Smoothing

The code combines **Stupid Backoff** with an option for **Laplace Smoothing** to handle zero probabilities.

### Steps:
1. **Start with the Highest n-gram**:
   - Look for the n-gram in the training data.
   - If found:
     - With Laplace smoothing, use the add-one estimate:  
       $
       P(w_i|w_{i-(n-1)}, ..., w_{i-1}) = \frac{\text{Count}(w_{i-(n-1)}, ..., w_{i-1}, w_i) + 1}{\text{Count}(w_{i-(n-1)}, ..., w_{i-1}) + V}
       $
       where $V$ is the vocabulary size.
     - Without Laplace smoothing, return the relative frequency:
       $
       P(w_i|w_{i-(n-1)}, ..., w_{i-1}) = \frac{\text{Count}(w_{i-(n-1)}, ..., w_{i-1}, w_i)}{\text{Count}(w_{i-(n-1)}, ..., w_{i-1})}
       $

2. **Backoff to Lower-Order n-grams**:
   - If the n-gram is not found, reduce the order (e.g., backoff from trigram to bigram).
   - Repeat the probability calculation for the lower-order n-gram.

3. **Fallback to Unigram**:
   - If no n-grams match, compute the unigram probability:
     - With Laplace Smoothing:
       $
       P(w_i) = \frac{\text{Count}(w_i) + 1}{\text{Total Words} + V}
       $
     - Without Laplace Smoothing:
       $
       P(w_i) = \frac{\text{Count}(w_i)}{\text{Total Words}}
       $

4. **Return Backoff Factor**:
   - If the word is entirely unseen, return $\alpha$

In [14]:
def predict_next_word(sequence, ngram_models, alpha=0.4, top_k=5):
    """
    Predict the next word for a given sequence using both Laplace smoothing and Stupid Backoff.

    Every model of order >= 2 is visited from the highest order down. For each
    order, all n-grams seen in training whose first n-1 tokens equal the last
    n-1 tokens of `sequence` contribute their final word as a candidate. A
    candidate's Laplace and Stupid Backoff scores are summed over every order in
    which it matches, so the returned values are ranking scores, not probabilities.

    Args:
        sequence (str): Whitespace-separated context.
        ngram_models (dict): Order -> (ngram_counts, n_minus_1_counts, vocab_size).
        alpha (float): Backoff factor for Stupid Backoff.
        top_k (int): Number of candidates to keep per method.

    Returns:
        dict: {"Laplace": [(word, score), ...], "Stupid Backoff": [(word, score), ...]},
        each list sorted by descending score and truncated to `top_k`.
    """
    sequence_tokens = sequence.split()
    candidates_laplace = defaultdict(float)
    candidates_backoff = defaultdict(float)

    print(f"\nInput sequence: '{sequence}'")
    print(f"Tokens: {sequence_tokens}")

    # Visit the orders that actually exist (highest first) instead of assuming
    # the keys are exactly 1..len(ngram_models). Unigrams carry no context.
    orders = sorted((order for order in ngram_models if order > 1), reverse=True)

    for n in orders:
        ngram_model = ngram_models.get(n)
        if not ngram_model:
            continue  # Skip if this n-gram model doesn't exist

        ngram_counts, n_minus_1_counts, vocab_size = ngram_model
        if len(sequence_tokens) < (n - 1):
            continue  # Not enough context for this n-gram

        prefix = tuple(sequence_tokens[-(n - 1):])  # Last n-1 tokens of the context

        print(f"\nSearching for {n}-grams with prefix: {prefix}")

        # Only n-grams that were really observed count as matches; zero-count
        # entries can exist if a defaultdict-backed table was indexed elsewhere.
        matching_ngrams = {
            ngram: count
            for ngram, count in ngram_counts.items()
            if count > 0 and ngram[:-1] == prefix
        }

        if not matching_ngrams:
            print(f"No {n}-grams found for prefix {prefix}, backing off...")
            continue  # Move to the next lower n-gram

        for ngram, count in matching_ngrams.items():
            word = ngram[-1]  # Candidate next word

            laplace_prob = compute_laplace_probability(ngram, ngram_counts, n_minus_1_counts, vocab_size)
            candidates_laplace[word] += laplace_prob

            backoff_prob = compute_stupid_backoff_probability(ngram, ngram_models, alpha)
            candidates_backoff[word] += backoff_prob

            print(f"Match found: {ngram}, Count: {count}, Laplace Score: {laplace_prob:.6f}, Backoff Score: {backoff_prob:.6f}")

    # Sort candidates by their scores
    sorted_candidates_laplace = sorted(candidates_laplace.items(), key=lambda x: x[1], reverse=True)
    sorted_candidates_backoff = sorted(candidates_backoff.items(), key=lambda x: x[1], reverse=True)

    print("\nTop candidate words (Laplace Smoothing):")
    for word, score in sorted_candidates_laplace[:top_k]:
        print(f"Word: {word}, Score: {score:.6f}")

    print("\nTop candidate words (Stupid Backoff):")
    for word, score in sorted_candidates_backoff[:top_k]:
        print(f"Word: {word}, Score: {score:.6f}")

    return {
        "Laplace": sorted_candidates_laplace[:top_k],
        "Stupid Backoff": sorted_candidates_backoff[:top_k]
    }


## Function for correcting the misspelled word

In [15]:
def correct_misspelled_word(word, vocabulary, ngram_models, context, alpha=0.4, top_k=5):
    """
    Suggest a replacement for a misspelled word, ranked by sentence probability.

    Up to `top_k` spelling candidates are taken verbatim from `vocabulary`
    with difflib's `get_close_matches`. Each candidate is appended to `context`
    and the resulting sentence is scored twice: with the highest-order
    Laplace-smoothed model and with Stupid Backoff. Both rankings are printed.

    Args:
        word (str): The misspelled word.
        vocabulary (iterable of str): Known tokens to draw candidates from.
        ngram_models (dict): Order -> (ngram_counts, n_minus_1_counts, vocab_size).
        context (str): Words preceding the misspelled position.
        alpha (float): Backoff factor for Stupid Backoff.
        top_k (int): Maximum number of spelling candidates to consider.

    Returns:
        dict: {"Laplace": best_word, "Stupid Backoff": best_word}. When no
        spelling candidate is found, both entries are the original `word`.
    """
    similar_words = get_close_matches(word, vocabulary, n=top_k)
    if not similar_words:
        return {"Laplace": word, "Stupid Backoff": word}

    top_order = max(ngram_models.keys())
    context_tokens = context.split()

    ranked_laplace = []
    ranked_backoff = []
    for candidate in similar_words:
        # Score the context with the candidate placed at the misspelled position.
        sequence = " ".join(context_tokens + [candidate])

        counts, context_counts, vocab_size = ngram_models[top_order]
        laplace_prob = sentence_probability(sequence, counts, context_counts, vocab_size, top_order)
        backoff_prob = sentence_probability_with_backoff(sequence, ngram_models, alpha)

        ranked_laplace.append((candidate, laplace_prob))
        ranked_backoff.append((candidate, backoff_prob))

    def by_score(pair):
        return pair[1]

    # Highest probability first.
    ranked_laplace.sort(key=by_score, reverse=True)
    ranked_backoff.sort(key=by_score, reverse=True)

    for method, ranking in (("Laplace", ranked_laplace), ("Stupid Backoff", ranked_backoff)):
        print(f"\nTop-{len(ranking)} Predictions for '{word}' ({method}):")
        for rank, (candidate, prob) in enumerate(ranking, start=1):
            print(f"  {rank}. {candidate} - Probability: {prob:.10e}")

    best_laplace = ranked_laplace[0][0] if ranked_laplace else word
    best_backoff = ranked_backoff[0][0] if ranked_backoff else word
    return {"Laplace": best_laplace, "Stupid Backoff": best_backoff}


## Function for building first n-gram models

In [16]:
def build_ngram_models(filepath, n):
    """
    Build every n-gram model from order 1 up to order `n` for one dataset.

    The corpus is loaded once with `load_data` and then counted once per order
    with `build_ngram_model`.

    Args:
        filepath (str): Path to the dataset.
        n (int): Highest order to build; orders 1..n are produced.

    Returns:
        dict: Order -> (ngram_counts, n_minus_1_counts, vocab_size).
    """
    corpus = load_data(filepath)

    return {
        order: build_ngram_model(corpus, order)
        for order in range(1, n + 1)
    }


# Main Function for doing the exercises

## Download tedtalk.txt

In [17]:
# Download tedtalk
url = "https://drive.google.com/file/d/1ZFXJVav0rZ0V2TadMuY0TxWuwxkhN-nq/view?usp=sharing"

def download_from_google_drive(url, output_filename=None):
    """
    Download a file from a Google Drive sharing link with gdown.

    The file ID is taken from the `/d/<id>` part of `url` and turned into a
    direct `uc?id=` link. If no ID can be found, an error is printed and
    nothing is downloaded.

    Args:
        url (str): Google Drive sharing URL containing `/d/<file id>`.
        output_filename (str, optional): Target file name; when omitted the
            name is left to gdown.
    """
    id_match = re.search(r"/d/([^/]+)", url)
    if id_match is None:
        print("Error: Could not extract file ID from the URL.")
        return

    file_id = id_match.group(1)
    print(f"Extracted File ID: {file_id}")

    download_url = f"https://drive.google.com/uc?id={file_id}"

    # Only pass an output name when the caller supplied one.
    positional = (download_url, output_filename) if output_filename else (download_url,)
    gdown.download(*positional, quiet=False)

# `url` is already defined at the top of this cell. Skip the ~40 MB download
# when the corpus is already on disk so that "Restart & Run All" stays cheap.
if os.path.exists("tedtalk.txt"):
    print("tedtalk.txt already exists, skipping download.")
else:
    download_from_google_drive(url, "tedtalk.txt")


Extracted File ID: 1ZFXJVav0rZ0V2TadMuY0TxWuwxkhN-nq


Downloading...
From: https://drive.google.com/uc?id=1ZFXJVav0rZ0V2TadMuY0TxWuwxkhN-nq
To: e:\2_LEARNING_BKU\2_File_2\K22_HK242\CO3085_NLP\BT\Lab03\tedtalk.txt
100%|██████████| 40.3M/40.3M [00:07<00:00, 5.05MB/s]


## Compare with the results from In Class Exercise.

This `build_ngram_models` function is the same as in the In Class Exercise, except that it now takes a parameter `n` and builds every model from 1-gram up to n-gram.

In the In Class Exercise the loop was fixed to run from 1-gram to 3-gram; generalizing the upper bound from 3 to `n` is only a small modification.

In [18]:
# Corpus file downloaded above, resolved against the current working directory.
dataset_path = os.path.join(os.getcwd(), "tedtalk.txt")

print("Building n-gram models...")
# Builds one model per order, from 1-gram up to 5-gram.
ngram_models = build_ngram_models(filepath=dataset_path, n=5)


Building n-gram models...


In [19]:
# Two sentences made of the same words: one in natural order, one shuffled.
correct_sentence = "the cat sat on the mat"
incorrect_sentence = "cat the the on mat sat"

comparison_results = compare_sentences_with_smoothing(correct_sentence, incorrect_sentence, ngram_models)


def _print_comparison(entry):
    """Print the summary lines of one comparison entry."""
    print(f"  Correct Sentence - Probability: {entry['sentence1']['probability']}, Perplexity: {entry['sentence1']['perplexity']}")
    print(f"  Incorrect Sentence - Probability: {entry['sentence2']['probability']}, Perplexity: {entry['sentence2']['perplexity']}")
    print(f"  Higher Probability: {entry['higher_probability']}")
    print(f"  Probability Difference: {entry['probability_difference']}")
    print(f"  Perplexity Difference: {entry['perplexity_difference']}")


print("\nComparison Results:")
for method, results in comparison_results.items():
    print(f"\n{method} Smoothing:")

    if method == "Laplace":
        # Laplace results are nested one level deeper: one entry per model order.
        for ngram, data in results.items():
            print(f"{ngram}:")
            _print_comparison(data)
    elif method == "Stupid Backoff":
        # Stupid Backoff yields a single entry.
        _print_comparison(results)

-----------------------------------
DEBUG: Comparing sentences using Laplace smoothing for 1 -gram model
Tokens with padding for n=1: ['the', 'cat', 'sat', 'on', 'the', 'mat']
N-gram: ('the',), Probability: 0.04309039618096174
N-gram: ('cat',), Probability: 2.1718949687984748e-05
N-gram: ('sat',), Probability: 6.228830853912608e-05
N-gram: ('on',), Probability: 0.005241643121868168
N-gram: ('the',), Probability: 0.04309039618096174
N-gram: ('mat',), Probability: 1.0927773427916854e-06
Final sentence probability: 1.4388166724576847e-20
Tokens with padding for n=1: ['cat', 'the', 'the', 'on', 'mat', 'sat']
N-gram: ('cat',), Probability: 2.1718949687984748e-05
N-gram: ('the',), Probability: 0.04309039618096174
N-gram: ('the',), Probability: 0.04309039618096174
N-gram: ('on',), Probability: 0.005241643121868168
N-gram: ('mat',), Probability: 1.0927773427916854e-06
N-gram: ('sat',), Probability: 6.228830853912608e-05
Final sentence probability: 1.438816672457685e-20
------------------------

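Based on the output, both Laplace smoothing and Stupid Backoff are able to recognize which sentence is "correct": we compute the probability of each sentence, and both methods assign a higher probability to the correctly ordered sentence than to the shuffled one. The exception is the 1-gram model, which ignores word order: both sentences contain exactly the same words, so their unigram probabilities are equal (the two values printed above differ only by floating-point rounding).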

## Use the newly built model to generate the next words for a given word sequence.

In [20]:
# Pull out the unigram, bigram and trigram models for the hand-computed example below.
unigram_model, bigram_model, trigram_model = (ngram_models[order] for order in (1, 2, 3))

ngram_counts1, n_minus_1_counts1, vocab_size1 = unigram_model
ngram_counts2, n_minus_1_counts2, vocab_size2 = bigram_model
ngram_counts3, n_minus_1_counts3, vocab_size3 = trigram_model

Below, the first value is the `laplace` estimate and the second value is the `stupid backoff` (relative-frequency) estimate of `P(sleep | the cat)`.

The `laplace` value is much smaller than the `stupid backoff` value because add-one smoothing adds `vocab_size` to the denominator, and `vocab_size` is very large compared with the count of the context `the cat`.

To predict the next word, we take the candidate word that gets the highest score when it is appended to the current sequence.

In [21]:
# Look the counts up with .get(): the tables are defaultdicts, so indexing a
# missing key would insert it into the model as a side effect.
trigram_count = ngram_counts3.get(('the', 'cat', 'sleep'), 0)
bigram_count = ngram_counts2.get(('the', 'cat'), 0)

# Laplace (add-one) estimate.
print((trigram_count + 1) / (bigram_count + vocab_size3))
# Plain relative frequency, as used by Stupid Backoff (0.0 if the context was never seen).
print(trigram_count / bigram_count if bigram_count else 0.0)

1.224432322564451e-05
0.037037037037037035


Here I let the model predict the next word based on the current sequence of preceding words

In [22]:
# Context whose continuation we want to predict.
sequence = "Today is a beautiful day. In the house the cat is"

# Rank next-word candidates with both smoothing methods.
predicted_words = predict_next_word(
    sequence,
    ngram_models,
    alpha=0.4,
    top_k=5,
)



Input sequence: 'Today is a beautiful day. In the house the cat is'
Tokens: ['Today', 'is', 'a', 'beautiful', 'day.', 'In', 'the', 'house', 'the', 'cat', 'is']

Searching for 5-grams with prefix: ('house', 'the', 'cat', 'is')
No 5-grams found for prefix ('house', 'the', 'cat', 'is'), backing off...

Searching for 4-grams with prefix: ('the', 'cat', 'is')
Match found: ('the', 'cat', 'is', 'hearing'), Count: 1, Laplace Score: 0.000012, Backoff Score: 0.166667
Match found: ('the', 'cat', 'is', 'either'), Count: 1, Laplace Score: 0.000012, Backoff Score: 0.166667
Match found: ('the', 'cat', 'is', 'equal'), Count: 1, Laplace Score: 0.000012, Backoff Score: 0.166667
Match found: ('the', 'cat', 'is', 'a'), Count: 1, Laplace Score: 0.000012, Backoff Score: 0.166667
Match found: ('the', 'cat', 'is', 'alive'), Count: 1, Laplace Score: 0.000012, Backoff Score: 0.166667
Match found: ('the', 'cat', 'is', 'out'), Count: 1, Laplace Score: 0.000012, Backoff Score: 0.166667

Searching for 3-grams with

In [23]:
# (result key, heading shown to the reader) for each smoothing method.
for result_key, heading in (("Laplace", "Laplace Smoothing"), ("Stupid Backoff", "Stupid Backoff")):
    print(f"\nTop predictions for '{sequence}' ({heading}):")
    for rank, (word, score) in enumerate(predicted_words[result_key], start=1):
        print(f"{rank}. {word} - Score: {score:.6f}")


Top predictions for 'Today is a beautiful day. In the house the cat is' (Laplace Smoothing):
1. a - Score: 0.039304
2. the - Score: 0.031762
3. that - Score: 0.022913
4. not - Score: 0.013615
5. to - Score: 0.009494

Top predictions for 'Today is a beautiful day. In the house the cat is' (Stupid Backoff):
1. a - Score: 0.408981
2. out - Score: 0.300837
3. either - Score: 0.233659
4. equal - Score: 0.233659
5. alive - Score: 0.233486


## Combine with a function that calculates the distance between words to predict the correct word for a misspelled word position. (from difflib import get_close_matches)

In [24]:
# Cleaned sentences of the corpus (one string per sentence).
tokenized_sentences = load_data("tedtalk.txt")

# Unique whitespace-separated tokens; punctuation attached to a word stays part of the token.
vocabulary = {
    word
    for sentence in tokenized_sentences
    for word in sentence.split()
}

print(f"Vocabulary Size: {len(vocabulary)}")  # Debug: Print vocabulary size
print(f"Sample Words: {list(vocabulary)[:20]}")  # Debug: Show some words


Vocabulary Size: 163312
Sample Words: ['disembark', 'Gestapo.', 'China', 'Waters?', 'SOS', 'deformities,', 'bonanza.', 'helpless.', 'marble?', 'Lepage', 'nomads', 'Blip', 'Film', 'arrived.', 'unexploited,', 'Reykjavik', 'lubrication', 'datagrabbing', 'Jonathans', 'breathe']


In [25]:
# First example: the misspelled word follows the context "Today is such a".
misspelled_word = "beautfall"
context = "Today is such a"
corrected_word = correct_misspelled_word(
    misspelled_word,
    vocabulary,
    ngram_models,
    context,
    alpha=0.4,
    top_k=5,
)


Tokens with padding for n=5: ['<s>', '<s>', '<s>', '<s>', 'Today', 'is', 'such', 'a', 'befall', '</s>', '</s>', '</s>', '</s>']
N-gram: ('<s>', '<s>', '<s>', '<s>', 'Today'), Probability: 0.00026492550821269074
N-gram: ('<s>', '<s>', '<s>', 'Today', 'is'), Probability: 3.0585903568763227e-05
N-gram: ('<s>', '<s>', 'Today', 'is', 'such'), Probability: 6.123023794070464e-06
N-gram: ('<s>', 'Today', 'is', 'such', 'a'), Probability: 6.123173763425059e-06
N-gram: ('Today', 'is', 'such', 'a', 'befall'), Probability: 6.123173763425059e-06
N-gram: ('is', 'such', 'a', 'befall', '</s>'), Probability: 6.123173763425059e-06
N-gram: ('such', 'a', 'befall', '</s>', '</s>'), Probability: 6.123173763425059e-06
N-gram: ('a', 'befall', '</s>', '</s>', '</s>'), Probability: 6.123173763425059e-06
N-gram: ('befall', '</s>', '</s>', '</s>', '</s>'), Probability: 6.123173763425059e-06
Final sentence probability: 2.6149930701220953e-45
Tokens with padding for max_n=5: ['<s>', '<s>', '<s>', '<s>', 'Today', 'is

In [26]:
# Show the input next to the best suggestion of each smoothing method.
print(
    f"Misspelled Word: {misspelled_word}",
    f"Corrected Word: {corrected_word}",
    sep="\n",
)

Misspelled Word: beautfall
Corrected Word: {'Laplace': 'beautifully.', 'Stupid Backoff': 'beautiful'}


In [27]:
# Second example: a longer context in front of the misspelled word.
misspelled_word = "healp"
context = "thank you very much for your"
corrected_word = correct_misspelled_word(
    misspelled_word,
    vocabulary,
    ngram_models,
    context,
    alpha=0.4,
    top_k=5,
)


Tokens with padding for n=5: ['<s>', '<s>', '<s>', '<s>', 'thank', 'you', 'very', 'much', 'for', 'your', 'help', '</s>', '</s>', '</s>', '</s>']
N-gram: ('<s>', '<s>', '<s>', '<s>', 'thank'), Probability: 1.6455000510105016e-06
N-gram: ('<s>', '<s>', '<s>', 'thank', 'you'), Probability: 6.123173763425059e-06
N-gram: ('<s>', '<s>', 'thank', 'you', 'very'), Probability: 6.123173763425059e-06
N-gram: ('<s>', 'thank', 'you', 'very', 'much'), Probability: 6.123173763425059e-06
N-gram: ('thank', 'you', 'very', 'much', 'for'), Probability: 8.57155102215746e-05
N-gram: ('you', 'very', 'much', 'for', 'your'), Probability: 8.57055402509948e-05
N-gram: ('very', 'much', 'for', 'your', 'help'), Probability: 6.122573930080206e-06
N-gram: ('much', 'for', 'your', 'help', '</s>'), Probability: 6.123173763425059e-06
N-gram: ('for', 'your', 'help', '</s>', '</s>'), Probability: 6.123173763425059e-06
N-gram: ('your', 'help', '</s>', '</s>', '</s>'), Probability: 6.123173763425059e-06
N-gram: ('help', '</s

In [28]:
# Show the input next to the best suggestion of each smoothing method.
print(
    f"Misspelled Word: {misspelled_word}",
    f"Corrected Word: {corrected_word}",
    sep="\n",
)

Misspelled Word: healp
Corrected Word: {'Laplace': 'help?', 'Stupid Backoff': 'help?'}
