# Demo - N-Gram Text Generator 
Source: Heimskringla by Snorre Sturlason,
https://www.gutenberg.org/cache/epub/598/pg598.txt

*Generated by M.H. Skjelvareid using ChatGPT for code snippet generation*

In [1]:
# Imports
import html
import random
import re
from collections import Counter, defaultdict

import ipywidgets as widgets
import requests
from IPython.display import HTML, clear_output, display


In [2]:
# Download the text from Project Gutenberg
url = "https://www.gutenberg.org/cache/epub/598/pg598.txt"
# A timeout keeps the notebook from hanging indefinitely on a stalled connection
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of tokenising an error page as the corpus
response.raise_for_status()
text = response.text

In [3]:
# Preprocessing: lowercase the corpus, then replace every character that is
# not a-z, 0-9, basic punctuation (, . ! ? ; : ' -) or whitespace with a space
text = re.sub(r"[^a-z0-9,.!?;:'\s-]", " ", text.lower())

# Whitespace tokenisation; punctuation stays attached to the adjacent word
tokens = text.split()

print("Total tokens:", len(tokens))


Total tokens: 308654


In [None]:
# Define N-gram model functions and user interface
def build_ngram_model(tokens, n=2):
    """Build an n-gram frequency model from a token list.

    Args:
        tokens: Sequence of tokens (words) in corpus order.
        n: Order of the model; each entry uses the ``n - 1`` preceding tokens
            as context. Must be at least 1.

    Returns:
        A ``defaultdict`` mapping each context (a tuple of ``n - 1`` tokens)
        to a ``Counter`` of the tokens that followed it and how often.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    model = defaultdict(Counter)  # context tuple -> Counter of following words

    # There are len(tokens) - n + 1 complete n-grams; the "+ 1" makes sure the
    # final n-gram of the corpus is counted as well.
    for i in range(len(tokens) - n + 1):
        context = tuple(tokens[i : i + n - 1])  # N-1 preceding words
        next_word = tokens[i + n - 1]  # Word that follows the context
        model[context][next_word] += 1  # Count occurrence for word in context

    return model


def choose_next_word(counter):
    """Sample one word from ``counter``, weighted by its count.

    ``counter`` maps candidate words to their observed frequencies; a word
    with a higher count is proportionally more likely to be returned.
    """
    # Transpose the (word, count) pairs into parallel sequences
    candidates, frequencies = zip(*counter.items())
    # random.choices returns a list; a single draw yields exactly one element
    (picked,) = random.choices(candidates, weights=frequencies)
    return picked


def generate_text(model, seed_text, n=2, length=100):
    """Generate text by repeatedly sampling the next word from an n-gram model.

    Args:
        model: Mapping from context tuples (``n - 1`` tokens) to a counter of
            following words, as produced by ``build_ngram_model``.
        seed_text: Start text; it is lowercased and split on whitespace, and
            its last ``n - 1`` tokens form the initial context.
        n: Order of the model. Must be at least 1.
        length: Maximum number of words to append to the seed.

    Returns:
        The seed tokens followed by the generated words, joined by single
        spaces. Generation stops early when the current context is not in the
        model. Returns an empty string (after printing a warning) when the
        seed has fewer than ``n - 1`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    context_size = n - 1
    seed_tokens = seed_text.lower().split()

    if len(seed_tokens) < context_size:
        print(f"⚠️ Please enter at least {n - 1} start words.")
        return ""

    output = list(seed_tokens)

    for _ in range(length):
        # Slice from an explicit start index: for n == 1 the context is empty,
        # and output[-0:] would wrongly return the whole list instead of [].
        current = tuple(output[len(output) - context_size :])
        if current not in model:
            break  # Unseen context: nothing to sample from
        output.append(choose_next_word(model[current]))

    return " ".join(output)


# Output area: status messages and the generated text are rendered here
output_area = widgets.Output()

# Button that triggers text generation
generate_button = widgets.Button(button_style="success", description="Generate")

# Free-text input for the seed (start) words
seed_widget = widgets.Text(
    description="Seed:",
    placeholder="Enter start text...",
    value="",
    layout=widgets.Layout(width="500px"),
    style={"description_width": "60px"},
)

# Dropdown for choosing the n-gram order (2, 3 or 4; defaults to 2)
n_widget = widgets.Dropdown(
    description="N-gram:",
    options=[2, 3, 4],
    value=2,
    style={"description_width": "60px"},
)


def on_generate_clicked(b):
    """Handle a click on the Generate button.

    Reads the n-gram order and seed text from the widgets, builds an n-gram
    model from the global ``tokens`` list, generates up to 60 words and shows
    the result inside ``output_area``. If the seed has fewer than ``n - 1``
    words, a warning is printed and nothing is generated.

    Args:
        b: Argument supplied by the ``on_click`` callback; unused.
    """
    with output_area:
        clear_output()
        n = n_widget.value
        seed = seed_widget.value

        if len(seed.split()) < n - 1:
            print(f"⚠️ Please enter at least {n - 1} words.")
            return

        print("✅ Building model...")
        model = build_ngram_model(tokens, n=n)

        generated = generate_text(model, seed, n=n, length=60)

        print("\nGenerated Text:\n")
        # The generated text starts with the free-form seed typed by the user,
        # so escape it before embedding it in HTML; otherwise characters such
        # as "<" or "&" would be interpreted as markup.
        safe_text = html.escape(generated)
        display(
            HTML(f"<div style='width:600px; white-space:normal;'>{safe_text}</div>")
        )


# Register the handler so that clicking the button calls on_generate_clicked
generate_button.on_click(on_generate_clicked)


In [5]:
# Display user interface: n-gram dropdown, seed input, generate button and
# the output area that the click handler writes into
display(n_widget, seed_widget, generate_button, output_area)


Dropdown(description='N-gram:', options=(2, 3, 4), style=DescriptionStyle(description_width='60px'), value=2)

Text(value='', description='Seed:', layout=Layout(width='500px'), placeholder='Enter start text...', style=Tex…

Button(button_style='success', description='Generate', style=ButtonStyle())

Output()