In [1]:
import requests
import re
from collections import defaultdict

# Fetch books from Project Gutenberg
def get_book_text():
    # Use Pride and Prejudice as example
    url = "https://www.gutenberg.org/cache/epub/1342/pg1342.txt"
    try:
        response = requests.get(url)
        text = response.text

        # Extract main content
        start = text.find("Chapter 1")
        end = text.find("End of the Project Gutenberg")

        if start != -1 and end != -1:
            text = text[start:end]

        # Clean the text
        text = re.sub(r'[^\w\s.,;!?\']', '', text)  # Keep basic punctuation
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        return text.lower()  # Convert to lowercase

    except:
        # Fallback text if fetch fails
        return """it is a truth universally acknowledged that a single man in
        possession of a good fortune must be in want of a wife however little
        known the feelings or views of such a man may be on his first entering
        a neighbourhood this truth is so well fixed in the minds of the
        surrounding families that he is considered as the rightful property of
        some one or other of their daughters my dear mr bennet said his lady
        to him one day have you heard that netherfield park is let at last
        mr bennet replied that he had not but it is returned she for mrs long
        has just been here and she told me all about it mr bennet made no
        answer do not you want to know who has taken it cried his wife
        impatiently you want to tell me and i have no objection to hearing it
        this was invitation enough"""

# Build 5-gram model
def build_5gram_model(text):
    words = text.split()

    # Create 5-grams
    ngrams = defaultdict(list)

    # For each position, create context (4 words) -> next word
    for i in range(len(words) - 4):
        context = tuple(words[i:i+4])  # First 4 words
        next_word = words[i+4]         # 5th word

        # Store that this word can follow this context
        ngrams[context].append(next_word)

    print(f"Total words: {len(words)}")
    print(f"Unique 5-gram contexts: {len(ngrams)}")

    return ngrams

# Generate text using 5-grams
def generate_text(model, start_words, length=30):
    # Convert starting words to list
    current = start_words.lower().split()

    # Ensure we have at least 4 words for context
    while len(current) < 4:
        # Find any context that starts with our words
        for context in model.keys():
            if context[0] == current[-1] if current else True:
                # Add words from this context
                for word in context:
                    if word not in current:
                        current.append(word)
                        if len(current) >= 4:
                            break
                break

    # Take last 4 words as current context
    current_context = tuple(current[-4:])

    # Generate words
    for _ in range(length):
        # Check if we know what comes after this context
        if current_context in model and model[current_context]:
            # Pick a random word that can follow this context
            next_word = model[current_context][0]  # Use first one for simplicity

            # In a better version, you'd pick based on frequency
            # next_word = random.choice(model[current_context])

            current.append(next_word)
            # Update context: remove first word, add new word
            current_context = tuple(current[-4:])
        else:
            # If we don't know this context, try shorter context
            # Use last 3 words as new context
            shorter_context = tuple(current[-3:])

            # Look for contexts that end with our shorter context
            possible_next = []
            for context in model.keys():
                if context[1:] == shorter_context and len(context) == 4:
                    possible_next.extend(model[context])

            if possible_next:
                next_word = possible_next[0]
                current.append(next_word)
                current_context = tuple(current[-4:])
            else:
                # If still no match, use last word as context
                last_word = current[-1]
                # Find any context starting with this word
                for context in model.keys():
                    if context[0] == last_word:
                        # Add the next word from this context
                        current.append(context[1])
                        current_context = tuple(current[-4:])
                        break
                else:
                    # If nothing works, add a common word
                    current.append("the")
                    current_context = tuple(current[-4:])

    # Join words back into text
    result = " ".join(current)

    # Basic capitalization
    result = result.capitalize()
    result = re.sub(r' i ', ' I ', result)
    result = re.sub(r' mr ', ' Mr. ', result)
    result = re.sub(r' mrs ', ' Mrs. ', result)

    # Add periods
    sentences = result.split()
    for i in range(len(sentences)):
        if i > 0 and i % 8 == 0 and i < len(sentences) - 1:
            sentences[i] = sentences[i] + "."

    return " ".join(sentences) + "."

# Main program
print("=== Simple 5-Gram Language Model ===\n")

# Get training text
print("Loading training text...")
text = get_book_text()

# Build model
print("Building 5-gram model...")
model = build_5gram_model(text)

# Show some examples
print("\nModel Examples:")
print("-" * 50)

# Example 1
seed1 = "it is a truth"
print(f"\nSeed: '{seed1}'")
generated1 = generate_text(model, seed1, 25)
print(f"Generated: {generated1}")

# Example 2
seed2 = "he was a man"
print(f"\nSeed: '{seed2}'")
generated2 = generate_text(model, seed2, 20)
print(f"Generated: {generated2}")

# Example 3
seed3 = "she could not help"
print(f"\nSeed: '{seed3}'")
generated3 = generate_text(model, seed3, 25)
print(f"Generated: {generated3}")

# Show what the model learned
print("\n" + "=" * 50)
print("What the model learned (sample 5-grams):")
print("=" * 50)

# Show first 10 contexts and what follows
sample_contexts = list(model.keys())[:10]
for i, context in enumerate(sample_contexts):
    words = list(context)
    next_words = model[context][:3]  # Show first 3 possible next words

    print(f"\nContext {i+1}: {' '.join(words)}")
    print(f"  Can be followed by: {', '.join(next_words)}")

# Test with some specific contexts
print("\n" + "=" * 50)
print("Testing specific predictions:")
print("=" * 50)

test_contexts = [
    ("it", "is", "a", "truth"),
    ("he", "was", "a", "man"),
    ("she", "could", "not", "help")
]

for context in test_contexts:
    if context in model:
        predictions = model[context][:5]  # Get first 5 predictions
        print(f"\nAfter '{' '.join(context)}', the model predicts:")
        print(f"  {', '.join(predictions)}")
    else:
        print(f"\nContext '{' '.join(context)}' not found in model")

# Generate a longer example
print("\n" + "=" * 50)
print("Longer generated example:")
print("=" * 50)

long_example = generate_text(model, "in a country town", 50)
print(f"\n{long_example}")

=== Simple 5-Gram Language Model ===

Loading training text...
Building 5-gram model...
Total words: 130343
Unique 5-gram contexts: 126880

Model Examples:
--------------------------------------------------

Seed: 'it is a truth'
Generated: It is a truth universally acknowledged, that a single. man in possession of a good fortune must. be in want of a wife. however little. known the feelings or.

Seed: 'he was a man'
Generated: He was a man in the early eighteenth century,. of course, could push this taste further than. a lady in the early nineteenth; and.

Seed: 'she could not help'
Generated: She could not help asking him whether he intended. to accept mr. bingleys invitation, and if he. did, whether he would think it proper to. join in the evenings.

What the model learned (sample 5-grams):

Context 1: the project gutenberg ebook
  Can be followed by: of, pride, pride

Context 2: project gutenberg ebook of
  Can be followed by: pride

Context 3: gutenberg ebook of pride
  Can be fol