**Assignment No. 7: Write a better auto-complete algorithm using an N-gram model (similar models are used for translation, determining the author of a text, and speech recognition).**

In [None]:
import re
from collections import defaultdict, Counter

In [None]:
# Print a fixed status message; this line does not itself check anything.
print('Imports complete.')
# Normalize raw text into a list of lowercase, letters-only word tokens
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased, then every character that is neither ``a-z``
    nor whitespace is deleted (deleted, not replaced by a space, so
    "don't" becomes "dont" and digits vanish). What remains is split on
    runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty if no letters remain.
    """
    letters_and_spaces = re.sub(r'[^a-z\s]', '', text.lower())
    words = letters_and_spaces.split()
    return words

Imports complete.


In [None]:
# Example usage: tokenize a short sentence containing capitals and punctuation
sample_text = "Hello, world! This is a test."
tokens = tokenize(sample_text)
# The result should be lowercase words with the punctuation stripped
print('Tokens:', tokens)
# Build an n-gram model (n is normally >= 2; n == 1 yields a unigram model)
def build_ngram_model(tokens, n):
    """Count how often each token follows every (n-1)-token context.

    Args:
        tokens: Sequence of tokens supporting ``len()`` and slicing, such
            as a list of words.
        n: Order of the model; must be >= 1. With ``n == 1`` every token
            is counted under the empty context ``()``.

    Returns:
        A ``defaultdict(Counter)`` mapping each context (a tuple of
        ``n - 1`` tokens) to a ``Counter`` of the tokens that followed it.
        The mapping is empty when ``tokens`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check the
            slice bounds below go negative and wrap around, silently
            producing meaningless contexts instead of failing.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    model = defaultdict(Counter)
    context_len = n - 1
    # One iteration per complete n-gram window in the token sequence.
    for start in range(len(tokens) - n + 1):
        split = start + context_len
        context = tuple(tokens[start:split])
        model[context][tokens[split]] += 1
    return model

Tokens: ['hello', 'world', 'this', 'is', 'a', 'test']


In [None]:
# Build a trigram (n=3) model for demonstration
corpus = "Once upon a time in a land far away there was a mysterious forest. The forest was filled with ancient trees and hidden secrets. Many travelers ventured into the forest seeking adventure and wisdom. Some never returned and their stories became legends."
tokens = tokenize(corpus)  # rebinds `tokens`, replacing the short example's token list
n = 3  # model order: each context is the preceding n-1 = 2 words
ngram_model = build_ngram_model(tokens, n)

print('Trigram model built. Sample context and counts:')
# Use the first n-1 corpus tokens (the context of the first trigram) as the sample
sample_context = tuple(tokens[0:n-1])
print(sample_context, ngram_model[sample_context])
# Predict next words given a context using add-one smoothing
def predict_next_words(model, context, n, top_k=5):
    """Rank candidate next words for ``context`` with add-one smoothing.

    Every word that occurs as a "next word" anywhere in ``model`` is a
    candidate. Its score is ``(count + 1) / (total + vocab_size)``, where
    ``count`` is how often it followed ``context``, ``total`` is the sum
    of all counts for ``context``, and ``vocab_size`` is the number of
    candidates.

    Candidates with equal scores are returned in alphabetical order.
    Ranking a ``set`` by score alone would leave ties in hash-iteration
    order, which can differ from one interpreter run to the next.

    Args:
        model: Mapping from context tuples to mappings of next-word counts.
        context: Sequence of exactly ``n - 1`` words.
        n: Order of the n-gram model.
        top_k: Maximum number of predictions to return.

    Returns:
        A list of ``(word, probability)`` tuples, highest probability
        first; empty if the model has no words at all.

    Raises:
        ValueError: If ``context`` does not contain exactly ``n - 1`` words.
    """
    if len(context) != n - 1:
        raise ValueError(f"Context must have {n-1} words")

    # .get() instead of indexing so an unseen context is not inserted
    # into the model when it is a defaultdict.
    observed = model.get(tuple(context), {})

    # Vocabulary: every word seen as a continuation of any context.
    vocab = set()
    for counts in model.values():
        vocab.update(counts)

    denominator = sum(observed.values()) + len(vocab)

    # The denominator is the same for every word, so ordering by count is
    # ordering by probability; the word itself breaks ties deterministically.
    ranked = sorted(vocab, key=lambda word: (-observed.get(word, 0), word))
    return [
        (word, (observed.get(word, 0) + 1) / denominator)
        for word in ranked[:top_k]
    ]

# Example prediction using context from the corpus
context = ["the", "forest"]  # must hold exactly n-1 words or predict_next_words raises ValueError
predictions = predict_next_words(ngram_model, context, n, top_k=5)
print('Auto-complete predictions for context', context)
# Each prediction is a (word, probability) pair; show probabilities to 3 decimals
for word, prob in predictions:
    print(f"{word}: {prob:.3f}")
# Function to simulate auto-completion
def auto_complete(model, context, n, max_words=5):
    """Greedily extend ``context`` word by word and return the joined text.

    On each step the last ``n - 1`` words are passed to
    ``predict_next_words`` and its single top prediction is appended.

    Args:
        model: N-gram model as accepted by ``predict_next_words``.
        context: Starting words (list, tuple or other iterable of
            strings). It is copied, never modified.
        n: Order of the n-gram model.
        max_words: Maximum number of words to append.

    Returns:
        The starting words followed by the predicted words, joined by
        single spaces.

    Raises:
        ValueError: Propagated from ``predict_next_words`` when fewer than
            ``n - 1`` words are available to form a context.
    """
    # list() copies the input and, unlike .copy(), also accepts tuples.
    completed = list(context)
    window = n - 1
    for _ in range(max_words):
        # completed[-0:] is the whole list, not an empty one, so the
        # zero-length context of a unigram model needs its own branch.
        recent = completed[-window:] if window > 0 else []
        predictions = predict_next_words(model, recent, n, top_k=1)
        if not predictions:
            break
        completed.append(predictions[0][0])
    return ' '.join(completed)

# Demo auto-completion starting from a given context
start_context = ["many", "travelers"]  # seed words; the corpus contains "Many travelers ventured ..."
# Ask for up to 10 additional words from the trigram model
completed_text = auto_complete(ngram_model, start_context, n, max_words=10)
print("Auto-completed text:", completed_text)


Trigram model built. Sample context and counts:
('once', 'upon') Counter({'a': 1})
Auto-complete predictions for context ['the', 'forest']
was: 0.059
seeking: 0.059
secrets: 0.029
their: 0.029
away: 0.029
Auto-completed text: many travelers ventured into the forest was filled with ancient trees and
