<a href="https://colab.research.google.com/github/maheenunzeelah/21k4177and21k4176sNLPProject/blob/main/n_gram_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import spacy
import random

# Load spaCy's small English pipeline once at module level; tokenize() below
# reuses this shared `nlp` object on every call instead of reloading it.
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Return the lowercase word tokens of ``text``.

    The text is run through the module-level spaCy pipeline; tokens that
    spaCy flags as punctuation or whitespace are dropped.
    """
    words = []
    for tok in nlp(text):
        if tok.is_punct or tok.is_space:
            continue
        words.append(tok.text.lower())
    return words

def train_ngram_model(text, n):
    """Count the n-grams that occur in ``text``.

    The text is tokenized with ``tokenize`` and a window of ``n`` tokens is
    slid across the result. Each window is split into a context (its first
    ``n - 1`` tokens, stored as a tuple) and the word that follows it.

    Args:
        text: Raw text to train on.
        n: Order of the model (1 = unigram, 2 = bigram, 3 = trigram, ...).
            Must be at least 1.

    Returns:
        A ``(ngrams, context_counts)`` pair of plain dicts.
        ``ngrams[context][next_word]`` is how many times ``next_word``
        followed ``context``; ``context_counts[context]`` is how many times
        ``context`` was followed by any word. Both are empty when the text
        yields fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    # Reject non-positive orders up front: with n == 0 the index
    # ``i + n - 1`` becomes -1 and silently wraps to the last token.
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}.")

    tokens = tokenize(text)
    ngrams = {}
    context_counts = {}

    for i in range(len(tokens) - n + 1):
        context = tuple(tokens[i:i + n - 1])
        next_word = tokens[i + n - 1]

        followers = ngrams.setdefault(context, {})
        followers[next_word] = followers.get(next_word, 0) + 1
        context_counts[context] = context_counts.get(context, 0) + 1

    return ngrams, context_counts

def predict_next(ngrams, context_counts, context):
    """
    Return the next-word probability distribution for ``context``.

    ``context`` is an iterable of words; it is lowercased and turned into a
    tuple before being looked up in ``ngrams``. The result maps every word
    seen after that context to its count divided by the context's total
    count. An unknown context (or one with no recorded followers) yields
    an empty dict.
    """
    key = tuple(w.lower() for w in context)
    followers = ngrams.get(key)
    if not followers:
        return {}

    denominator = float(context_counts[key])
    distribution = {}
    for candidate, seen in followers.items():
        distribution[candidate] = seen / denominator
    return distribution

def generate_text(ngrams, context_counts, seed, n, length=10):
    """
    Generate text by repeatedly sampling a next word from the model.

    Args:
        ngrams: Mapping of context tuple -> {next_word: count}.
        context_counts: Mapping of context tuple -> total count.
        seed: Starting words; lowercased before use. Must contain exactly
            ``n - 1`` words (an empty seed for a unigram model).
        n: Order of the model the counts were built with.
        length: Maximum number of words to append to the seed.

    Returns:
        The seed followed by the generated words, joined by single spaces.
        Generation stops early when the current context has no recorded
        followers.

    Raises:
        ValueError: If ``seed`` does not contain exactly ``n - 1`` words.
    """
    seed = [word.lower() for word in seed]  # Ensure seed is in lowercase

    if len(seed) != n - 1:
        raise ValueError(f"Seed must have {n-1} words.")

    context_size = n - 1
    output = seed.copy()
    for _ in range(length):
        # Slice from an explicit start index: ``output[-context_size:]``
        # would be ``output[-0:]`` (the whole list) for a unigram model,
        # which breaks the lookup after the first generated word.
        context = tuple(output[len(output) - context_size:])
        probs = predict_next(ngrams, context_counts, context)
        if not probs:
            break  # no predictions available
        candidates = list(probs.keys())
        weights = list(probs.values())
        next_word = random.choices(candidates, weights=weights)[0]
        output.append(next_word)
    return ' '.join(output)

# Small demo corpus used to train the model.
sample_text = (
    "Natural language processing with spaCy is both powerful and efficient. "
    "Language models can be built using n-grams to predict the next word in a sentence. "
    "The n-gram language model is a simple yet effective approach."
)

# Order of the n-gram model (3 = trigram: two context words predict the third).
n = 3
ngrams, context_counts = train_ngram_model(sample_text, n)

# The seed supplies the initial context, so it must contain n - 1 words.
seed = ['Natural', 'language']

# Show the probability of each word that followed the seed in the corpus.
predictions = predict_next(ngrams, context_counts, seed)
print("Predicted probabilities for next word:", predictions)

# Sample up to 15 additional words, starting from the seed.
generated = generate_text(ngrams, context_counts, seed, n, length=15)
print("\nGenerated text:\n", generated)


Predicted probabilities for next word: {'processing': 1.0}

Generated text:
 natural language processing with spacy is both powerful and efficient language models can be built using n
