In [2]:
import numpy as np
from collections import defaultdict, Counter

class NgramModel:
    """Count-based n-gram model over lower-cased, whitespace-separated tokens.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, 3 = trigram, ...).
        ngrams: Maps a context tuple of ``n - 1`` tokens to a ``Counter`` of
            the tokens observed immediately after that context.
        vocab: Set of every token seen during training.
    """

    def __init__(self, n):
        """Create an empty model of order ``n``.

        Args:
            n: Number of tokens per n-gram; must be at least 1.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        # n < 1 would make the slices in train() use negative bounds and
        # silently record meaningless contexts, so reject it up front.
        if n < 1:
            raise ValueError("n must be a positive integer")
        self.n = n
        self.ngrams = defaultdict(Counter)
        self.vocab = set()

    def train(self, text):
        """Add the n-gram counts of ``text`` to the model.

        The text is lower-cased and split on whitespace. Calling this method
        several times accumulates counts; n-grams never span two calls.

        Args:
            text: Raw training text.
        """
        tokens = text.lower().split()
        self.vocab.update(tokens)

        context_len = self.n - 1
        # Each window of n tokens contributes one (context -> target) count.
        for start in range(len(tokens) - self.n + 1):
            context = tuple(tokens[start:start + context_len])
            target = tokens[start + context_len]
            self.ngrams[context][target] += 1

    def predict_next(self, context):
        """Return the relative frequencies of tokens following ``context``.

        Only the last ``n - 1`` tokens of ``context`` are used; for a unigram
        model the context is ignored entirely.

        Args:
            context: Whitespace-separated context string.

        Returns:
            Dict mapping each candidate next token to its relative frequency,
            or an empty dict when the context was never seen in training.
        """
        context_len = self.n - 1
        tokens = context.lower().split()
        # tokens[-0:] is the whole list, not an empty one, so the unigram
        # case must be mapped to the empty context explicitly.
        key = tuple(tokens[-context_len:]) if context_len else ()

        # .get() does not trigger the defaultdict factory, so unseen contexts
        # are not inserted into the table by a mere lookup.
        candidates = self.ngrams.get(key)
        if not candidates:
            return {}
        total = sum(candidates.values())
        return {word: count / total for word, count in candidates.items()}

# Demo: fit a trigram model on a small toy corpus and inspect one prediction.
# The corpus is a single string, split across lines only for readability.
text = (
    "the cat sat on the mat while the dog sat on the floor "
    "the cat jumped over the fence and the dog chased after the cat "
    "but the cat was too quick and climbed up the tree "
    "while the dog barked at the bottom of the tree "
    "the cat looked down at the dog with amusement"
)
model = NgramModel(n=3)  # trigram: two context words predict the third
model.train(text)

# Show the distribution over words observed after the two-word context.
context = "the cat"
probabilities = model.predict_next(context)
print(f"After '{context}', probabilities:")
for word in probabilities:
    prob = probabilities[word]
    print(f"{word}: {prob:.2f}")

After 'the cat', probabilities:
sat: 0.20
jumped: 0.20
but: 0.20
was: 0.20
looked: 0.20


In [5]:
import random
from collections import defaultdict, Counter

class BigramModel:
    """Count-based bigram model over lower-cased, whitespace-separated tokens.

    Attributes:
        bigrams: Maps a token to a ``Counter`` of the tokens observed
            immediately after it.
        vocab: Set of every token seen during training.
    """

    def __init__(self):
        """Create an empty model."""
        self.bigrams = defaultdict(Counter)
        self.vocab = set()

    def train(self, text):
        """Add the bigram counts of ``text`` to the model.

        The text is lower-cased and split on whitespace. Repeated calls
        accumulate counts; bigrams never span two calls.

        Args:
            text: Raw training text.
        """
        tokens = text.lower().split()
        self.vocab.update(tokens)

        # Pair every token with its successor.
        for context, target in zip(tokens, tokens[1:]):
            self.bigrams[context][target] += 1

    def predict_next(self, context):
        """Return the relative frequencies of tokens following ``context``.

        The context is normalized the same way as the training text
        (lower-cased, split on whitespace) and only its last token is used,
        so ``"The"`` and ``"on the"`` behave like ``"the"``.

        Args:
            context: Context string; may hold one or more tokens.

        Returns:
            Dict mapping each candidate next token to its relative frequency,
            or an empty dict when the context is empty or was never seen.
        """
        tokens = context.lower().split()
        if not tokens:
            return {}
        # .get() does not trigger the defaultdict factory, so a lookup never
        # inserts an unseen token into the table.
        candidates = self.bigrams.get(tokens[-1])
        if not candidates:
            return {}
        total = sum(candidates.values())
        return {word: count / total for word, count in candidates.items()}

    def sentence_probability(self, sentence):
        """Return the product of bigram relative frequencies along ``sentence``.

        No smoothing is applied: a single unseen bigram makes the result 0.0.
        The first token is not scored, so a sentence with fewer than two
        tokens yields 1.0.

        Args:
            sentence: Whitespace-separated sentence.

        Returns:
            The probability as a float.
        """
        tokens = sentence.lower().split()
        prob = 1.0
        for context, target in zip(tokens, tokens[1:]):
            followers = self.bigrams.get(context)
            if not followers or target not in followers:
                # The product can never recover from a zero factor.
                return 0.0
            prob *= followers[target] / sum(followers.values())
        return prob

# Generate synthetic training data by sampling words independently at random.
# A dedicated, seeded generator keeps the corpus (and therefore every
# probability printed below) identical across kernel restarts without
# touching the state of the global `random` module.
SEED = 42
N_TRAINING_TOKENS = 1000
# "the" is listed twice, so it is drawn twice as often as any other word.
words = ["the", "cat", "sat", "on", "the", "mat", "while", "dog", "jumped", "over", "fence", "chased", "after", "quick", "climbed", "up", "tree", "barked", "at", "bottom", "looked", "down", "with", "amusement"]
rng = random.Random(SEED)
training_text = " ".join(rng.choices(words, k=N_TRAINING_TOKENS))

# Train the bigram model
bigram_model = BigramModel()
bigram_model.train(training_text)

# Test sentences
test_sentences = ["the cat", "the dog", "the cat sat", "the dog barked"]

# Estimate probabilities for test sentences. The model is unsmoothed, so a
# sentence containing any bigram absent from the sampled corpus scores 0.
for sentence in test_sentences:
    prob = bigram_model.sentence_probability(sentence)
    print(f"Probability of '{sentence}': {prob:.6f}")

Probability of 'the cat': 0.053333
Probability of 'the dog': 0.026667
Probability of 'the cat sat': 0.004354
Probability of 'the dog barked': 0.000000
