In [5]:
import nltk
from nltk.util import ngrams
from nltk import word_tokenize, FreqDist
from collections import defaultdict
import math

In [4]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [6]:
# You can change this text to any dataset you like
text = """Natural language processing enables computers to understand human language.
It is an essential field in artificial intelligence.
Language models help in predicting the next word in a sentence."""

tokens = word_tokenize(text.lower())
print("Tokenized Text:", tokens)

Tokenized Text: ['natural', 'language', 'processing', 'enables', 'computers', 'to', 'understand', 'human', 'language', '.', 'it', 'is', 'an', 'essential', 'field', 'in', 'artificial', 'intelligence', '.', 'language', 'models', 'help', 'in', 'predicting', 'the', 'next', 'word', 'in', 'a', 'sentence', '.']


In [7]:
def build_ngram_model(tokens, n):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    for i in range(len(tokens) - n + 1):
        context = tuple(tokens[i:i+n-1])
        word = tokens[i+n-1]
        model[context][word] += 1

    # Apply Laplace smoothing and normalize
    for context in model:
        total = sum(model[context].values()) + len(model[context])
        for word in model[context]:
            model[context][word] = (model[context][word] + 1) / total
    return model

In [8]:
unigram_model = build_ngram_model(tokens, 1)
bigram_model = build_ngram_model(tokens, 2)
trigram_model = build_ngram_model(tokens, 3)

In [9]:
def get_ngram_probability(model, context, word):
    context = tuple(context)
    if context in model and word in model[context]:
        return model[context][word]
    else:
        # Laplace smoothing for unseen word/context
        return 1 / (len(model[context]) + 1) if context in model else 1e-6

In [10]:
print("Testing Bigram Model:")
context = ["language"]
word = "models"
prob = get_ngram_probability(bigram_model, context, word)
print(f"P({word} | {' '.join(context)}) = {prob:.4f}")

print("\nTesting Trigram Model:")
context = ["in", "a"]
word = "sentence"
prob = get_ngram_probability(trigram_model, context, word)
print(f"P({word} | {' '.join(context)}) = {prob:.4f}")

Testing Bigram Model:
P(models | language) = 0.3333

Testing Trigram Model:
P(sentence | in a) = 1.0000


In [11]:
# Print sample bigram probabilities
print("\nSample Bigram Probabilities:")
for context in list(bigram_model.keys())[:5]:
    for word in bigram_model[context]:
        print(f"P({word} | {' '.join(context)}) = {bigram_model[context][word]:.4f}")


Sample Bigram Probabilities:
P(language | natural) = 1.0000
P(processing | language) = 0.3333
P(. | language) = 0.3333
P(models | language) = 0.3333
P(enables | processing) = 1.0000
P(computers | enables) = 1.0000
P(to | computers) = 1.0000
