In [2]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import defaultdict, Counter
import math

# Download required resources
nltk.download('punkt')
# Download the 'punkt_tab' data package
nltk.download('punkt_tab') # This line was added to download the missing data

# Sample training text (can be replaced with a file or corpus)
text = """
Natural language processing is a subfield of linguistics computer science and artificial intelligence.
It is concerned with the interactions between computers and human language.
In particular, how to program computers to process and analyze large amounts of natural language data.
"""

# Tokenize the text
tokens = word_tokenize(text.lower())

# Function to build n-gram model with Laplace smoothing
def build_ngram_model(tokens, n):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    padded_tokens = ['<s>'] * (n - 1) + tokens + ['</s>']

    for i in range(len(padded_tokens) - n + 1):
        context = tuple(padded_tokens[i:i + n - 1])
        word = padded_tokens[i + n - 1]
        model[context][word] += 1

    # Apply Laplace smoothing
    for context in model:
        total_count = sum(model[context].values()) + len(model[context])
        for word in model[context]:
            model[context][word] = (model[context][word] + 1) / total_count

    return model

# Function to compute probability of a sequence
def compute_sequence_probability(model, sequence, n):
    padded_seq = ['<s>'] * (n - 1) + word_tokenize(sequence.lower()) + ['</s>']
    prob = 1.0

    for i in range(len(padded_seq) - n + 1):
        context = tuple(padded_seq[i:i + n - 1])
        word = padded_seq[i + n - 1]
        word_prob = model[context].get(word, 1 / (sum(model[context].values()) + 1))
        prob *= word_prob

    return prob

# Build unigram, bigram, trigram models
unigram_model = build_ngram_model(tokens, 1)
bigram_model = build_ngram_model(tokens, 2)
trigram_model = build_ngram_model(tokens, 3)

# Test sentence
test_sentence = "natural language processing"

# Compute probabilities
print(f"\nUnigram Prob: {compute_sequence_probability(unigram_model, test_sentence, 1)}")
print(f"Bigram Prob: {compute_sequence_probability(bigram_model, test_sentence, 2)}")
print(f"Trigram Prob: {compute_sequence_probability(trigram_model, test_sentence, 3)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Unigram Prob: 1.2323466028222122e-06
Bigram Prob: 0.16666666666666666
Trigram Prob: 0.25
