In [3]:
import nltk
from nltk.util import bigrams
from nltk import FreqDist
from collections import defaultdict

# Fetch the NLTK data packages (a no-op when they are already installed).
# NOTE(review): newer NLTK releases (3.8.2+) reportedly load tokenizer data from
# 'punkt_tab' rather than 'punkt' -- confirm against the installed version and
# add that download if word_tokenize raises LookupError.
nltk.download('punkt')
# NOTE(review): 'stopwords' is not used anywhere in this cell; kept in case
# other cells of the notebook depend on it.
nltk.download('stopwords')

# Example sentence that serves as the (tiny) training corpus
sentence = "The quick brown fox jumps over the lazy dog."

# Lowercase before tokenizing so "The" and "the" are counted as one word.
# The trailing "." becomes its own token and is treated like any other word.
words = nltk.word_tokenize(sentence.lower())

# Adjacent token pairs and how often each pair occurs
bigrams_list = list(bigrams(words))
bigram_freq = FreqDist(bigrams_list)

# Vocabulary (set of unique tokens) and its size V, used in the smoothing denominator
vocabulary = set(words)
V = len(vocabulary)

# Unigram counts: c(w1) in the smoothing formula
word_freq = FreqDist(words)


def add_one_bigram_probability(w1, w2, bigram_counts, context_counts, vocab_size):
    """Return the add-one (Laplace) smoothed estimate of P(w2 | w1).

    Computes (c(w1, w2) + 1) / (c(w1) + vocab_size). A bigram or context word
    missing from the count mappings is counted as 0, so unseen bigrams get a
    small non-zero probability instead of 0.

    Args:
        w1: The context (first) word.
        w2: The predicted (second) word.
        bigram_counts: Mapping from (w1, w2) tuples to observed counts.
        context_counts: Mapping from words to observed counts.
        vocab_size: Number of distinct words in the vocabulary.

    Returns:
        The smoothed conditional probability as a float.

    Raises:
        ValueError: If c(w1) + vocab_size is not positive (e.g. empty vocabulary).
    """
    denominator = context_counts.get(w1, 0) + vocab_size
    if denominator <= 0:
        raise ValueError("vocabulary is empty; cannot compute a smoothed probability")
    return (bigram_counts.get((w1, w2), 0) + 1) / denominator


# Smoothed probabilities for every bigram observed in the sentence.
# A plain dict is used on purpose: a defaultdict(float) would answer 0.0 for an
# unseen bigram (and insert it), which is precisely what smoothing must avoid.
# For bigrams not in this table, call add_one_bigram_probability directly.
smoothed_probabilities = {
    (w1, w2): add_one_bigram_probability(w1, w2, bigram_freq, word_freq, V)
    for (w1, w2) in bigram_freq
}

# Print the smoothed probabilities for each observed bigram
print("Bigram probabilities with Add-One Smoothing:")
for bigram, smoothed_prob in smoothed_probabilities.items():
    print(f"P({bigram[1]} | {bigram[0]}) = {smoothed_prob:.6f}")


Bigram probabilities with Add-One Smoothing:
P(quick | the) = 0.181818
P(brown | quick) = 0.200000
P(fox | brown) = 0.200000
P(jumps | fox) = 0.200000
P(over | jumps) = 0.200000
P(the | over) = 0.200000
P(lazy | the) = 0.181818
P(dog | lazy) = 0.200000
P(. | dog) = 0.200000


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
