# Unsmoothed and Smoothed Bigram Model

In [1]:
from collections import defaultdict

## Defining Training Corpus

In [2]:
# Toy training corpus. Every sentence is already wrapped in the <s> / </s>
# boundary markers, so sentence-initial and sentence-final bigrams are
# counted like any other bigram once the text is split on whitespace.
training_corpus = [
    "<s> He read a book </s>",
    "<s> I read a different book </s>",
    "<s> He read a book by Danielle </s>",
]

## Tokenize Training Corpus

In [3]:
# Whitespace-tokenize every training sentence; the result is one list of
# tokens per sentence, in corpus order.
tokenized_corpus = list(map(str.split, training_corpus))

## Unsmoothed Bigram Model

In [4]:
# Raw bigram counts: unsmoothed_bigram_model[previous][following] is the
# number of times `following` came directly after `previous` in training.
unsmoothed_bigram_model = defaultdict(lambda: defaultdict(int))
for tokens in tokenized_corpus:
    # Pairing the token list with itself shifted by one yields each
    # adjacent (previous, following) pair exactly once.
    for previous, following in zip(tokens, tokens[1:]):
        unsmoothed_bigram_model[previous][following] += 1

In [5]:
# Maximum-likelihood bigram probabilities:
#     P(following | previous) = count(previous, following) / count(previous, *)
def _to_relative_frequencies(follower_counts):
    """Return a dict mapping each follower to its share of the row total.

    The row total is computed once per context word instead of once per
    follower; the resulting values are unchanged.
    """
    context_total = sum(follower_counts.values())
    return {
        follower: count / context_total
        for follower, count in follower_counts.items()
    }


unsmoothed_bigram_probabilities = {
    context: _to_relative_frequencies(follower_counts)
    for context, follower_counts in unsmoothed_bigram_model.items()
}

In [6]:
# Test sentence to be scored. It carries the same <s> / </s> boundary
# markers as the training sentences, so its first and last bigrams are
# comparable with the bigrams counted during training.
sentence = "<s> I read a book by Danielle </s>"

In [7]:
# Split on whitespace, the same way the training corpus was tokenized.
tokenized_sentence = sentence.split()

In [8]:
# Running product for the unsmoothed sentence probability; starts at the
# multiplicative identity and is multiplied by one bigram probability per
# adjacent word pair of the test sentence.
unsmoothed_probability = 1.0

In [9]:
# Sentence probability under the unsmoothed model: the product of
# P(next_word | current_word) over every adjacent pair of tokens.
# The accumulator is reset here so that re-running this cell on its own
# cannot compound a previous result.
unsmoothed_probability = 1.0
for current_word, next_word in zip(tokenized_sentence, tokenized_sentence[1:]):
    # A word that never appeared as a bigram context in training has no row
    # in the table. Without smoothing such a bigram has probability 0, the
    # same as any other unseen bigram, so fall back to an empty row instead
    # of raising KeyError.
    followers = unsmoothed_bigram_probabilities.get(current_word, {})
    unsmoothed_probability *= followers.get(next_word, 0)

## Smoothed Bigram Model

In [10]:
# Laplace (add-one) smoothing needs the vocabulary size V: the number of
# distinct tokens that occur anywhere in the tokenized training corpus.
vocabulary = set()
for tokens in tokenized_corpus:
    vocabulary.update(tokens)
V = len(vocabulary)

In [11]:
def _new_probability_row():
    """Create an empty follower -> probability row whose default is 0.0."""
    return defaultdict(float)


# Two-level table: smoothed_bigram_model[previous][following] -> probability.
smoothed_bigram_model = defaultdict(_new_probability_row)

In [12]:
# Add-one estimate for every bigram observed in training:
#     P(following | previous) = (count(previous, following) + 1)
#                               / (count(previous, *) + V)
for context, follower_counts in unsmoothed_bigram_model.items():
    denominator = sum(follower_counts.values()) + V
    for follower, count in follower_counts.items():
        smoothed_bigram_model[context][follower] = (count + 1) / denominator

In [13]:
# Running product for the Laplace-smoothed sentence probability; starts at
# the multiplicative identity and is multiplied by one smoothed bigram
# probability per adjacent word pair of the test sentence.
smoothed_probability = 1.0

In [14]:
# Sentence probability under the Laplace-smoothed model: the product of
# smoothed P(next_word | current_word) over every adjacent pair of tokens.
# The accumulator is reset here so that re-running this cell on its own
# cannot compound a previous result.
smoothed_probability = 1.0
for current_word, next_word in zip(tokenized_sentence, tokenized_sentence[1:]):
    # .get() is used instead of indexing so that looking up an unknown word
    # does not make the defaultdict insert an empty row as a side effect.
    seen_followers = smoothed_bigram_model.get(current_word, {})
    if next_word in seen_followers:
        bigram_probability = seen_followers[next_word]
    else:
        # The table only stores bigrams observed in training. An unseen
        # bigram has count 0, so add-one smoothing assigns it
        # (0 + 1) / (count(current_word, *) + V) -- never 0, which is the
        # whole point of smoothing.
        context_total = sum(unsmoothed_bigram_model.get(current_word, {}).values())
        bigram_probability = 1 / (context_total + V)
    smoothed_probability *= bigram_probability

## Displaying the Probability

In [15]:
# Report the probability of the test sentence under each model.
for label, value in (
    ("Unsmoothed Bigram Model Probability:", unsmoothed_probability),
    ("Smoothed Bigram Model Probability:", smoothed_probability),
):
    print(label, value)

Unsmoothed Bigram Model Probability: 0.07407407407407407
Smoothed Bigram Model Probability: 1.0101357919757919e-05
