In [4]:
# Alternate implementation of the same bigram model

# Training corpus: every sentence is a token list wrapped in <s> ... </s>
data = [
    ["<s>", "I", "love", "NLP", "</s>"],
    ["<s>", "I", "love", "deep", "learning", "</s>"],
    ["<s>", "deep", "learning", "is", "fun", "</s>"]
]

# 1. Count unigrams and bigrams
uni = {}  # token -> number of occurrences across the corpus
bi = {}   # (token, next_token) -> number of occurrences across the corpus

for sentence in data:
    # Each token contributes one unigram occurrence.
    for word in sentence:
        uni.setdefault(word, 0)
        uni[word] += 1

    # Pairing the sentence with itself shifted by one yields adjacent pairs.
    for bigram in zip(sentence, sentence[1:]):
        bi.setdefault(bigram, 0)
        bi[bigram] += 1

# Print both count tables in the same "key : count" layout, each followed
# by a blank line.
for heading, table in (("Unigram Counts:", uni), ("Bigram Counts:", bi)):
    print(heading)
    for key, count in table.items():
        print(key, ":", count)
    print()

# 2. Compute MLE bigram probabilities:
#    P(w2 | w1) = count(w1, w2) / count(w1)
prob = {
    bigram: count / uni[bigram[0]]
    for bigram, count in bi.items()
}

print("Bigram Probabilities:")
for bigram, value in prob.items():
    # Rounded for display only; the stored probabilities keep full precision.
    print(bigram, ":", round(value, 3))
print()

# 3. Sentence probability function
def sentence_p(sent, probs=None):
    """Return the bigram-model probability of a tokenized sentence.

    The result is the product of the probabilities of every adjacent
    token pair in ``sent``.

    Args:
        sent: Sequence of tokens that supports slicing, e.g.
            ``["<s>", "I", "love", "NLP", "</s>"]``.
        probs: Optional mapping from ``(w1, w2)`` tuples to probabilities.
            When omitted, the module-level ``prob`` table is used.

    Returns:
        The product as a float. ``0.0`` is returned as soon as a pair is
        missing from the table (there is no smoothing). A sentence with
        fewer than two tokens has no pairs, so the result is ``1.0``.
    """
    if probs is None:
        probs = prob  # fall back to the table built at module level

    p = 1.0
    # zip over the sentence and its one-step shift yields adjacent pairs.
    for pair in zip(sent, sent[1:]):
        if pair not in probs:
            return 0.0
        p *= probs[pair]
    return p

# 4. Evaluate both sentences
s1 = ["<s>", "I", "love", "NLP", "</s>"]
s2 = ["<s>", "I", "love", "deep", "learning", "</s>"]

p_s1 = sentence_p(s1)
p_s2 = sentence_p(s2)

print("Sentence Probabilities:")
print("P(S1) =", p_s1)
print("P(S2) =", p_s2)
print()

# Report which sentence scores higher. A tie is reported as a tie instead of
# being attributed to S2 with a "higher probability" message.
if p_s1 > p_s2:
    print("Model prefers S1 because it has higher probability.")
elif p_s2 > p_s1:
    print("Model prefers S2 because it has higher probability.")
else:
    print("Model has no preference: both sentences have equal probability.")

Unigram Counts:
<s> : 3
I : 2
love : 2
NLP : 1
</s> : 3
deep : 2
learning : 2
is : 1
fun : 1

Bigram Counts:
('<s>', 'I') : 2
('I', 'love') : 2
('love', 'NLP') : 1
('NLP', '</s>') : 1
('love', 'deep') : 1
('deep', 'learning') : 2
('learning', '</s>') : 1
('<s>', 'deep') : 1
('learning', 'is') : 1
('is', 'fun') : 1
('fun', '</s>') : 1

Bigram Probabilities:
('<s>', 'I') : 0.667
('I', 'love') : 1.0
('love', 'NLP') : 0.5
('NLP', '</s>') : 1.0
('love', 'deep') : 0.5
('deep', 'learning') : 1.0
('learning', '</s>') : 0.5
('<s>', 'deep') : 0.333
('learning', 'is') : 0.5
('is', 'fun') : 1.0
('fun', '</s>') : 1.0

Sentence Probabilities:
P(S1) = 0.3333333333333333
P(S2) = 0.16666666666666666

Model prefers S1 because it has higher probability.
