In [2]:
import nltk
from nltk.corpus import treebank
from nltk import FreqDist
from collections import defaultdict

nltk.download('treebank')
nltk.download('universal_tagset')

# Load the Treebank corpus with POS-tagged sentences
train_data = treebank.tagged_sents()

# Initialize dictionaries to store counts for transitions (tag[i] -> tag[i+1]) and emissions (word -> tag)
transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))

# Initialize dictionaries to store probabilities for transitions and emissions
transition_probs = defaultdict(lambda: defaultdict(float))
emission_probs = defaultdict(lambda: defaultdict(float))

# Count transitions and emissions in the training data
for sentence in train_data:
    # Get the POS tags and words in the sentence
    tags = [tag for _, tag in sentence]  # List of POS tags in the sentence
    words = [word for word, _ in sentence]  # List of words in the sentence

    # Count transitions: tag[i] -> tag[i+1] (POS tag transitions)
    for i in range(len(tags) - 1):
        transition_counts[tags[i]][tags[i + 1]] += 1  # Increment transition count from tag[i] to tag[i+1]

    # Count emissions: word -> tag (word to POS tag associations)
    for word, tag in zip(words, tags):
        emission_counts[tag][word] += 1  # Increment emission count for word and its associated tag

# Calculate transition probabilities using Add-One smoothing
for tag1 in transition_counts:
    total_tag1 = sum(transition_counts[tag1].values())  # Total transitions from tag1
    for tag2 in transition_counts[tag1]:
        # Apply Add-One smoothing to compute transition probability P(tag2 | tag1)
        transition_probs[tag1][tag2] = (transition_counts[tag1][tag2] + 1) / (total_tag1 + len(transition_counts))

# Calculate emission probabilities using Add-One smoothing
for tag in emission_counts:
    total_tag = sum(emission_counts[tag].values())  # Total words for each tag
    for word in emission_counts[tag]:
        # Apply Add-One smoothing to compute emission probability P(word | tag)
        emission_probs[tag][word] = (emission_counts[tag][word] + 1) / (total_tag + len(emission_counts))

# Display transition probabilities
print("Transition Probabilities (Add-One Smoothing):")
for tag1 in transition_probs:
    for tag2 in transition_probs[tag1]:
        # Print the calculated transition probabilities for each POS tag pair
        print(f"P({tag2} | {tag1}) = {transition_probs[tag1][tag2]:.6f}")


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Transition Probabilities (Add-One Smoothing):
P(NNP | NNP) = 0.380781
P(, | NNP) = 0.152609
P(CD | NNP) = 0.020214
P(VBZ | NNP) = 0.036618
P(VBG | NNP) = 0.000847
P(NN | NNP) = 0.055244
P(WDT | NNP) = 0.000635
P(NNS | NNP) = 0.022648
P(IN | NNP) = 0.043497
P(CC | NNP) = 0.038417
P(POS | NNP) = 0.050058
P(. | NNP) = 0.050376
P(VBD | NNP) = 0.064557
P(MD | NNP) = 0.010266
P(TO | NNP) = 0.004233
P(VBP | NNP) = 0.003916
P(: | NNP) = 0.007197
P(RB | NNP) = 0.007726
P(JJ | NNP) = 0.008890
P('' | NNP) = 0.003387
P(-NONE- | NNP) = 0.005715
P(NNPS | NNP) = 0.017251
P(DT | NNP) = 0.002328
P(JJR | NNP) = 0.000212
P(VBN | NNP) = 0.000847
P(-RRB- | NNP) = 0.003387
P(-LRB- | NNP) = 0.002752
P($ | NNP) = 0.000317
P(WP | NNP) = 0.000741
P(PRP | NNP) = 0.000635
P(VB | NNP) = 0.000952
P(`` | NNP) = 0.000847
P(RP | NNP) = 0.000212
P(SYM | NNP) = 0.000212
P(WRB | NNP) = 0.000317
P(CD | ,) = 0.023529
P(MD | ,) = 0.011156
P(DT | ,) = 0.134077
P(VBD | ,) = 0.054564
P(NNS | ,) = 0.026369
P(NN | ,) = 0.048276
