Importing libraries

In [25]:
from nltk.corpus import treebank
import nltk
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from collections import defaultdict

# nltk.download('treebank')
# nltk.download('universal_tagset')

Preparing data

In [26]:
# Collect every tagged sentence of the Treebank sample, mapped to the
# universal tagset.
all_sents = []
for file in tqdm(treebank.fileids()):
    all_sents.extend(treebank.tagged_sents(file, tagset='universal'))

# Wrap each sentence in explicit boundary tokens so the tag bigrams include
# "START -> first tag" and "last tag -> END". New lists are built rather than
# mutating the objects handed back by the corpus reader.
all_sents = [
    [("###", "START")] + list(sent) + [("&&&", "END")]
    for sent in all_sents
]

# 80% of the sentences go to training; the rest is cut, without shuffling,
# into 75% test / 25% validation. `random_state` makes the shuffle independent
# of whatever else has consumed the global NumPy RNG; the global seed is still
# set for any later code that relies on it.
np.random.seed(42)
train_data, data = train_test_split(
    all_sents, train_size=0.8, shuffle=True, random_state=42
)
test_data, valid_data = train_test_split(data, train_size=0.75, shuffle=False)

# Tag inventory and vocabulary are taken from the training split only.
# They are sorted because set iteration order over strings changes between
# interpreter sessions (hash randomisation), and the position of a tag in
# `all_tags` decides how ties are broken when decoding.
all_tags = sorted({tag for sent in train_data for (_, tag) in sent})
all_words = sorted({word for sent in train_data for (word, _) in sent})

100%|██████████| 199/199 [00:04<00:00, 46.23it/s]


Calculate emission probabilities

In [28]:
# Pseudo-count given to every (tag, word) pair so that no training word has a
# zero emission probability under any tag.
epsilon = 0.0001

# Template row of smoothed counts: one entry per lower-cased training word.
base_word_counts = {word.lower(): epsilon for word in all_words}

# emission_freq[tag][word] = smoothed count of `word` being tagged `tag`.
# The template is bound as a lambda default (evaluated once, here) instead of
# being looked up by name when the factory fires; a by-name lookup would pick
# up whatever a later cell has rebound that global to.
emission_freq = defaultdict(lambda template=base_word_counts: template.copy())
for sent in train_data:
    for (word, tag) in sent:
        emission_freq[tag][word.lower()] += 1

# emission_prob[tag][word] = P(word | tag), normalised per tag.
# Words that never occurred in training fall back to the flat value `epsilon`
# for every tag. The fallback is likewise bound now, so reassigning `epsilon`
# later in the notebook does not silently change it.
emission_prob = defaultdict(
    lambda fallback=epsilon: defaultdict(lambda: fallback)
)
for tag, word_counts in emission_freq.items():
    count = sum(word_counts.values())
    for word, freq in word_counts.items():
        emission_prob[tag][word] = freq / count

Calculate transition probabilities

In [29]:
# Each row of the transition table starts from a pseudo-count of `epsilon`
# for every possible successor tag.
default_dict = dict.fromkeys(all_tags, epsilon)
transition_freq = defaultdict(lambda: default_dict.copy())
for tagged_sentence in train_data:
    tag_sequence = [tag for (_, tag) in tagged_sentence]
    # Pair every tag with the tag that immediately follows it.
    for prev_tag, next_tag in zip(tag_sequence, tag_sequence[1:]):
        transition_freq[prev_tag][next_tag] += 1


# transition_prob[prev_tag][next_tag] = P(next_tag | prev_tag), obtained by
# normalising each row of smoothed counts by its total.
transition_prob = defaultdict(dict)
for prev_tag, successor_counts in transition_freq.items():
    row_total = sum(successor_counts.values())
    for next_tag, pair_count in successor_counts.items():
        transition_prob[prev_tag][next_tag] = pair_count / row_total


Prepare test dataset

In [30]:
# Parallel lists, one entry per test sentence:
#   test_sents      - lower-cased words
#   test_true_tags  - gold tags
#   test_pred_tags  - filled in by the decoding cell
test_sents = []
test_true_tags = []
test_pred_tags = []

for sent in test_data:
    # Drop the START/END boundary tokens by slicing rather than `del`, so that
    # `test_data` itself is left untouched and re-running this cell cannot
    # strip real first/last words from the sentences.
    inner_tokens = sent[1:-1]
    test_sents.append([word.lower() for (word, _) in inner_tokens])
    test_true_tags.append([tag for (_, tag) in inner_tokens])

Viterbi algorithm

In [31]:
def viterbi_decode(words, tags, log_start, log_transition, emission_prob):
    """Return the most likely tag sequence for `words` under a bigram HMM.

    Scores are accumulated as sums of log-probabilities instead of products of
    probabilities: a product of many values below 1 underflows to 0.0 on long
    sentences, after which every path looks equally (im)probable.

    Parameters
    ----------
    words : list of str
        Tokens of one sentence.
    tags : list of str
        Tag inventory; row/column order of the log matrices.
    log_start : array of shape (len(tags),)
        log P(tag | START) for each tag.
    log_transition : array of shape (len(tags), len(tags))
        log P(next_tag | prev_tag), indexed [prev, next].
    emission_prob : mapping
        emission_prob[tag][word] -> P(word | tag).

    Returns
    -------
    list of str
        One tag per word; empty list for an empty sentence.
    """
    num_words = len(words)
    if num_words == 0:
        return []
    num_tags = len(tags)

    def log_emission(word):
        # log P(word | tag) for every tag, in `tags` order.
        with np.errstate(divide='ignore'):  # log(0) -> -inf is intended
            return np.log(
                np.array([emission_prob[tag][word] for tag in tags], dtype=float)
            )

    # score[s, t]: best log-probability of any tag path ending in tag s at word t.
    # backpointer[s, t]: index of the previous tag on that best path.
    score = np.empty((num_tags, num_words))
    backpointer = np.zeros((num_tags, num_words), dtype=int)

    score[:, 0] = log_start + log_emission(words[0])
    for t in range(1, num_words):
        # candidates[prev, cur]: extend the best path ending in `prev` by `cur`.
        candidates = score[:, t - 1][:, np.newaxis] + log_transition
        backpointer[:, t] = np.argmax(candidates, axis=0)
        # The emission term does not depend on `prev`, so it is added after the max.
        score[:, t] = np.max(candidates, axis=0) + log_emission(words[t])

    # Follow the backpointers from the best final tag to the first word.
    best_idx = int(np.argmax(score[:, -1]))
    path = [best_idx]
    for t in range(num_words - 1, 0, -1):
        best_idx = int(backpointer[best_idx, t])
        path.append(best_idx)
    path.reverse()
    return [tags[idx] for idx in path]


# END is never followed by another tag in training, so it has no transition
# row yet; give it an all-zero one so every tag can be looked up as a
# predecessor.
transition_prob['END'] = {tag: 0 for tag in all_tags}
# Not used by the decoder itself; retained for backward compatibility because
# other cells may read the global `epsilon` after this point.
epsilon = 0.00001
num_of_tags = len(all_tags)

# The transition tables are the same for every sentence, so take logs once.
with np.errstate(divide='ignore'):  # zero-probability transitions become -inf
    log_start = np.log(
        np.array([transition_prob['START'][tag] for tag in all_tags], dtype=float)
    )
    log_transition = np.log(np.array(
        [[transition_prob[prev_tag][next_tag] for next_tag in all_tags]
         for prev_tag in all_tags],
        dtype=float,
    ))

# Start from an empty list so that re-running this cell does not append a
# second copy of the predictions.
test_pred_tags = []
for sent in tqdm(test_sents):
    test_pred_tags.append(
        viterbi_decode(sent, all_tags, log_start, log_transition, emission_prob)
    )


# Token-level accuracy over all test sentences.
correct_preds = 0
wrong_preds = 0
for true_tags, pred_tags in zip(test_true_tags, test_pred_tags):
    for true_tag, pred_tag in zip(true_tags, pred_tags):
        if true_tag == pred_tag:
            correct_preds += 1
        else:
            wrong_preds += 1
total_preds = correct_preds + wrong_preds
# Report NaN instead of raising ZeroDivisionError when there is nothing to score.
print('accuracy: ', correct_preds / total_preds if total_preds else float('nan'))

100%|██████████| 587/587 [00:11<00:00, 50.70it/s]

accuracy:  0.9338354652295305





In [39]:
# Show the gold tags and the predicted tags of the first test sentence,
# one list per line, for a side-by-side comparison.
print(test_true_tags[0])
print(test_pred_tags[0])

['ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'NOUN', '.', 'NOUN', 'VERB', '.', 'NUM', 'NUM', 'X', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'DET', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'NOUN', '.', 'CONJ', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'VERB', 'VERB', 'X', 'ADP', '.', 'NUM', 'NUM', 'X', 'ADP', 'ADJ', 'NUM', 'ADP', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', '.']
['ADP', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', '.', 'PRON', 'VERB', '.', 'NUM', 'NUM', 'X', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'DET', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'NOUN', '.', 'CONJ', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'DET', 'ADJ', 'ADJ', 'NOUN', 'NOUN', 'VERB', 'VERB', 'X', 'ADP', '.', 'NUM', 'NUM', 'X', 'ADP', 'ADJ', 'NUM', 'ADP', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', '.']


In [60]:
# Show the (lower-cased) words of the first test sentence.
print(test_sents[0])

['for', 'the', 'agency', 'for', 'international', 'development', ',', 'appropriators', 'approved', '$', '200', 'million', '*u*', 'in', 'secondary', 'loan', 'guarantees', 'under', 'an', 'expanded', 'trade', 'credit', 'insurance', 'program', ',', 'and', 'total', 'loan', 'guarantees', 'for', 'the', 'overseas', 'private', 'investment', 'corp.', 'are', 'increased', '*-3', 'by', '$', '40', 'million', '*u*', 'over', 'fiscal', '1989', 'as', 'part', 'of', 'the', 'same', 'poland', 'package', '.']


In [67]:
# Check whether three words of the first test sentence occur in the training
# vocabulary, and look up the smoothed NOUN counts of two of them.
# NOTE(review): `all_words` stores words with their original casing while the
# test words are lower-cased, so a False here may reflect a casing mismatch
# rather than a truly unseen word - confirm.
print('appropriators' in all_words)
print('overseas' in all_words)
print('private' in all_words)
# A value equal to the smoothing constant means the word was never tagged NOUN
# in the training data.
print(emission_freq['NOUN']['overseas'])
print(emission_freq['NOUN']['private'])

False
True
True
0.0001
0.0001
