In [1]:
import numpy as np
import nltk
from nltk.corpus import treebank, brown
import pdb

In [2]:
corpus = brown.tagged_sents(tagset='universal')[:-100] 
test_corpus = brown.tagged_sents(tagset='universal')[-100:]

tag_dict = {}
word_dict = {}

for sent in corpus:
    for elem in sent:
        w = elem[0]
        tag = elem[1]

        if w not in word_dict:
            word_dict[w]= 0

        if tag not in tag_dict:
            tag_dict[tag] = 0

        word_dict[w] += 1
        tag_dict[tag] += 1

print(len(word_dict))
print(len(tag_dict))
        
test_data = brown.tagged_sents(tagset='universal')[-10:]

print(len(test_data))

k = 0.001

55907
12
10


In [3]:
# Building Start Dict

ind_tag = []
tag_ind = {}
start_matrix = {}

for tag in tag_dict:
    ind_tag += [tag]

for ind, tag in enumerate(ind_tag):
    tag_ind[tag] = ind

start_tag_counts = {}
total_sentences = 0

for sent in corpus:
    for elem in sent:
        start_tag_counts[elem[1]] = start_tag_counts.get(elem[1], 0) + 1
        break
    total_sentences += 1

for tag in tag_ind:
    start_matrix[tag] = start_tag_counts[tag] / total_sentences
start_matrix

{'.': 0.08901118099231307,
 'ADJ': 0.0343466107617051,
 'ADP': 0.12283368273934311,
 'ADV': 0.09117749825296995,
 'CONJ': 0.04916142557651992,
 'DET': 0.21339972047519218,
 'NOUN': 0.14129979035639412,
 'NUM': 0.016788958770090845,
 'PRON': 0.15971348707197763,
 'PRT': 0.03665269042627533,
 'VERB': 0.045090845562543676,
 'X': 0.0005241090146750524}

In [4]:
# Building Emission Matrix

tag_word_counts = {}
emission_matrix = {}

for sent in corpus:
    for elem in sent:
        tag_word_counts[elem[1]] = tag_word_counts.get(elem[1], {})
        tag_word_counts[elem[1]][elem[0]] = tag_word_counts[elem[1]].get(elem[0], 0)
        tag_word_counts[elem[1]][elem[0]] += 1

for tag in tag_dict:
    emission_matrix[tag] = {}
    for word in word_dict:
        emission_matrix[tag][word] = (tag_word_counts[tag].get(word, 0) + k) / (k * len(word_dict) + tag_dict[tag])
emission_matrix['NOUN']['tent']

7.2696303799849e-05

In [5]:
# Building Transition Matrix

tag_tag_count = {}
transition_matrix = {}

for sent in corpus:
    for i in range(len(sent) - 1):
        tag_tag_count[sent[i][1]] = tag_tag_count.get(sent[i][1], {})
        tag_tag_count[sent[i][1]][sent[i + 1][1]] = tag_tag_count[sent[i][1]].get(sent[i + 1][1], 0)
        tag_tag_count[sent[i][1]][sent[i + 1][1]] += 1

for tag1 in tag_dict:
    transition_matrix[tag1] = {}
    for tag2 in tag_dict:
        transition_matrix[tag1][tag2] = (tag_tag_count[tag1].get(tag2, 0) + k) / (k * len(tag_dict) + sum(tag_tag_count[tag1].values()))

sum(transition_matrix['.'].values())

1.0

In [None]:
# Viterbi Algorithm

def viterbi(sent):
    viterbi_dp = {}
    viterbi_dp[0] = {}
    backpointer = {}
    backpointer[0] = {}
    
    for tag in tag_dict:
        viterbi_dp[0][tag] = start_matrix[tag] * emission_matrix[tag].get(sent[0][0], k / float(k * len(word_dict) + tag_dict[tag]))
        backpointer[0][tag] = -1

    for i, elem in enumerate(sent[1:], 1):
        viterbi_dp[i] = {}
        backpointer[i] = {}
        for tag in tag_dict:
            viterbi_dp[i][tag] = emission_matrix[tag].get(elem[0], k / float(k * len(word_dict) + tag_dict[tag])) * (max(transition_matrix[tag_before][tag] * viterbi_dp[i - 1][tag_before] for tag_before in tag_dict))
            backpointer[i][tag] = max([[transition_matrix[tag_before][tag] * viterbi_dp[i - 1][tag_before], tag_before] for tag_before in tag_dict], key=lambda k: k[0])[1]
    path, i = [], len(sent) - 1
    while i != -1:
        path_elem = max(backpointer[i], key=lambda k: backpointer[i][k])
        pdb.set_trace()
        path += [path_elem]
        i -= 1
    path.reverse()
    print(path)
    return path

total = 0
total_correct = 0

for sent in test_corpus:
    pred = viterbi(sent)
    print(pred)
    total_correct = sum([x == y[1] for x, y in zip(pred, sent)])
    total += len(sent)

print(total_correct / total * 100)

> <ipython-input-9-57fd79795455>(23)viterbi()
-> path += [path_elem]
(Pdb) backpointer[48]
{'NUM': 'NOUN', 'NOUN': 'NOUN', 'ADP': 'NOUN', 'ADV': 'NOUN', '.': 'NOUN', 'PRON': 'NOUN', 'DET': 'NOUN', 'CONJ': 'NOUN', 'PRT': 'NOUN', 'ADJ': 'NOUN', 'VERB': 'NOUN', 'X': 'NOUN'}
(Pdb) viterbi_dp[48]
{'NUM': 9.904725719893204e-150, 'NOUN': 9.923302218817908e-150, 'ADP': 3.090798401487707e-149, 'ADV': 8.582428298276205e-150, '.': 1.7339057924548746e-141, 'PRON': 7.34638375812877e-150, 'DET': 2.071757897329409e-150, 'CONJ': 2.861708341942538e-149, 'PRT': 1.0931030354502221e-149, 'ADJ': 2.8184306720389313e-150, 'VERB': 1.5923347637040423e-149, 'X': 4.244276354143109e-150}
