## Loading Corpus from NLTK Data - Brown Corpus

In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [19]:
from nltk.corpus import brown

In [20]:
brown.raw()[:1000]

"\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np A

In [21]:
brown.tagged_words()[:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

## Probability Calculations

In [22]:
tagged_corpus = brown.tagged_words(tagset="universal")
tagged_corpus

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [33]:
from collections import defaultdict, deque

In [82]:
START_SYMBOL = '*'
STOP_SYMBOL = 'STOP'

def prob_calc(tagged_corpus):
    transition_prob = {}
    emission_prob = {}
    
    tag_trigram_counts = defaultdict(lambda: 0)
    tag_bigram_counts = defaultdict(lambda: 0)
    tag_unigram_counts = defaultdict(lambda: 0)
    word_tag_counts = defaultdict(lambda: 0)
    
    for i in range(len(tagged_corpus)):
        
        unigram_key = tagged_corpus[i][1]
        tag_unigram_counts[unigram_key] += 1
        
        if i < 1:
            bigram = (START_SYMBOL,tagged_corpus[i][1])
            bigram_key = bigram
            tag_bigram_counts[bigram_key] += 1
            
            trigram = (START_SYMBOL,START_SYMBOL,tagged_corpus[i][1])
            trigram_key = trigram
            tag_trigram_counts[trigram_key] += 1
            
            bigram = (START_SYMBOL,START_SYMBOL)
            bigram_key = bigram
            tag_bigram_counts[bigram_key] += 1
            continue
            
        bigram = (tagged_corpus[i-1][1],tagged_corpus[i][1])
        bigram_key = bigram
        tag_bigram_counts[bigram_key] += 1
        
        word_tag = (tagged_corpus[i-1][0].lower(),tagged_corpus[i][0].lower(),tagged_corpus[i][1])
        word_tag = (tagged_corpus[i][0].lower(),tagged_corpus[i][1])
        word_tag_key = word_tag
        word_tag_counts[word_tag_key] += 1
        
        if i < 2:
            trigram = (START_SYMBOL,tagged_corpus[i-1][1],tagged_corpus[i][1])
            trigram_key = trigram
            tag_trigram_counts[trigram_key] += 1
            continue
            
        trigram = (tagged_corpus[i-2][1],tagged_corpus[i-1][1],tagged_corpus[i][1])
        trigram_key = trigram
        tag_trigram_counts[trigram_key] += 1
    
    tag_bigram_counts[(START_SYMBOL,START_SYMBOL)] = 1
    
    for tri in tag_trigram_counts:
        transition_key = (tri[0],tri[1],tri[2])
#         print(tag_bigram_counts[(tri[0],tri[1])],(tri[0],tri[1]))
        transition_prob[transition_key] = tag_trigram_counts[tri]/(tag_bigram_counts[(tri[0],tri[1])])
    
    for word_tag_pair in word_tag_counts:
        word_tag = word_tag_pair
        emission_key = word_tag
#         print(word_tag[-1],tag_unigram_counts[word_tag[-1]])
        emission_prob[emission_key] = word_tag_counts[word_tag_pair]/tag_unigram_counts[word_tag[-1]]
        
    return transition_prob, emission_prob, tag_unigram_counts, tag_bigram_counts, tag_trigram_counts

In [83]:
transition_prob, emission_prob, tag_unigram_counts, tag_bigram_counts, tag_trigram_counts = prob_calc(tagged_corpus)

In [84]:
emission_prob

{('elliott', 'NOUN'): 1.8145000326610006e-05,
 ('scotland', 'NOUN'): 4.7177000849186016e-05,
 ('defects', 'NOUN'): 4.7177000849186016e-05,
 ('pooling', 'VERB'): 1.094391244870041e-05,
 ('lighthearted', 'ADJ'): 2.3888868981498072e-05,
 ('sarcastically', 'ADV'): 1.778125500097797e-05,
 ('golf', 'NOUN'): 0.00012338600222094805,
 ("thom's", 'NOUN'): 3.6290000653220013e-06,
 ('multi-colored', 'ADJ'): 1.1944434490749036e-05,
 ('prohibiting', 'VERB'): 2.188782489740082e-05,
 ('baltimore', 'NOUN'): 7.983800143708403e-05,
 ('raised', 'VERB'): 0.0005526675786593707,
 ('deployed', 'VERB'): 1.6415868673050617e-05,
 ('unready', 'ADJ'): 1.1944434490749036e-05,
 ('laps', 'VERB'): 5.471956224350205e-06,
 ('splits', 'NOUN'): 3.6290000653220013e-06,
 ('larry', 'NOUN'): 3.266100058789801e-05,
 ('5,014', 'NUM'): 6.72314105149926e-05,
 ('squat', 'NOUN'): 1.4516000261288005e-05,
 ('sextet', 'NOUN'): 1.4516000261288005e-05,
 ('rebuilds', 'VERB'): 5.471956224350205e-06,
 ('nigh', 'ADV'): 1.778125500097797e-05

In [27]:
sent_tagged_corpus = brown.tagged_sents()

## Viterbi Algorithm 

In [28]:
tag_unigram_counts

defaultdict(<function __main__.prob_calc.<locals>.<lambda>>,
            {'.': 147565,
             'ADJ': 83721,
             'ADP': 144766,
             'ADV': 56239,
             'CONJ': 38151,
             'DET': 137019,
             'NOUN': 275558,
             'NUM': 14874,
             'PRON': 49334,
             'PRT': 29829,
             'VERB': 182750,
             'X': 1386})

In [87]:
START_SYMBOL = '*'
STOP_SYMBOL = 'STOP'
LOG_PROB_OF_ZERO = -1000

tagset = list(tag_unigram_counts.keys())

def S(k):
    if k == -1 or k == 0:
        return START_SYMBOL
    
    else:
        return tagset    

def viterbi(sentence,transition_prob,emission_prob):
    pi = defaultdict(float)
    bp = {}
    
    pi[(0,START_SYMBOL,STOP_SYMBOL)] = 0.0
    pi[(0,START_SYMBOL,START_SYMBOL)] = 1.0
    
    n = len(sentence)
    sent_words = sentence
    
    for k in range(1,n+1):
        for u in S(k-1):
            for v in S(k):
                max_score = float('-inf')
                best_tag = None
                
                for w in S(k - 2):
                    if  emission_prob.get((sent_words[k-1], v), 0) != 0:
                        score = pi.get((k-1, w, u), LOG_PROB_OF_ZERO) + transition_prob.get((w, u, v), LOG_PROB_OF_ZERO) + emission_prob[(sent_words[k-1],v)]
                        
                        if score > max_score:
                            max_score = score
                            best_tag = w
                    
                pi[(k,u,v)] = max_score
                bp[(k,u,v)] = best_tag
                
#     return pi,bp

    max_score = float('-Inf')
    u_max, v_max = None, None
    for u in S(n-1):
        for v in S(n):
            score = pi.get((n, u, v), LOG_PROB_OF_ZERO) + transition_prob.get((u, v, STOP_SYMBOL), LOG_PROB_OF_ZERO)
            if score > max_score:
                max_score = score
                u_max = u
                v_max = v

    tags = deque()
    tags.append(v_max)
    tags.append(u_max)
    tagged = []

    for i, k in enumerate(range(n-2, 0, -1)):
        tags.append(bp[(k+2, tags[i+1], tags[i])])
    tags.reverse()

    tagged_sentence = deque()
    for j in range(0, n):
        tagged_sentence.append(sentence[j] + '/' + tags[j])
    tagged_sentence.append('\n')
    tagged.append(' '.join(tagged_sentence))

    return tagged

In [88]:
test = ['wow','this','actually','works','.']

In [89]:
viterbi(test,transition_prob,emission_prob)

['wow/PRT this/ADV actually/ADV works/VERB ./. \n']

## Tops Words Per Tag

In [37]:
def split_postags(emission_prob):
    top = defaultdict(lambda: {})
    
    for key in emission_prob:
        top[key[-1]][key[0]] = emission_prob[key]
        
    return top

In [38]:
split = split_postags(emission_prob)

In [39]:
def top50(split):
    for postag in split:
        temp = split[postag]
        s = [(k, temp[k]) for k in sorted(temp, key=temp.get, reverse=True)]
#         print(temp)
#         print(s)

        top50keys = []
        for pair in s:
            top50keys.append(pair[0])

        print("For POS Tag,\t",postag,'\n the top words are:',top50keys[:50],'\nSorted from most frequent to least frequent')
        print('\n')
#         break
    
    return

In [40]:
top50(split)

For POS Tag,	 NOUN 
 the top words are: ['time', 'man', 'af', 'years', 'way', 'people', 'mr.', 'world', 'state', 'men', 'life', 'day', 'year', 'states', 'work', 'house', 'one', 'home', 'mrs.', 'part', 'place', 'school', 'number', 'course', 'war', 'something', 'fact', 'water', 'hand', 'government', 'system', 'nothing', 'night', 'head', 'eyes', 'city', 'business', 'program', 'group', 'days', 'room', 'president', 'side', 'end', 'point', 'things', 'john', 'use', 'case', 'order'] 
Sorted from most frequent to least frequent


For POS Tag,	 . 
 the top words are: [',', '.', '``', "''", ';', '?', '--', ')', '(', ':', '!', "'", '[', ']'] 
Sorted from most frequent to least frequent


For POS Tag,	 NUM 
 the top words are: ['one', 'two', 'three', '1', '2', 'four', 'five', '3', 'six', '4', 'million', '1960', 'hundred', 'ten', '10', '5', '1961', '6', 'seven', '15', '30', '8', 'eight', '1959', 'thousand', '12', '20', '1958', '7', '25', 'nine', 'twenty', '100', 'fifty', '9', 'billion', '11', '50', 

## Code Evaluation

In [92]:
from tqdm import tqdm_notebook as tqdm

In [41]:
validation = brown.tagged_sents(tagset='universal')

In [93]:
sent_preds = []

for sent in tqdm(brown.sents()):
    s = []
    for i in sent:
        s.append(i.lower())
        
    pred = viterbi(s,transition_prob,emission_prob)
#     print(pred)
    pred = pred[0].strip().split(' ')
    new_pred = []
    
    for i in pred:
        new_pred.append(i.split('/'))
        
    sent_preds.append(new_pred)


  2%|▏         | 1416/57340 [00:43<13:24, 69.50it/s][A





Exception ignored in: <bound method tqdm.__del__ of   2%|▏         | 1416/57340 [15:46<13:24, 69.50it/s]>
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 889, in __del__
    self.close()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 1095, in close
    self._decr_instances(self)
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 454, in _decr_instances
    cls.monitor.exit()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_monitor.py", line 52, in exit
    self.join()
  File "/usr/lib/python3.5/threading.py", line 1051, in join
    raise RuntimeError("cannot join current thread")
RuntimeError: cannot join current thread


In [64]:
len(validation)

57340

In [97]:
len(sent_preds)

57340

In [94]:
cor = 0
incor = 0
total = 0

for i in range(len(validation)):
    if (len(validation[i]) == len(sent_preds[i])):
        s1 = validation[i]
        s2 = sent_preds[i]
        
        for j in range(len(s1)):
            if s1[j][1] == s2[j][1]:
                cor += 1
            else:
                incor += 1
#                 print('word: ',s1[j][0],' pred_tag: ',s2[j][1],' act_tag: ',s1[j][1])

In [95]:
acc = cor/(cor+incor)

In [96]:
acc

0.9134165581574796