In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from collections import Counter

In [2]:
tagged_sentence = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
tagged_sentence[:2]

[[('Pierre', 'NOUN'),
  ('Vinken', 'NOUN'),
  (',', '.'),
  ('61', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  (',', '.'),
  ('will', 'VERB'),
  ('join', 'VERB'),
  ('the', 'DET'),
  ('board', 'NOUN'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
  ('director', 'NOUN'),
  ('Nov.', 'NOUN'),
  ('29', 'NUM'),
  ('.', '.')],
 [('Mr.', 'NOUN'),
  ('Vinken', 'NOUN'),
  ('is', 'VERB'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Elsevier', 'NOUN'),
  ('N.V.', 'NOUN'),
  (',', '.'),
  ('the', 'DET'),
  ('Dutch', 'NOUN'),
  ('publishing', 'VERB'),
  ('group', 'NOUN'),
  ('.', '.')]]

In [4]:
print("Total Number of Tagged sentences", len(tagged_sentence))

Total Number of Tagged sentences 3914


In [5]:
train_set,test_set =train_test_split(tagged_sentence,train_size=0.80,random_state=101)

In [6]:
train_set

[[('Drink', 'NOUN'),
  ('Carrier', 'NOUN'),
  ('Competes', 'VERB'),
  ('With', 'ADP'),
  ('Cartons', 'NOUN')],
 [('At', 'ADP'),
  ('last', 'ADJ'),
  ('count', 'NOUN'),
  (',', '.'),
  ('Candela', 'NOUN'),
  ('had', 'VERB'),
  ('sold', 'VERB'),
  ('$', '.'),
  ('4', 'NUM'),
  ('million', 'NUM'),
  ('*U*', 'X'),
  ('of', 'ADP'),
  ('its', 'PRON'),
  ('medical', 'ADJ'),
  ('devices', 'NOUN'),
  ('in', 'ADP'),
  ('Japan', 'NOUN'),
  ('.', '.')],
 [('Mrs.', 'NOUN'),
  ('Hills', 'NOUN'),
  ('lauded', 'VERB'),
  ('South', 'NOUN'),
  ('Korea', 'NOUN'),
  ('for', 'ADP'),
  ('*-1', 'X'),
  ('creating', 'VERB'),
  ('an', 'DET'),
  ('intellectual-property', 'ADJ'),
  ('task', 'NOUN'),
  ('force', 'NOUN'),
  ('and', 'CONJ'),
  ('special', 'ADJ'),
  ('enforcement', 'NOUN'),
  ('teams', 'NOUN'),
  ('of', 'ADP'),
  ('police', 'NOUN'),
  ('officers', 'NOUN'),
  ('and', 'CONJ'),
  ('prosecutors', 'NOUN'),
  ('trained', 'VERB'),
  ('*', 'X'),
  ('to', 'PRT'),
  ('pursue', 'VERB'),
  ('movie', 'NOUN'),
  

In [7]:
train_words=[tup for data in train_set for tup in data]
test_words=[tup for data in test_set for tup in data]

In [8]:
vocab={word for word,tag in train_words}
print("Vocabulary of the Corpus",len(vocab))
tags={tag for word,tag in train_words}
print("Number of Tags in the Corpus ",len(tags))

Vocabulary of the Corpus 11052
Number of Tags in the Corpus  12


In [9]:
print("Number of Sentences in Training Data ",len(train_set))
print("Number of Sentences in Testing Data ",len(test_set))

Number of Sentences in Training Data  3131
Number of Sentences in Testing Data  783


In [10]:
def emission_prob(word, tag, train_bag = train_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)

    
    return (count_w_given_tag, count_tag)

In [11]:
def transition_prob(t2, t1, train_bag = train_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [13]:
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = transition_prob(t2, t1)[0]/transition_prob(t2, t1)[1]

print(tags_matrix)

[[6.83371304e-03 5.01138950e-03 6.83371304e-03 3.69020514e-02
  4.19134386e-02 9.56719834e-03 4.84738052e-01 1.41230067e-02
  7.06150308e-02 8.83826911e-02 2.12756261e-01 2.23234631e-02]
 [6.03732169e-02 5.48847427e-04 4.06147093e-02 5.70801310e-02
  3.51262353e-02 1.23490669e-01 1.50384188e-01 4.39077942e-03
  1.13611415e-01 9.33040585e-03 3.49066973e-01 5.59824370e-02]
 [1.42806140e-03 1.42806144e-02 1.84219927e-01 3.57015361e-03
  1.19243130e-01 3.57015361e-03 2.07068902e-02 2.60621198e-02
  3.53445187e-02 2.02427700e-01 3.51660132e-01 3.74866128e-02]
 [1.20248254e-02 6.98215654e-03 2.98681147e-02 8.14584941e-02
  1.39255241e-01 7.13731572e-02 3.39022487e-01 1.47401085e-02
  1.30721495e-01 2.28859577e-02 3.21955010e-02 1.19472459e-01]
 [6.87694475e-02 6.00793920e-02 7.82104954e-02 5.25694676e-02
  9.23720598e-02 1.72191828e-01 8.96899477e-02 2.78940029e-03
  4.61323895e-02 2.56410260e-02 2.18538776e-01 9.29084867e-02]
 [3.30602261e-03 4.31220367e-04 2.28546783e-02 1.20741697e-02
  1

In [14]:
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,PRON,CONJ,NUM,ADV,.,DET,VERB,PRT,ADJ,X,NOUN,ADP
PRON,0.006834,0.005011,0.006834,0.036902,0.041913,0.009567,0.484738,0.014123,0.070615,0.088383,0.212756,0.022323
CONJ,0.060373,0.000549,0.040615,0.05708,0.035126,0.123491,0.150384,0.004391,0.113611,0.00933,0.349067,0.055982
NUM,0.001428,0.014281,0.18422,0.00357,0.119243,0.00357,0.020707,0.026062,0.035345,0.202428,0.35166,0.037487
ADV,0.012025,0.006982,0.029868,0.081458,0.139255,0.071373,0.339022,0.01474,0.130721,0.022886,0.032196,0.119472
.,0.068769,0.060079,0.07821,0.052569,0.092372,0.172192,0.08969,0.002789,0.046132,0.025641,0.218539,0.092908
DET,0.003306,0.000431,0.022855,0.012074,0.017393,0.006037,0.040247,0.000287,0.206411,0.045134,0.635906,0.009918
VERB,0.035543,0.005433,0.022836,0.083886,0.034807,0.13361,0.167956,0.030663,0.06639,0.21593,0.110589,0.092357
PRT,0.017613,0.002348,0.056751,0.009393,0.04501,0.10137,0.401174,0.001174,0.082975,0.012133,0.250489,0.019569
ADJ,0.000194,0.016893,0.021748,0.005243,0.066019,0.005243,0.011456,0.011456,0.063301,0.020971,0.696893,0.080583
X,0.0542,0.010379,0.003075,0.025754,0.160869,0.05689,0.206419,0.185086,0.017682,0.075726,0.061695,0.142226


In [26]:
def viterbi(words, train_bag = train_words):
    state = []
    tags_unique = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in tags_unique:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = emission_prob(words[key], tag)[0]/emission_prob(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = tags_unique[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [27]:
random.seed(1234)      #define a random seed to get same sentences when run multiple times

# choose random 10 numbers
rndom = [random.randint(1,len(test_set)) for x in range(10)]

# list of 10 sents on which we test the model
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [28]:
tagged_seq = viterbi(test_tagged_words)

# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 

accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Viterbi Algorithm Accuracy:  93.77990430622009
