## POS Tagging for  English Penn Treebank (PTB) datset

In [29]:
#import utilities
import numpy as np # linear algebra
import pandas as pd # data processing
from urllib.request import urlopen
import re
import json
from sklearn.model_selection import train_test_split

In [30]:
with open("penn-data.json") as file:
    data = json.load(file)

In [31]:
print("DATA Size: ",len(data))
print(data[0])

DATA Size:  3914
['Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.', ['NNP', 'NNP', 'CD', 'NNS', 'JJ', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD']]
DATA Size:  3914
['Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.', ['NNP', 'NNP', 'CD', 'NNS', 'JJ', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD']]


In [32]:
#Data preprocessing
pairs=[]
for sentence,labels in data:
    sentence=re.sub(r"[,.-]","",sentence).lower().split()
#     labels = labels.remove(',','-',':','-RRB-',"''",'#')
    pairs.append([])
    for i in range(len(sentence)):
        if len(sentence)==len(labels):
            pairs[-1].append([sentence[i],labels[i]])

In [33]:
print("Size: ", len(pairs)),pairs[0]

Size:  3914


(None,
 [['pierre', 'NNP'],
  ['vinken', 'NNP'],
  ['61', 'CD'],
  ['years', 'NNS'],
  ['old', 'JJ'],
  ['will', 'MD'],
  ['join', 'VB'],
  ['the', 'DT'],
  ['board', 'NN'],
  ['as', 'IN'],
  ['a', 'DT'],
  ['nonexecutive', 'JJ'],
  ['director', 'NN'],
  ['nov', 'NNP'],
  ['29', 'CD']])

Size:  3914


(None,
 [['pierre', 'NNP'],
  ['vinken', 'NNP'],
  ['61', 'CD'],
  ['years', 'NNS'],
  ['old', 'JJ'],
  ['will', 'MD'],
  ['join', 'VB'],
  ['the', 'DT'],
  ['board', 'NN'],
  ['as', 'IN'],
  ['a', 'DT'],
  ['nonexecutive', 'JJ'],
  ['director', 'NN'],
  ['nov', 'NNP'],
  ['29', 'CD']])

In [34]:
#split dataset into train test
splitsize = int(0.8*len(pairs))

train_data = pairs[:splitsize]
test_data = pairs[splitsize:]

In [35]:
#check unique words and unique tags in dataset
utags = set()
uwords = set()
for i in train_data:
    for w,t in i:
        uwords.add(w)
        utags.add(t)
print("No. of unique tags = ", len(utags))
print("Tags : ", utags)
print("NO. of unique words: ", len(uwords))

No. of unique tags =  41
Tags :  {'DT', 'PRP$', 'VBN', 'JJ', 'MD', 'NNPS', 'NNP', 'NN', 'IN', 'JJS', 'RBS', 'VBD', 'VB', 'VBZ', 'WP', 'WRB', 'PRP', '-LRB-', ':', 'RB', 'PDT', 'CD', 'UH', 'WP$', "''", 'RBR', '-RRB-', 'EX', 'RP', 'FW', 'LS', 'VBG', 'TO', 'SYM', 'NNS', 'WDT', 'CC', ',', '#', 'VBP', 'JJR'}
NO. of unique words:  10348
No. of unique tags =  41
Tags :  {'DT', 'PRP$', 'VBN', 'JJ', 'MD', 'NNPS', 'NNP', 'NN', 'IN', 'JJS', 'RBS', 'VBD', 'VB', 'VBZ', 'WP', 'WRB', 'PRP', '-LRB-', ':', 'RB', 'PDT', 'CD', 'UH', 'WP$', "''", 'RBR', '-RRB-', 'EX', 'RP', 'FW', 'LS', 'VBG', 'TO', 'SYM', 'NNS', 'WDT', 'CC', ',', '#', 'VBP', 'JJR'}
NO. of unique words:  10348


In [36]:
unwanted_tags = [',',"''",':','#','-RRB-']
for t in unwanted_tags:
    if t in utags:
        utags.remove(t)
        
print("No. of unique tags = ", len(utags))
print("Tags : ", utags)

No. of unique tags =  36
Tags :  {'DT', 'PRP$', 'VBN', 'JJ', 'MD', 'NNPS', 'NNP', 'NN', 'IN', 'JJS', 'RBS', 'VBD', 'VB', 'VBZ', 'WP', 'WRB', 'PRP', '-LRB-', 'RB', 'PDT', 'CD', 'UH', 'WP$', 'RBR', 'EX', 'RP', 'FW', 'LS', 'VBG', 'TO', 'SYM', 'NNS', 'WDT', 'CC', 'VBP', 'JJR'}
No. of unique tags =  36
Tags :  {'DT', 'PRP$', 'VBN', 'JJ', 'MD', 'NNPS', 'NNP', 'NN', 'IN', 'JJS', 'RBS', 'VBD', 'VB', 'VBZ', 'WP', 'WRB', 'PRP', '-LRB-', 'RB', 'PDT', 'CD', 'UH', 'WP$', 'RBR', 'EX', 'RP', 'FW', 'LS', 'VBG', 'TO', 'SYM', 'NNS', 'WDT', 'CC', 'VBP', 'JJR'}


In [37]:
tagsdict={i:ind for ind,i in enumerate(utags)}
wordsdict={i:ind for ind,i in enumerate(uwords)}

In [38]:
#Generate Inititial , transition and emmision probablity matrxes
nw = len(uwords)
nt = len(utags)
prev=None
next=None
pi=np.ones((nt,1),dtype='float64') # pi initial probabilities
TMatrix=np.ones((nt,nt),dtype='float64') # Transition matrix
EMatrix=np.ones((nt,nw),dtype='float64') # Emmision matrix
for i in train_data:
    for ind,[word,tag] in enumerate(i):
        if tag not in utags:
            continue
        if ind==0:
            pi[tagsdict[tag]][0]+=1
            prev=tag
            continue
        next=tag
        TMatrix[tagsdict[prev]][tagsdict[next]]+=1
        EMatrix[tagsdict[tag]][wordsdict[word]]+=1
        prev=tag
        
pi=pi/pi.sum(axis=1)[:,np.newaxis]
TMatrix=TMatrix/TMatrix.sum(axis=1)[:,np.newaxis]
EMatrix=EMatrix/EMatrix.sum(axis=1)[:,np.newaxis]

In [39]:
pi.shape,TMatrix.shape, EMatrix.shape

((36, 1), (36, 36), (36, 10348))

((36, 1), (36, 36), (36, 10348))

In [40]:
transmatrix = pd.DataFrame(TMatrix, columns = sorted(tagsdict.keys(), key=lambda x:tagsdict[x]))
transmatrix

Unnamed: 0,DT,PRP$,VBN,JJ,MD,NNPS,NNP,NN,IN,JJS,...,FW,LS,VBG,TO,SYM,NNS,WDT,CC,VBP,JJR
0,0.002594,0.000324,0.010052,0.205739,0.001783,0.003405,0.128243,0.465305,0.011187,0.009079,...,0.000162,0.000162,0.011511,0.000486,0.000162,0.085441,0.000324,0.000811,0.001135,0.006485
1,0.001587,0.001587,0.009524,0.21746,0.001587,0.003175,0.052381,0.414286,0.003175,0.014286,...,0.001587,0.001587,0.007937,0.001587,0.001587,0.203175,0.001587,0.001587,0.001587,0.003175
2,0.076023,0.016244,0.02859,0.059779,0.0013,0.00065,0.025341,0.064327,0.384665,0.00065,...,0.00065,0.0013,0.021442,0.109812,0.00065,0.045484,0.0013,0.020143,0.002599,0.004548
3,0.006887,0.000459,0.003214,0.078053,0.000918,0.001148,0.03742,0.450184,0.062672,0.000689,...,0.00023,0.00023,0.005739,0.02686,0.00023,0.258035,0.00023,0.021809,0.001607,0.000689
4,0.008571,0.001429,0.001429,0.001429,0.001429,0.001429,0.002857,0.001429,0.002857,0.001429,...,0.001429,0.001429,0.002857,0.004286,0.001429,0.002857,0.001429,0.001429,0.001429,0.001429
5,0.027322,0.010929,0.016393,0.010929,0.038251,0.010929,0.338798,0.054645,0.092896,0.005464,...,0.005464,0.005464,0.016393,0.016393,0.005464,0.021858,0.016393,0.071038,0.043716,0.005464
6,0.034169,0.000587,0.006599,0.02537,0.010265,0.016718,0.466051,0.083883,0.057633,0.001466,...,0.000147,0.000147,0.005133,0.008212,0.000293,0.029623,0.006893,0.049861,0.006159,0.001027
7,0.028111,0.001321,0.012854,0.018861,0.018381,0.00024,0.032436,0.162422,0.290605,0.001081,...,0.00036,0.00024,0.015978,0.043128,0.00012,0.099471,0.015257,0.055022,0.006247,0.001922
8,0.320426,0.036023,0.005262,0.098759,0.00027,0.002294,0.153265,0.106449,0.016595,0.004587,...,0.00027,0.000135,0.03238,0.002428,0.000135,0.065704,0.004047,0.001349,0.000405,0.006476
9,0.024242,0.006061,0.024242,0.115152,0.006061,0.012121,0.018182,0.230303,0.133333,0.006061,...,0.006061,0.006061,0.024242,0.006061,0.006061,0.145455,0.006061,0.012121,0.012121,0.006061


Unnamed: 0,DT,PRP$,VBN,JJ,MD,NNPS,NNP,NN,IN,JJS,...,FW,LS,VBG,TO,SYM,NNS,WDT,CC,VBP,JJR
0,0.002594,0.000324,0.010052,0.205739,0.001783,0.003405,0.128243,0.465305,0.011187,0.009079,...,0.000162,0.000162,0.011511,0.000486,0.000162,0.085441,0.000324,0.000811,0.001135,0.006485
1,0.001587,0.001587,0.009524,0.21746,0.001587,0.003175,0.052381,0.414286,0.003175,0.014286,...,0.001587,0.001587,0.007937,0.001587,0.001587,0.203175,0.001587,0.001587,0.001587,0.003175
2,0.076023,0.016244,0.02859,0.059779,0.0013,0.00065,0.025341,0.064327,0.384665,0.00065,...,0.00065,0.0013,0.021442,0.109812,0.00065,0.045484,0.0013,0.020143,0.002599,0.004548
3,0.006887,0.000459,0.003214,0.078053,0.000918,0.001148,0.03742,0.450184,0.062672,0.000689,...,0.00023,0.00023,0.005739,0.02686,0.00023,0.258035,0.00023,0.021809,0.001607,0.000689
4,0.008571,0.001429,0.001429,0.001429,0.001429,0.001429,0.002857,0.001429,0.002857,0.001429,...,0.001429,0.001429,0.002857,0.004286,0.001429,0.002857,0.001429,0.001429,0.001429,0.001429
5,0.027322,0.010929,0.016393,0.010929,0.038251,0.010929,0.338798,0.054645,0.092896,0.005464,...,0.005464,0.005464,0.016393,0.016393,0.005464,0.021858,0.016393,0.071038,0.043716,0.005464
6,0.034169,0.000587,0.006599,0.02537,0.010265,0.016718,0.466051,0.083883,0.057633,0.001466,...,0.000147,0.000147,0.005133,0.008212,0.000293,0.029623,0.006893,0.049861,0.006159,0.001027
7,0.028111,0.001321,0.012854,0.018861,0.018381,0.00024,0.032436,0.162422,0.290605,0.001081,...,0.00036,0.00024,0.015978,0.043128,0.00012,0.099471,0.015257,0.055022,0.006247,0.001922
8,0.320426,0.036023,0.005262,0.098759,0.00027,0.002294,0.153265,0.106449,0.016595,0.004587,...,0.00027,0.000135,0.03238,0.002428,0.000135,0.065704,0.004047,0.001349,0.000405,0.006476
9,0.024242,0.006061,0.024242,0.115152,0.006061,0.012121,0.018182,0.230303,0.133333,0.006061,...,0.006061,0.006061,0.024242,0.006061,0.006061,0.145455,0.006061,0.012121,0.012121,0.006061


In [41]:
print('The probability of DT occuring after NN is : ', TMatrix[tagsdict['NN']][tagsdict['DT']])

The probability of DT occuring after NN is :  0.028111484863046612
The probability of DT occuring after NN is :  0.028111484863046612



The probability of the tag sequence "PP VB NN" for the word sequence "I like food"?

P(START) P(PP|START) P(I | PP) P(VB | PP) P(like | VB) P(NN | VB) P(food | NN) * P(END | VB)

In [42]:

p_start = pi[tagsdict['PRP']]
p_vb_prp = TMatrix[tagsdict['PRP']][tagsdict['VB']]
p_love_vb = EMatrix[tagsdict['VB']][wordsdict['like']]
p_nn_vb = TMatrix[tagsdict['VB']][tagsdict['NN']]
p_food_nn = EMatrix[tagsdict['NN']][wordsdict['food']]

In [43]:
print("Probablity of POS tag PP VB NN is :", p_start*p_vb_prp*p_love_vb*p_nn_vb*p_food_nn)

Probablity of POS tag PP VB NN is : [3.10220267e-10]
Probablity of POS tag PP VB NN is : [3.10220267e-10]


## Viterbi

In [44]:
def viterbi(sentence, utags = list(utags)):
    
    state = []
    for i, word in enumerate(sentence):
        
        p = []
        for tag in utags:
            if i == 0:
                tp = pi[tagsdict[tag]]
            else:
                tp = TMatrix[tagsdict[state[-1]]][tagsdict[tag]]
            # compute emission and state probabilities
            if word not in uwords: ##handling unknown word with default probablity
                ep = 0.0001
            else:
                ep = EMatrix[tagsdict[tag]][wordsdict[word]]
            state_prob = tp*ep
            p.append(state_prob)
            
        #get state or which prob is max
        pmax = max(p)
        state_max = utags[p.index(pmax)] 
        state.append(state_max)
        
    return state
    
        
    
    

In [45]:
print(viterbi(['i', 'love', 'to','eat','food']))

['PRP', 'VBD', 'TO', 'VB', 'NN']
['PRP', 'VBD', 'TO', 'VB', 'NN']


In [46]:
len(test_data),test_data[0]

(783,
 [['the', 'DT'],
  ['latest', 'JJS'],
  ['10year', 'JJ'],
  ['notes', 'NNS'],
  ['were', 'VBD'],
  ['quoted', 'VBN'],
  ['at', 'IN'],
  ['100', 'CD'],
  ['22/32', 'CD'],
  ['to', 'TO'],
  ['yield', 'VB'],
  ['788%', 'CD'],
  ['compared', 'VBN'],
  ['with', 'IN'],
  ['100', 'CD'],
  ['16/32', 'CD'],
  ['to', 'TO'],
  ['yield', 'VB'],
  ['790%', 'CD']])

(783,
 [['the', 'DT'],
  ['latest', 'JJS'],
  ['10year', 'JJ'],
  ['notes', 'NNS'],
  ['were', 'VBD'],
  ['quoted', 'VBN'],
  ['at', 'IN'],
  ['100', 'CD'],
  ['22/32', 'CD'],
  ['to', 'TO'],
  ['yield', 'VB'],
  ['788%', 'CD'],
  ['compared', 'VBN'],
  ['with', 'IN'],
  ['100', 'CD'],
  ['16/32', 'CD'],
  ['to', 'TO'],
  ['yield', 'VB'],
  ['790%', 'CD']])

In [47]:
def test_viterbi(test_dataset,  utags = list(utags)):
    total_pred = 0
    correct_pred = 0
    for sent in test_dataset:
        words = []
        true_tags = []
        for word in sent:
            words.append(word[0])
            true_tags.append(word[1])
        pred = viterbi(words, utags)
        for i in range(len(pred)):
            if pred[i] == true_tags[i]:
                correct_pred +=1
                total_pred +=1
            else:
                total_pred +=1
    accuracy = (correct_pred/total_pred)*100
    return accuracy

In [48]:
accuracy = test_viterbi(test_data)
print(accuracy)

74.48483297068988
74.48483297068988


In [49]:
def class_wise_accuracy(test_dataset, utags = list(utags)):
    accuracy = np.zeros((len(utags),2))
    for sent in test_dataset:
        words = []
        true_tags = []
        for word, tag in sent:
            words.append(word)
            true_tags.append(tag)
        pred = viterbi(words)
        
        for i,tag in enumerate(true_tags):
            if tag in unwanted_tags:
                continue
            if pred[i] == true_tags[i]:
                accuracy[tagsdict[tag]][0]+=1;
                accuracy[tagsdict[tag]][1]+=1;
            else:
                accuracy[tagsdict[tag]][1]+=1;
        ans = np.divide(accuracy[:,0],accuracy[:,1])
    return ans
            
    
    


In [50]:
accuracy = class_wise_accuracy(test_data)



  ans = np.divide(accuracy[:,0],accuracy[:,1])
  ans = np.divide(accuracy[:,0],accuracy[:,1])


In [51]:
class_accuracy = pd.DataFrame(data = accuracy, index =sorted(tagsdict.keys(), key=lambda x:tagsdict[x]) ,columns = ['Accuracy'])
class_accuracy

Unnamed: 0,Accuracy
DT,0.983161
PRP$,0.933333
VBN,0.495455
JJ,0.609152
MD,0.787879
NNPS,0.032258
NNP,0.544268
NN,0.811384
IN,0.980444
JJS,0.105263


Unnamed: 0,Accuracy
DT,0.983161
PRP$,0.933333
VBN,0.495455
JJ,0.609152
MD,0.787879
NNPS,0.032258
NNP,0.544268
NN,0.811384
IN,0.980444
JJS,0.105263


In [52]:
#Collapse 36 tags into  tags
collapse = {}
for tag in utags:
    if tag[0] == 'V':
        collapse[tag] = 'V'
    elif tag[0] == 'N':
        collapse[tag] = 'N'
    elif tag[0] == 'J' or tag[0] == 'R':
        collapse[tag] = 'A'
    else:
        collapse[tag] = 'O'
otags = utags       
utags = set(['V','N','A','O'])
tagsdict={i:ind for ind,i in enumerate(utags)}

In [53]:
#Generate Inititial , transition and emmision probablity matrxes
nw = len(uwords)
nt = len(utags)
prev=None
next=None
pi=np.ones((nt,1),dtype='float64') # pi initial probabilities
TMatrix=np.ones((nt,nt),dtype='float64') # Transition matrix
EMatrix=np.ones((nt,nw),dtype='float64') # Emmision matrix
for i in train_data:
    for ind,[word,tag] in enumerate(i):
        if tag not in otags:
            continue
        tag = collapse[tag]
        if ind==0:
            pi[tagsdict[tag]][0]+=1
            prev=tag
            continue
        next=tag
        TMatrix[tagsdict[prev]][tagsdict[next]]+=1
        EMatrix[tagsdict[tag]][wordsdict[word]]+=1
        prev=tag
        
pi=pi/pi.sum(axis=1)[:,np.newaxis]
TMatrix=TMatrix/TMatrix.sum(axis=1)[:,np.newaxis]
EMatrix=EMatrix/EMatrix.sum(axis=1)[:,np.newaxis]

In [54]:
pi.shape,TMatrix.shape, EMatrix.shape

((4, 1), (4, 4), (4, 10348))

((4, 1), (4, 4), (4, 10348))

In [55]:
TMatrix

array([[0.301047  , 0.1667632 , 0.14827356, 0.38391624],
       [0.52933464, 0.15382935, 0.16661204, 0.15022397],
       [0.23499343, 0.11260406, 0.13275887, 0.51964364],
       [0.4056363 , 0.18273689, 0.05344585, 0.35818097]])

array([[0.301047  , 0.1667632 , 0.14827356, 0.38391624],
       [0.52933464, 0.15382935, 0.16661204, 0.15022397],
       [0.23499343, 0.11260406, 0.13275887, 0.51964364],
       [0.4056363 , 0.18273689, 0.05344585, 0.35818097]])

In [58]:
sentence = ['i', 'love', 'to','eat','food']
print(viterbi(sentence, utags = list(utags)))

['O', 'N', 'O', 'N', 'N']
['O', 'N', 'O', 'N', 'N']


In [60]:
accuracy = test_viterbi(test_data,utags = list(utags))
print('Accuracy = ', accuracy)

Accuracy =  0.0
Accuracy =  0.0
