In [1]:
import pandas as pd
from collections import Counter as ctr


# Intro

The following few cells are meant to import the training data and briefly inspect it

In [2]:
train = pd.read_csv('train.txt', delimiter = " ", names=['word', 'pos', 'other'], header=None)
train['word'] = train.word.str.lower()

In [3]:
train.head()

Unnamed: 0,word,pos,other
0,confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP
3,pound,NN,I-NP
4,is,VBZ,B-VP


# Preprocessing

Editing the training data to make it useful for transmission


In [4]:
train['ppos'] = train.pos.shift(1)
train['ppos'] = train.ppos.fillna('0')
train.head()

Unnamed: 0,word,pos,other,ppos
0,confidence,NN,B-NP,0
1,in,IN,B-PP,NN
2,the,DT,B-NP,IN
3,pound,NN,I-NP,DT
4,is,VBZ,B-VP,NN


# Methods for Tagging

Defining Ptt and Pwt
and the emission/transmission methods for greedy

In [5]:
smooth = 0.00001
pos_ppos = {}
pos_ppos_denom = {}
for col_name in set(train.ppos):
    sub_frame = train[train.ppos == col_name]
    counted = ctr(sub_frame.pos)
    pos_ppos[col_name] = counted
    pos_ppos_denom[col_name] = len(sub_frame)
    

# conditional 
def Ptt(Ti,Tprev):
    if Ti not in pos_ppos[Tprev]: return smooth
    return pos_ppos[Tprev][Ti] / pos_ppos_denom[Tprev]

word_pos = {}
word_pos_denom = {}
for col_name in set(train.pos):
    sub_frame = train[train.pos == col_name]
    counted = ctr(sub_frame.word)
    word_pos[col_name] = counted
    word_pos_denom[col_name] = len(sub_frame)
    

# conditional 
def Pwt(W, T):
    if W not in word_pos[T]: return smooth
    return word_pos[T][W] / word_pos_denom[T]

In [6]:
Ptt(Ti='NN',Tprev='DT')

0.48453776929370057

In [7]:
pos = list(set(train.pos))
def emission(W=''):
    return[[T,Pwt(W,T)] for T in pos]
    
def transmission(X=""):
    return[(T, Ptt(T,X))  for T in pos]

#Importing data to test on

Importing the data and preprocessing it. 

In [8]:
test = pd.read_csv('test.txt', delimiter=' ', names=['word', 'pos', 'other'])
test['word'] = test.word.str.lower()

# Greedy Tagger

This is the greedy version of the part of speech tagger

In [9]:
count = 0
hyp = []
for word in test['word']:
    #print(word)
    if count == 0:
        tag = max(emission(word))
        count += 1
        hyp.append(tag[0])
    else:
        #print(tag)
        tprev = list(tag)
        emissions = emission(word)
        transmissions = transmission(tprev[0])
        maxTag = 0
        for k, k2 in zip(emissions,transmissions):
            #print(k[1], k2[1])
            if((k[1]*k2[1]) > maxTag):
                maxTag = k[1]*k2[1]
                tag = k[0],(k[1]*k2[1])
        #print(tag[0])
        hyp.append(tag[0])
        

# Post Processing

Combining results with data frame and testing

In [10]:
hper = pd.DataFrame(hyp)
test['hyp'] = hper
test.head()

Unnamed: 0,word,pos,other,hyp
0,rockwell,NNP,B-NP,``
1,international,NNP,I-NP,NNP
2,corp.,NNP,I-NP,NNP
3,'s,POS,B-NP,POS
4,tulsa,NNP,I-NP,NNP


In [11]:
from sklearn.metrics import accuracy_score

accuracy_score(test.pos, test.hyp)


0.9055870992253625

# Confusion Matrix

Made the matrix and evaluated

In [12]:
matrix = pd.crosstab(test.pos, test.hyp, rownames=['True'], colnames=['Predicted'], margins=True)

In [13]:
for part in sorted(pos):
    if part in matrix:
        precision = matrix[part][part]/matrix[part]['All']
        print (part)
        print(precision)
        #print(matrix[part][part])
        #print(matrix[part]['All'])

#
1.0
$
1.0
''
0.9968253968253968
(
1.0
)
1.0
,
0.9983291562238931
.
1.0
:
1.0
CC
0.9967051070840197
CD
0.9948834565093804
DT
0.7989152269987947
EX
0.9791666666666666
FW
1.0
IN
0.8907413976587443
JJ
0.875733137829912
JJR
0.8387096774193549
JJS
0.9863013698630136
MD
0.985200845665962
NN
0.8448616600790514
NNP
0.8614177090998283
NNPS
0.7272727272727273
NNS
0.9595775673707211
PDT
0.4
POS
0.9923076923076923
PRP
0.9564705882352941
PRP$
1.0
RB
0.9376996805111821
RBR
0.7446808510638298
RBS
1.0
RP
0.25
TO
1.0
UH
1.0
VB
0.9054263565891473
VBD
0.9256550883607556
VBG
0.9690553745928339
VBN
0.8855975485188968
VBP
0.9048582995951417
VBZ
0.9214365881032548
WDT
0.905511811023622
WP
1.0
WP$
1.0
WRB
1.0
``
0.9969135802469136


In [14]:
from sklearn.metrics import accuracy_score

accuracy_score(test.pos, test.hyp)

0.9055870992253625

# Viterbi

Define the method and implement

In [15]:
def viterbi(obs):
    opt= []
    V = [{}]
    for st in pos:
        V[0][st] = {"prob": Pwt(obs[0],st), "prev": None}
    for t in range(1, len(obs)):
        V.append({})
        for st in pos:
                #print(V[t-1][pos[0]]["prob"])
            max_tr_prob = V[t-1][pos[0]]["prob"]*Ptt(pos[0],st)
            prev_st_selected = pos[0]
            for prev_st in pos[1:]:
                    #print(V[t-1][prev_st]["prob"])
                tr_prob = V[t-1][prev_st]["prob"]*Ptt(st, prev_st)
                if tr_prob > max_tr_prob:
                    max_tr_prob = tr_prob
                    prev_st_selected = prev_st

            max_prob = max_tr_prob * Pwt(obs[t],st)
            V[t][st] = {"prob": max_prob, "prev": prev_st_selected}
    max_prob = max(value["prob"] for value in V[-1].values())
    previous = None
        # Get most probable state and its backtrack
    for st, data in V[-1].items():
        if data["prob"] == max_prob:
            opt.append(st)
            previous = st
            break
    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1][previous]["prev"])
        previous = V[t + 1][previous]["prev"]
            
    return opt 

# Preprocessing test words and testing them

In [16]:
sentence =[]
sentences= []
for word in test['word']:
    sentence.append(word)
    if word == '.':
        sentences.append(sentence)
        sentence = []

In [17]:
opt = []
for sentence in sentences:
    opt.append(viterbi(sentence))

In [18]:
#print(opt)
opt2 = []
for sentence in opt:
    for word in sentence:
        opt2.append(word)

In [19]:
hper = pd.DataFrame(opt2)
test['hyp'] = hper


In [20]:
accuracy_score(test.pos, test.hyp)

0.9208476687000021

In [21]:
matrix = pd.crosstab(test.pos, test.hyp, rownames=['True'], colnames=['Predicted'], margins=True)
for part in sorted(pos):
    if part !=  'SYM':
        precision = matrix[part][part]/matrix[part]['All']
        print (part)
        print(precision)
        #print(matrix[part][part])
        #print(matrix[part]['All'])

#
0.4782608695652174
$
0.9297820823244553
''
0.9936708860759493
(
1.0
)
0.9746835443037974
,
0.9719398129320862
.
0.9889834752128193
:
0.9957805907172996
CC
0.9967078189300411
CD
0.9741282339707537
DT
0.8953044259716918
EX
0.8135593220338984
FW
0.037037037037037035
IN
0.9397136397889977
JJ
0.8593231585932316
JJR
0.9463414634146341
JJS
0.9166666666666666
MD
0.975
NN
0.8720694645441389
NNP
0.8855015126832674
NNPS
0.7171717171717171
NNS
0.9632488055861815
PDT
0.4090909090909091
POS
0.9907621247113164
PRP
0.949648711943794
PRP$
1.0
RB
0.933641975308642
RBR
0.9354838709677419
RBS
0.8431372549019608
RP
0.2727272727272727
TO
0.9792186201163757
UH
0.047619047619047616
VB
0.8996913580246914
VBD
0.938949938949939
VBG
0.966996699669967
VBN
0.855410447761194
VBP
0.8984375
VBZ
0.9542857142857143
WDT
0.8767123287671232
WP
0.8870967741935484
WP$
0.2
WRB
1.0
``
0.9847560975609756


# Results

As to be expected the viterbi runs slightly better than the greedy algorithm. The confusion matrix shows that WP$ was the hardest part of speech to identify. 