# Unsupervised Hidden Markov Model

### Author: Omer Waseem
#### Description: This Python notebook trains and evaluates a HMM using the CoNLL dataset

In [1]:
from dataprep import conll_words
from helper import accuracy, entity_count
import nltk
from sklearn.metrics import precision_recall_fscore_support

### Load CoNLL data from files

In [2]:
train_file = './datasets/CoNLL2003/eng.train'
testa_file = './datasets/CoNLL2003/eng.testa'
testb_file = './datasets/CoNLL2003/eng.testb'
testc_file = './datasets/CoNLL2003/eng.testc'

train_words, _, _, train_entities = conll_words(train_file)
testa_words, _, _, testa_entities = conll_words(testa_file)
testb_words, _, _, testb_entities = conll_words(testb_file)
testc_words, _, _, testc_entities = conll_words(testc_file)

### Combine training and testing datasets to form vocabulary and entitiy sets

In [3]:
combined_words = train_words + testa_words + testb_words + testc_words
combined_entities = train_entities + testa_entities + testb_entities + testc_entities

char_set = set()
for word in combined_words:
    for char in word:
        char_set.add(char)
entity_set = set(combined_entities)

### Train Unsupervised HMM
#### Note: testa is used for training since it is smaller in size, and unsupervised learning is used. This is to allow for more training interations.

In [4]:
trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=entity_set, symbols=char_set)
model = trainer.train_unsupervised(testa_words, max_iterations=100)

iteration 0 logprob -1426557.50223
iteration 1 logprob -1117558.59144
iteration 2 logprob -1110369.70708
iteration 3 logprob -1102886.49849
iteration 4 logprob -1094904.86769
iteration 5 logprob -1085588.68258
iteration 6 logprob -1073468.57339
iteration 7 logprob -1059869.72322
iteration 8 logprob -1048842.1155
iteration 9 logprob -1041855.9449
iteration 10 logprob -1036274.63472
iteration 11 logprob -1030640.56712
iteration 12 logprob -1024878.88372
iteration 13 logprob -1019548.33273
iteration 14 logprob -1015174.90795
iteration 15 logprob -1011786.20334
iteration 16 logprob -1009071.15325
iteration 17 logprob -1006673.13257
iteration 18 logprob -1004356.02605
iteration 19 logprob -1002096.77448
iteration 20 logprob -999971.392286
iteration 21 logprob -997999.821362
iteration 22 logprob -996151.791588
iteration 23 logprob -994426.07502
iteration 24 logprob -992869.202776
iteration 25 logprob -991524.965157
iteration 26 logprob -990393.249284
iteration 27 logprob -989438.716187
itera

### Evaluate on testb

In [5]:
testb_result = model.tag(testb_words)

In [7]:
testb_result[0:5]

[('SOCCER', 'LOC'),
 ('-', 'LOC'),
 ('JAPAN', 'LOC'),
 ('GET', 'LOC'),
 ('LUCKY', 'LOC')]

In [8]:
testb_predicted = []
for word, entity in testb_result:
    testb_predicted.append(entity)

In [12]:
accuracy(testb_entities, testb_predicted)

accuracy = 1925 / 46435 = 0.041456


In [13]:
precision_recall_fscore_support(testb_entities, testb_predicted)

  'precision', 'predicted', average, warn_for)


(array([ 0.0414558,  0.       ,  0.       ,  0.       ,  0.       ]),
 array([ 1.,  0.,  0.,  0.,  0.]),
 array([ 0.07961125,  0.        ,  0.        ,  0.        ,  0.        ]),
 array([ 1925,   918, 38323,  2496,  2773]))

In [14]:
entity_count(testb_entities)

ORG: 2496
PER: 2773
LOC: 1925
MISC: 918
O: 38323
