
### Use the function HiddenMarkovModelTagger.train() to estimate all probabilities for an HMM POS tagger from the MASC corpus.


In [1]:
## Initialization
#
# Imports plus the one-time NLTK resource downloads the rest of the notebook
# relies on.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# sent_tokenize / word_tokenize load the Punkt tokenizer models, which are a
# separate download from the corpus. Without them the tokenizing cells raise
# LookupError on a fresh environment. Recent NLTK releases look the models up
# under "punkt_tab", older ones under "punkt", so request both.
# quiet=True suppresses the progress log so this cell's displayed output stays
# limited to the corpus download below.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# MASC tagged corpus: training data for the HMM tagger. Kept as the last
# expression so the cell still displays the download status.
nltk.download("masc_tagged")



[nltk_data] Downloading package masc_tagged to
[nltk_data]     C:\Users\meiye\AppData\Roaming\nltk_data...
[nltk_data]   Package masc_tagged is already up-to-date!


True

In [3]:
## Fit an HMM POS tagger on the MASC corpus
##
from nltk.corpus import masc_tagged

# Training data: the MASC sentences, each one a sequence of (word, tag) pairs
s = masc_tagged.tagged_sents()

# Supervised training from the tagged sentences; printing the tagger shows
# its summary line
hmmTagger = nltk.tag.hmm.HiddenMarkovModelTagger.train(s)
print(hmmTagger)

# Quick sanity check on a short, already-tokenized sentence
tagsent = hmmTagger.tag(
    [
        'the', 'answer', 'is', 'blowing',
        'in', 'the', 'wind', '.',
    ]
)
tagsent

<HiddenMarkovModelTagger 60 states and 40333 output symbols>


[('the', 'DT'),
 ('answer', 'NN'),
 ('is', 'VBZ'),
 ('blowing', 'VBG'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('wind', 'NN'),
 ('.', '.')]

In [10]:
## Tag ordinary English text with the trained hmmTagger
##

txt = "She books the tickets to California. Then they buy the book, drop it to the back seat and go home. She also asks her mom to bring back the dogs."

# Step 1: split the raw text into sentence strings
txt_sen = sent_tokenize(txt)

# Step 2: word-tokenize each sentence; every sentence becomes a tuple of tokens
txt_word = [tuple(words) for words in map(word_tokenize, txt_sen)]

# Step 3: tag sentence by sentence, giving one list of (word, tag) pairs each
txtTag = list(map(hmmTagger.tag, txt_word))
print(txtTag)

[[('She', 'PRP'), ('books', 'VBP'), ('the', 'DT'), ('tickets', 'NNS'), ('to', 'TO'), ('California', 'NNP'), ('.', '.')], [('Then', 'RB'), ('they', 'PRP'), ('buy', 'VBP'), ('the', 'DT'), ('book', 'NN'), (',', ','), ('drop', 'VB'), ('it', 'PRP'), ('to', 'TO'), ('the', 'DT'), ('back', 'NN'), ('seat', 'NN'), ('and', 'CC'), ('go', 'VB'), ('home', 'NN'), ('.', '.')], [('She', 'PRP'), ('also', 'RB'), ('asks', 'VBZ'), ('her', 'PRP$'), ('mom', 'NN'), ('to', 'TO'), ('bring', 'VB'), ('back', 'RB'), ('the', 'DT'), ('dogs', 'NNS'), ('.', '.')]]


In [125]:
## Tag a second text made mostly of invented words, to see how the tagger
## labels tokens such as 'hifnipork' or 'intersplicks'
##
txt2 = 'Misjoggle in a gripty hifnipork . One fretigy kriptog is always better than several intersplicks .'

# Sentence strings first, then one tuple of word tokens per sentence
txt_sen2 = sent_tokenize(txt2)
txt_word2 = [tuple(words) for words in map(word_tokenize, txt_sen2)]

# One tagged sentence (list of (word, tag) pairs) per input sentence
txtTag2 = list(map(hmmTagger.tag, txt_word2))
txtTag2

[[('Misjoggle', 'RB'),
  ('in', 'IN'),
  ('a', 'DT'),
  ('gripty', 'JJ'),
  ('hifnipork', 'NNS'),
  ('.', '.')],
 [('One', 'CD'),
  ('fretigy', 'NNS'),
  ('kriptog', 'WDT'),
  ('is', 'VBZ'),
  ('always', 'RB'),
  ('better', 'JJR'),
  ('than', 'IN'),
  ('several', 'JJ'),
  ('intersplicks', 'NNS'),
  ('.', '.')]]