<a href="https://colab.research.google.com/github/lorek/MethodsClassDimRed/blob/main/MoCaDR_List_nr_9_Hidden_Markov_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hidden Markov Models for "Part-of-speech tagging"

In [None]:
# Paweł Lorek

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import random
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.tag import hmm
import time



## Part-of-speech tagging:

\begin{array}{ccccc}
\textrm{The} & \textrm{dog} & \textrm{ate} & \textrm{the} & \textrm{cat} \\
\downarrow & \downarrow & \downarrow & \downarrow & \downarrow\\
\textrm{DT} & \textrm{NN} & \textrm{VBD} & \textrm{DT} & \textrm{NN}
\end{array}

So-called **Penn's tags** (See https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html or https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)

\begin{array}{ll|ll}
1. & \textrm{CC}   & \textrm{Coordinating conjunction}   & 19. & \textrm{PRP\$} & \textrm{Possessive pronoun} \\
2. & \textrm{CD}   & \textrm{Cardinal number}            & 20. & \textrm{RB}    & \textrm{Adverb} \\
3. & \textrm{DT}   & \textrm{Determiner}                 & 21. & \textrm{RBR}   & \textrm{Adverb, comparative} \\
4. & \textrm{EX}   & \textrm{Existential }  there         & 22. & \textrm{RBS}   & \textrm{Adverb, superlative} \\
5. & \textrm{FW}   & \textrm{Foreign word}               & 23. & \textrm{RP}    & \textrm{Particle} \\
6. & \textrm{IN}   & \textrm{Preposition or subordinating conjunction} & 24. & \textrm{SYM}   & \textrm{Symbol} \\
7. & \textrm{JJ}   & \textrm{Adjective}                  & 25. & \textrm{TO}    & {to} \\
8. & \textrm{JJR}  & \textrm{Adjective, comparative}     & 26. & \textrm{UH}    & \textrm{Interjection} \\
9. & \textrm{JJS}  & \textrm{Adjective, superlative}     & 27. & \textrm{VB}    & \textrm{Verb, base form} \\
10. & \textrm{LS}  & \textrm{List item marker}           & 28. & \textrm{VBD}   & \textrm{Verb, past tense} \\
11. & \textrm{MD}  & \textrm{Modal}                      & 29. & \textrm{VBG}   & \textrm{Verb, gerund or present participle} \\
12. & \textrm{NN}  & \textrm{Noun, singular or mass}     & 30. & \textrm{VBN}   & \textrm{Verb, past participle} \\
13. & \textrm{NNS} & \textrm{Noun, plural}               & 31. & \textrm{VBP}   & \textrm{Verb, non-3rd person singular present} \\
14. & \textrm{NNP} & \textrm{Proper noun, singular}      & 32. & \textrm{VBZ}   & \textrm{Verb, 3rd person singular present} \\
15. & \textrm{NNPS}& \textrm{Proper noun, plural}        & 33. & \textrm{WDT}   & \textrm{Wh-determiner} \\
16. & \textrm{PDT} & \textrm{Predeterminer}              & 34. & \textrm{WP}    & \textrm{Wh-pronoun} \\
17. & \textrm{POS} & \textrm{Possessive ending}          & 35. & \textrm{WP\$}  & \textrm{Possessive wh-pronoun} \\
18. & \textrm{PRP} & \textrm{Personal pronoun}           & 36. & \textrm{WRB}   & \textrm{Wh-adverb} \\
\end{array}


Read data, split into training and test sets

In [None]:
# Fetch the corpora used in this notebook.
nltk.download("brown")     # 'Brown' corpus
nltk.download("treebank")  # 'Tree bank' corpus; last expression, so its result is the cell output

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
# Experiment-size parameters: tune these instead of editing the code below.
NUM_SENTENCES = 40000   # take only the first 40k tagged sentences of the corpus
TRAIN_FRACTION = 0.9    # share of sentences used for training; the rest is the test set

all_data = list(brown.tagged_sents()[:NUM_SENTENCES])
# Alternative corpus: all_data = list(treebank.tagged_sents()[:5000])
print("len(all_data) = ", len(all_data))

# Number of sentences that go into the training split.
train_size = int(TRAIN_FRACTION * len(all_data))

len(all_data) =  40000


In [None]:
all_data[:2] # two sentences and their corresponding part-of-speech tags

[[('The', 'AT'),
  ('Fulton', 'NP-TL'),
  ('County', 'NN-TL'),
  ('Grand', 'JJ-TL'),
  ('Jury', 'NN-TL'),
  ('said', 'VBD'),
  ('Friday', 'NR'),
  ('an', 'AT'),
  ('investigation', 'NN'),
  ('of', 'IN'),
  ("Atlanta's", 'NP$'),
  ('recent', 'JJ'),
  ('primary', 'NN'),
  ('election', 'NN'),
  ('produced', 'VBD'),
  ('``', '``'),
  ('no', 'AT'),
  ('evidence', 'NN'),
  ("''", "''"),
  ('that', 'CS'),
  ('any', 'DTI'),
  ('irregularities', 'NNS'),
  ('took', 'VBD'),
  ('place', 'NN'),
  ('.', '.')],
 [('The', 'AT'),
  ('jury', 'NN'),
  ('further', 'RBR'),
  ('said', 'VBD'),
  ('in', 'IN'),
  ('term-end', 'NN'),
  ('presentments', 'NNS'),
  ('that', 'CS'),
  ('the', 'AT'),
  ('City', 'NN-TL'),
  ('Executive', 'JJ-TL'),
  ('Committee', 'NN-TL'),
  (',', ','),
  ('which', 'WDT'),
  ('had', 'HVD'),
  ('over-all', 'JJ'),
  ('charge', 'NN'),
  ('of', 'IN'),
  ('the', 'AT'),
  ('election', 'NN'),
  (',', ','),
  ('``', '``'),
  ('deserves', 'VBZ'),
  ('the', 'AT'),
  ('praise', 'NN'),
  ('and', 

## Porter Stemmer
**The Porter Stemming algorithm** (or **Porter Stemmer**) is used to remove the suffixes from an English word and obtain its stem which becomes very useful in the field of Information Retrieval (IR).

In [None]:
# Sample inflected forms to illustrate what the stemmer does.
words = [
    "program", "programs", "programmer", "programming", "programmers",
    "likes", "liked", "likely", "liking",
]

porter = PorterStemmer()

# Print every word next to its Porter stem.
for w, stem in zip(words, map(porter.stem, words)):
    print(w, " : ", stem)

program  :  program
programs  :  program
programmer  :  programm
programming  :  program
programmers  :  programm
likes  :  like
liked  :  like
likely  :  like
liking  :  like


Shuffling data and stemming:

In [None]:
# Fixed seed so that the train/test split (and therefore every accuracy reported
# below) is the same after "Restart kernel -> Run all".
SHUFFLE_SEED = 42
# A dedicated Random instance keeps the global `random` state untouched.
# `all_data` is still shuffled in place, exactly as before.
random.Random(SHUFFLE_SEED).shuffle(all_data)


def stem_sentences(sentences, stemmer):
    """Lower-case and stem every word of every tagged sentence.

    Parameters
    ----------
    sentences : iterable of sentences, each a sequence of (word, tag) pairs.
    stemmer : object with a ``stem(str) -> str`` method (here a PorterStemmer).

    Returns
    -------
    list of lists of (stemmed_word, tag) pairs; the tags are left unchanged.
    """
    return [
        [(stemmer.stem(word.lower()), tag) for word, tag in sent]
        for sent in sentences
    ]


# Only the stemming is timed (the shuffle above is excluded, as before).
start_time = time.time()

porter = PorterStemmer()
train_data = stem_sentences(all_data[:train_size], porter)
test_data = stem_sentences(all_data[train_size:], porter)
print("\t\t took %s seconds " % round((time.time() - start_time),5))

		 took 19.31378 seconds 


In [None]:
# First training sentence after shuffling, lower-casing and stemming.
train_data[0]

[('electron', 'NN'),
 ('microscop', 'JJ'),
 ('examin', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('af', 'NN'),
 ('sampl', 'NN'),
 ('show', 'VBD'),
 ('it', 'PPO'),
 ('to', 'TO'),
 ('be', 'BE'),
 ('compos', 'VBN'),
 ('of', 'IN'),
 ('nearli', 'QL'),
 ('isotrop', 'JJ'),
 ('particl', 'NNS'),
 ('about', 'RB'),
 ('0.3m', 'NNS'),
 ('in', 'IN'),
 ('diamet', 'NN'),
 ('.', '.')]

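## $n$-gram taggers:
An $n$-gram tagger "predicts" the tag of a word $w$ from the word itself and the tags of the $n-1$ preceding words. E.g., for $n=2$, suppose we are to tag **ate** in the sentence `The dog ate the cat`. We then take into account the tag already assigned to `dog` (here `NN`) together with the word `ate`, and check which tag was most frequent for `ate` in the training data when the preceding word was tagged `NN`.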

bigram tagger:

In [None]:
print("Calculating bigram tagger...", end="", flush=True)
start_time = time.time()
# Train a bigram tagger on the stemmed training sentences.
# NOTE(review): no backoff tagger is supplied; presumably this is why the score
# is low (unseen contexts get no tag) -- confirm against the NLTK book, ch. 5.
bigram_tagger = nltk.BigramTagger(train_data)
# `evaluate()` is deprecated in NLTK; `accuracy()` is the replacement named in
# the deprecation warning.
bigram_tagger_eval = bigram_tagger.accuracy(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("bigram tagger: ", bigram_tagger_eval)

Calculating bigram(frequency) tagger...		 took 3.68324 seconds 
bigram tagger:  0.30001211280323303


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bigram_tagger_eval =   bigram_tagger.evaluate(test_data)


trigram tagger:

In [None]:
print("Calculating trigram tagger...", end="", flush=True)
start_time = time.time()
# Train a trigram tagger on the stemmed training sentences.
# NOTE(review): no backoff tagger is supplied; presumably this is why the score
# is low (unseen contexts get no tag) -- confirm against the NLTK book, ch. 5.
trigram_tagger = nltk.TrigramTagger(train_data)
# `evaluate()` is deprecated in NLTK; `accuracy()` is the replacement named in
# the deprecation warning.
trigram_tagger_eval = trigram_tagger.accuracy(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("trigram tagger: ", trigram_tagger_eval)

Calculating trigram tagger...

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  trigram_tagger_eval =   trigram_tagger.evaluate(test_data)


		 took 6.17889 seconds 
trigram tagger:  0.13882373668968098


# Hidden Markov Model tagger

* Hidden states: tags
* Observations: words (of a sentence)



![picture](https://raw.githubusercontent.com/lorek/MethodsClassDimRed/main/figures/hmm_speach_tag.png)

**NOTE: may take ~ 5 min**

In [None]:
print("Calculating Hmm tagger...", end="", flush=True)
start_time = time.time()
# Fit the HMM on the tagged training sentences (supervised training).
hmm_trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(train_data)
# `evaluate()` is deprecated in NLTK; `accuracy()` is the replacement named in
# the deprecation warning. The timing covers both training and evaluation.
hmm_tagger_eval = hmm_tagger.accuracy(test_data)
print("\t\t took %s seconds " % round((time.time() - start_time),5))
print("hmm_tagger: ",hmm_tagger_eval )

Calculating Hmm tagger...

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_tagger_eval = hmm_tagger.evaluate(test_data)
  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])
  P[i] = self._priors.logprob(si)
  O[i, k] = self._output_logprob(si, self._symbols[k])


		 took 280.38428 seconds 
hmm_tagger:  0.7256339951328554


In [None]:

# Side-by-side summary of the accuracies computed in the cells above.
print("Comparison:")
for label, score in (
    ("bigram tagger: \t\t", bigram_tagger_eval),
    ("trigram tagger: \t", trigram_tagger_eval),
    ("hmm_tagger: \t\t", hmm_tagger_eval),
):
    print(label, score)

Comparison:
bigram tagger: 		 0.30001211280323303
trigram tagger: 	 0.13882373668968098
hmm_tagger: 		 0.7256339951328554


# Q9.1
* We performed part-of-speech tagging for the `brown` dataset. Do the same for the `treebank` dataset. Compare the results with and without stemming.