### Regular Expression Tagger

In [1]:
from nltk import RegexpTagger
from nltk.corpus import brown


In [2]:
# Load the 'news' category of the Brown corpus as plain sentences:
# each sentence is a list of word strings with no POS tags attached.
brown_sents = brown.sents(categories='news') 
# Ordered (regex, tag) rules for the regular-expression tagger.
# Order matters: the more specific suffixes ('es', "'s") are listed before the
# generic plural 's', and the catch-all noun rule must stay last.
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    # Cardinal numbers: optional sign, digits, optional decimal part.
    # The dot is escaped so that only a literal '.' introduces the fraction;
    # an unescaped '.' would also accept tokens such as '3x14' or '10:30'.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'.*', 'NN'),                     # nouns (default)
]

In [3]:
# Build a tagger from the ordered (regex, tag) rules in `patterns`.
regexp_tagger = RegexpTagger(patterns)
# Tag the untagged sentence at index 3; as the last expression of the cell,
# the resulting list of (word, tag) pairs is displayed as the cell output.
regexp_tagger.tag(brown_sents[3])

[('``', 'NN'),
 ('Only', 'NN'),
 ('a', 'NN'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'NN'),
 ('such', 'NN'),
 ('reports', 'NNS'),
 ('was', 'NNS'),
 ('received', 'VBD'),
 ("''", 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('jury', 'NN'),
 ('said', 'NN'),
 (',', 'NN'),
 ('``', 'NN'),
 ('considering', 'VBG'),
 ('the', 'NN'),
 ('widespread', 'NN'),
 ('interest', 'NN'),
 ('in', 'NN'),
 ('the', 'NN'),
 ('election', 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('number', 'NN'),
 ('of', 'NN'),
 ('voters', 'NNS'),
 ('and', 'NN'),
 ('the', 'NN'),
 ('size', 'NN'),
 ('of', 'NN'),
 ('this', 'NNS'),
 ('city', 'NN'),
 ("''", 'NN'),
 ('.', 'NN')]

In [4]:
# Load the 'news' category of the Brown corpus WITH its tags:
# each sentence is a list of (word, tag) tuples.
brown_tagged_sents = brown.tagged_sents(categories='news')

In [5]:
# Take the first five tokens of sentence 1 from each view of the corpus
# so the untagged words and the (word, tag) pairs can be compared side by side.
untagged_head = brown_sents[1][:5]
tagged_head = brown_tagged_sents[1][:5]

print(
    'The same sentence in same word order in both the tagged and untagged brown dataset\n',
    untagged_head,
)
print('\n', tagged_head)

The same sentence in same word order in both the tagged and untagged brown dataset
 ['The', 'jury', 'further', 'said', 'in']

 [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN')]


&nbsp;
### search() vs. match()

In [6]:
import re
# re.match only tries the pattern at the very start of the string.
print('1)',re.match("d","dog"))     # Match at index 0
print('2)',re.match("g","dog"))  # No match; "g" is not at the beginning of the string

1) <_sre.SRE_Match object; span=(0, 1), match='d'>
2) None


In [7]:
# re.search scans the whole string, so the pattern may match at any position.
print('1)',re.search("d","dog"))     # Match at index 0
print('2)',re.search("g","dog"))  # Match at index 2

1) <_sre.SRE_Match object; span=(0, 1), match='d'>
2) <_sre.SRE_Match object; span=(2, 3), match='g'>


In [8]:
# Find every word that starts with an uppercase ASCII letter:
# \b anchors at a word boundary, [A-Z] requires a capital first letter,
# and \w* consumes the rest of the word.
# The sample text is stored once in `st` and passed to findall, so the
# variable and the searched string cannot drift apart.
st = 'I am very Happy, Salem  HappyLife'
print(re.findall(r'\b[A-Z]\w*', st))

['I', 'Happy', 'Salem', 'HappyLife']


&nbsp;
### Confusion Matrix

In [9]:
from nltk.metrics import ConfusionMatrix

# Fetch both views of the Brown 'news' sentences: tagged (reference) and untagged (input).
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# Run the regex tagger on the untagged sentence at index 3.
regex_tags = regexp_tagger.tag(brown_sents[3])

# Keep only the tag of each (word, tag) pair: corpus tags as reference,
# tagger output as prediction.
ref_tags = [gold for _, gold in brown_tagged_sents[3]]
pred_tags = [guess for _, guess in regex_tags]

# Rows of the printed matrix are reference tags, columns are predicted tags.
cm = ConfusionMatrix(ref_tags, pred_tags)
print(cm)

     |         B                       |
     |         E           N   V V V   |
     | '     A D C D I J N N R B B B ` |
     | ' , . T Z C T N J N S B D G N ` |
-----+---------------------------------+
  '' |<.>. . . . . . . . 2 . . . . . . |
   , | .<.>. . . . . . . 3 . . . . . . |
   . | . .<.>. . . . . . 1 . . . . . . |
  AT | . . .<.>. . . . . 6 . . . . . . |
BEDZ | . . . .<.>. . . . . 1 . . . . . |
  CC | . . . . .<.>. . . 1 . . . . . . |
  DT | . . . . . .<.>. . . 1 . . . . . |
  IN | . . . . . . .<.>. 4 . . . 1 . . |
  JJ | . . . . . . . .<.>3 . . . . . . |
  NN | . . . . . . . . .<7>. . . . . . |
 NNS | . . . . . . . . . .<2>. . . . . |
  RB | . . . . . . . . . 1 .<.>. . . . |
 VBD | . . . . . . . . . 1 . .<.>. . . |
 VBG | . . . . . . . . . . . . .<.>. . |
 VBN | . . . . . . . . . . . . 1 .<.>. |
  `` | . . . . . . . . . 2 . . . . .<.>|
-----+---------------------------------+
(row = reference; col = test)



&nbsp;
#### Links / References

https://www.mathsisfun.com/numbers/cardinal-ordinal-nominal.html

http://www.nltk.org/book/ch05.html

https://docs.python.org/3/library/re.html#search-vs-match
