In [1]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

In [2]:
sent = '''Professor Tan Eng Chye, NUS Deputy President and Provost, and Professor 
Menahem Ben-Sasson, President of HUJ signed the joint degree agreement at NUS, 
in the presence of Ambassador of Israel to Singapore Her Excellency Amira Arnon 
and about 30 invited guests, on July 03, 2013.
'''

POS Tag followed by NE Chunk

In [3]:
# pos_tag works on a list of tokens rather than on a raw string, so the
# sentence is split into word tokens first and those tokens are then tagged.
sent_tokens = word_tokenize(sent)
sent_pos = pos_tag(sent_tokens)
sent_pos

[('Professor', 'NNP'),
 ('Tan', 'NNP'),
 ('Eng', 'NNP'),
 ('Chye', 'NNP'),
 (',', ','),
 ('NUS', 'NNP'),
 ('Deputy', 'NNP'),
 ('President', 'NNP'),
 ('and', 'CC'),
 ('Provost', 'NNP'),
 (',', ','),
 ('and', 'CC'),
 ('Professor', 'NNP'),
 ('Menahem', 'NNP'),
 ('Ben-Sasson', 'NNP'),
 (',', ','),
 ('President', 'NNP'),
 ('of', 'IN'),
 ('HUJ', 'NNP'),
 ('signed', 'VBD'),
 ('the', 'DT'),
 ('joint', 'JJ'),
 ('degree', 'NN'),
 ('agreement', 'NN'),
 ('at', 'IN'),
 ('NUS', 'NNP'),
 (',', ','),
 ('in', 'IN'),
 ('the', 'DT'),
 ('presence', 'NN'),
 ('of', 'IN'),
 ('Ambassador', 'NNP'),
 ('of', 'IN'),
 ('Israel', 'NNP'),
 ('to', 'TO'),
 ('Singapore', 'NNP'),
 ('Her', 'NNP'),
 ('Excellency', 'NNP'),
 ('Amira', 'NNP'),
 ('Arnon', 'NNP'),
 ('and', 'CC'),
 ('about', 'IN'),
 ('30', 'CD'),
 ('invited', 'JJ'),
 ('guests', 'NNS'),
 (',', ','),
 ('on', 'IN'),
 ('July', 'NNP'),
 ('03', 'CD'),
 (',', ','),
 ('2013', 'CD'),
 ('.', '.')]

In [4]:
# ===== Named-entity recognition with NLTK's built-in chunker =====
# ne_chunk consumes (token, POS tag) pairs, so it is fed the tagger output.
# binary=False (the default) keeps the entity types, e.g. PERSON / GPE.
sent_chunk = ne_chunk(sent_pos, binary=False)
print(sent_chunk)

(S
  Professor/NNP
  Tan/NNP
  Eng/NNP
  Chye/NNP
  ,/,
  (ORGANIZATION NUS/NNP)
  Deputy/NNP
  President/NNP
  and/CC
  (ORGANIZATION Provost/NNP)
  ,/,
  and/CC
  (ORGANIZATION Professor/NNP Menahem/NNP)
  Ben-Sasson/NNP
  ,/,
  President/NNP
  of/IN
  (ORGANIZATION HUJ/NNP)
  signed/VBD
  the/DT
  joint/JJ
  degree/NN
  agreement/NN
  at/IN
  (ORGANIZATION NUS/NNP)
  ,/,
  in/IN
  the/DT
  presence/NN
  of/IN
  (ORGANIZATION Ambassador/NNP)
  of/IN
  (GPE Israel/NNP)
  to/TO
  (GPE Singapore/NNP)
  Her/NNP
  Excellency/NNP
  (PERSON Amira/NNP Arnon/NNP)
  and/CC
  about/IN
  30/CD
  invited/JJ
  guests/NNS
  ,/,
  on/IN
  July/NNP
  03/CD
  ,/,
  2013/CD
  ./.)


In [19]:
# ===== Now try creating your own named entity and noun phrase chunker =====
# Define tag patterns that describe the target phrases, then let
# nltk.RegexpParser chunk the POS-tagged sentence with those patterns.
# Only minimal starter patterns are given here.
#
# Notes on the grammar:
#   NE = named entity, NP = noun phrase (these are the chunk labels).
#   <TAG> matches one token by its POS tag, e.g. <NNP> matches a proper noun.
#   Each pattern is enclosed in curly brackets; a label with more than one
#   pattern lists one {...} per pattern.
#   <JJ>? makes the adjective optional, so {<DT><JJ>?<NN>} covers both
#   {<DT><NN>} and {<DT><JJ><NN>}.
#   Idea for months: build a list of month names, give those tokens their own
#   label, and run the tagging/chunking again.
grammar = r"""
  NE: {<NNP>+}      # chunk sequences of proper nouns
  NP:                 
      {<DT><JJ>?<NN>}     
"""

cp = nltk.RegexpParser(grammar)
print(cp.parse(sent_pos))

(S
  (NE Professor/NNP Tan/NNP Eng/NNP Chye/NNP)
  ,/,
  (NE NUS/NNP Deputy/NNP President/NNP)
  and/CC
  (NE Provost/NNP)
  ,/,
  and/CC
  (NE Professor/NNP Menahem/NNP Ben-Sasson/NNP)
  ,/,
  (NE President/NNP)
  of/IN
  (NE HUJ/NNP)
  signed/VBD
  (NP the/DT joint/JJ degree/NN)
  agreement/NN
  at/IN
  (NE NUS/NNP)
  ,/,
  in/IN
  (NP the/DT presence/NN)
  of/IN
  (NE Ambassador/NNP)
  of/IN
  (NE Israel/NNP)
  to/TO
  (NE Singapore/NNP Her/NNP Excellency/NNP Amira/NNP Arnon/NNP)
  and/CC
  about/IN
  30/CD
  invited/JJ
  guests/NNS
  ,/,
  on/IN
  (NE July/NNP)
  03/CD
  ,/,
  2013/CD
  ./.)


In [None]:
#------------------------------------------------------------------------
# Exercise: modify the above tag patterns to capture the NEs and NPs in the 
# example sentence. 
#-------------------------------------------------------------------------