In [1]:
# Chunking: group POS-tagged tokens into noun chunks with NLTK's RegexpParser

# Importing the libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

In [2]:
# Sample text used by the rest of the notebook. It is defined inline as a
# multi-line string (nothing is loaded from disk); the continuation lines keep
# their leading indentation inside the literal.
# NOTE(review): "world’s" uses a typographic apostrophe (U+2019). In the
# recorded tokenizer output it is split into 'world', '’', 's'; the '’' token
# is then tagged NNP and ends up as a chunk of its own. Consider a plain ASCII
# apostrophe if that is not intended.
dataset = """Taj Mahal is one of the world’s most celebrated structures 
          in the world.
          It is a stunning symbol of Indian rich history"""

In [3]:
# Tokenize the data: split the raw text into one flat list of word and
# punctuation tokens (both sentences end up in the same list).
# NOTE(review): word_tokenize relies on NLTK tokenizer data being installed;
# on a fresh environment an nltk.download(...) step may be needed — confirm.
new_data = word_tokenize(dataset)
print(new_data)

['Taj', 'Mahal', 'is', 'one', 'of', 'the', 'world', '’', 's', 'most', 'celebrated', 'structures', 'in', 'the', 'world', '.', 'It', 'is', 'a', 'stunning', 'symbol', 'of', 'Indian', 'rich', 'history']


In [4]:
# Apply POS tagging: pair every token with a part-of-speech tag, giving a list
# of (token, tag) tuples. This list is the input to the chunk parser later on.
# The tags in the recorded output (NNP, VBZ, DT, ...) look like Penn Treebank
# tags — presumably the tagger's default tagset; verify if it matters.
postagging = pos_tag(new_data)
print(postagging)

[('Taj', 'NNP'), ('Mahal', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('world', 'NN'), ('’', 'NNP'), ('s', 'VBZ'), ('most', 'JJS'), ('celebrated', 'JJ'), ('structures', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('stunning', 'JJ'), ('symbol', 'NN'), ('of', 'IN'), ('Indian', 'JJ'), ('rich', 'JJ'), ('history', 'NN')]


In [5]:
# Chunk grammar passed to RegexpParser. All three rules sit under the label
# "chunk", which is the node label that appears in the printed parse tree.
# Each {...} rule groups a run of one or more consecutive tokens with one tag:
#   <NNPS>+  plural proper nouns
#   <NNP>+   singular proper nouns (e.g. Taj/NNP Mahal/NNP become one chunk)
#   <NN>+    singular common nouns
# There is no rule for NNS, so plural common nouns such as "structures" stay
# outside any chunk in the recorded output.
sequence_chunk = """ 
           chunk:
               {<NNPS>+}
               {<NNP>+}
               {<NN>+} """

In [6]:
# Build the chunk parser from the grammar string in sequence_chunk.
# Note: the variable name `chunk` matches the rule label used inside the
# grammar; from here on it refers to the parser object.
chunk = RegexpParser(sequence_chunk)

In [7]:
# Run the chunk parser over the (token, tag) pairs and print the resulting
# tree: the root node is S, every matched run is wrapped in a (chunk ...)
# subtree, and all other tokens remain plain token/TAG leaves.
chunk_result = chunk.parse(postagging)
print(chunk_result)

(S
  (chunk Taj/NNP Mahal/NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  (chunk world/NN)
  (chunk ’/NNP)
  s/VBZ
  most/JJS
  celebrated/JJ
  structures/NNS
  in/IN
  the/DT
  (chunk world/NN)
  ./.
  It/PRP
  is/VBZ
  a/DT
  stunning/JJ
  (chunk symbol/NN)
  of/IN
  Indian/JJ
  rich/JJ
  (chunk history/NN))
