### 44 - Niyati Patil

### Chunking by analyzing the importance of selecting proper features

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [2]:
from nltk.chunk import RegexpParser
from nltk.tokenize import word_tokenize

In [3]:
sentence = "Educative Answers is a free web encyclopedia written by devs for devs."

### Tokenization

In [4]:
tokens = word_tokenize(sentence)

In [5]:
tokens

['Educative',
 'Answers',
 'is',
 'a',
 'free',
 'web',
 'encyclopedia',
 'written',
 'by',
 'devs',
 'for',
 'devs',
 '.']

### POS tagging

In [6]:
pos_tags = nltk.pos_tag(tokens)

In [7]:
pos_tags

[('Educative', 'JJ'),
 ('Answers', 'NNPS'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('free', 'JJ'),
 ('web', 'NN'),
 ('encyclopedia', 'NN'),
 ('written', 'VBN'),
 ('by', 'IN'),
 ('devs', 'NN'),
 ('for', 'IN'),
 ('devs', 'NN'),
 ('.', '.')]

### Chunking patterns

In [8]:
chunk_patterns = r"""
    NP: {<DT>?<JJ>*<NN>}  # Chunk noun phrases
    VP: {<VB.*><NP|PP>}  # Chunk verb phrases
"""

In [9]:
chunk_patterns

'\n    NP: {<DT>?<JJ>*<NN>}  # Chunk noun phrases\n    VP: {<VB.*><NP|PP>}  # Chunk verb phrases\n'

### Create a chunk parser

In [10]:
chunk_parser = RegexpParser(chunk_patterns)

In [11]:
chunk_parser

<chunk.RegexpParser with 2 stages>

### Perform chunking

In [12]:
result = chunk_parser.parse(pos_tags)

In [13]:
print(result)

(S
  Educative/JJ
  Answers/NNPS
  (VP is/VBZ (NP a/DT free/JJ web/NN))
  (NP encyclopedia/NN)
  written/VBN
  by/IN
  (NP devs/NN)
  for/IN
  (NP devs/NN)
  ./.)
