<a href="https://colab.research.google.com/github/nshantha/NLP/blob/main/POS_Tagging_based_on_Heuristics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import necessary libraries

In [None]:
import nltk
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tree import Tree

# Fetch the NLTK data packages used by this notebook (presumably the models
# behind word_tokenize and pos_tag); already-present packages are reported
# as up to date rather than downloaded again.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## A sentence under consideration for Information Extraction (NER)

In [None]:
# Sample text (two sentences) used as input for every later cell.
sentence = (
    'Virat Kohli is an Indian cricketer who currently captains the India national team. '
    'A right-handed top-order batsman, Kohli is regarded as one of the best batsmen in the world'
)

## Apply word tokenization and part-of-speech tagging to the sentence

In [None]:
def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Args:
        sent: Raw text to process.

    Returns:
        The result of ``nltk.pos_tag`` on the tokens: a list of
        ``(token, tag)`` tuples.
    """
    # Tokenize the argument itself. The earlier version read the
    # notebook-level `sentence` variable here, so the parameter was ignored.
    tokens = word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [None]:
# Tokenize and POS-tag the sample sentence; the bare name on the last line
# makes the notebook display the resulting list.
sent = preprocess(sentence)
sent

[('Virat', 'NNP'),
 ('Kohli', 'NNP'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('Indian', 'JJ'),
 ('cricketer', 'NN'),
 ('who', 'WP'),
 ('currently', 'RB'),
 ('captains', 'VBZ'),
 ('the', 'DT'),
 ('India', 'NNP'),
 ('national', 'JJ'),
 ('team', 'NN'),
 ('.', '.'),
 ('A', 'DT'),
 ('right-handed', 'JJ'),
 ('top-order', 'NN'),
 ('batsman', 'NN'),
 (',', ','),
 ('Kohli', 'NNP'),
 ('is', 'VBZ'),
 ('regarded', 'VBN'),
 ('as', 'IN'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('batsmen', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('world', 'NN')]

## Chunk noun phrases and plot the parse tree

In [None]:
# Chunk grammar: an NP is an optional determiner, any number of adjectives,
# then a single NN-tagged noun.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
NPChunker = nltk.RegexpParser(pattern)

# Group the tagged tokens into NP chunks under a root node.
result = NPChunker.parse(sent)

# Rebuild the tree from its bracketed string form before drawing it as text.
bracketed = str(result)
Tree.fromstring(bracketed).pretty_print()


                                                                                                                                  S                                                                                                                                                          
     _____________________________________________________________________________________________________________________________|_______________________________________________________________________________________________________________________________________________            
    |         |       |      |         |            |         |        |      |   |      |       |         |         |     |      |     |       |          |        |             NP                              NP                     NP                        NP             NP         
    |         |       |      |         |            |         |        |      |   |      |       |         |         |     |      |     |    

## IOB chunk tags (CoNLL format)

In [None]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Convert the chunk tree to (word, POS tag, IOB tag) triples. The input must
# be the tree in `result`: passing the flat tagged list `sent` gives every
# token the 'O' tag because a plain list contains no chunk subtrees.
iob_tagged = tree2conlltags(result)
pprint(iob_tagged)

[('Virat', 'NNP', 'O'),
 ('Kohli', 'NNP', 'O'),
 ('is', 'VBZ', 'O'),
 ('an', 'DT', 'O'),
 ('Indian', 'JJ', 'O'),
 ('cricketer', 'NN', 'O'),
 ('who', 'WP', 'O'),
 ('currently', 'RB', 'O'),
 ('captains', 'VBZ', 'O'),
 ('the', 'DT', 'O'),
 ('India', 'NNP', 'O'),
 ('national', 'JJ', 'O'),
 ('team', 'NN', 'O'),
 ('.', '.', 'O'),
 ('A', 'DT', 'O'),
 ('right-handed', 'JJ', 'O'),
 ('top-order', 'NN', 'O'),
 ('batsman', 'NN', 'O'),
 (',', ',', 'O'),
 ('Kohli', 'NNP', 'O'),
 ('is', 'VBZ', 'O'),
 ('regarded', 'VBN', 'O'),
 ('as', 'IN', 'O'),
 ('one', 'CD', 'O'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('best', 'JJS', 'O'),
 ('batsmen', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('world', 'NN', 'O')]
