# POS Tagging with NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize

In [2]:
# Example sentence fed to the NLTK tokenizer and tagger in the cells below.
sentence = (
    "The series depicts the story of three poor sisters who are close, "
    "their involvement in the case of 70 billion won which goes missing"
)

In [8]:
# nltk is already imported in the first cell; importing it again is harmless.
import nltk
# Fetch the NLTK data packages before tokenizing and tagging.
# Presumably 'punkt' backs word_tokenize and 'averaged_perceptron_tagger'
# backs nltk.pos_tag -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [9]:
# Split the sentence into a list of word and punctuation tokens.
token = word_tokenize(sentence)
# Bare trailing expression so the notebook displays the resulting list.
token

['The',
 'series',
 'depicts',
 'the',
 'story',
 'of',
 'three',
 'poor',
 'sisters',
 'who',
 'are',
 'close',
 ',',
 'their',
 'involvement',
 'in',
 'the',
 'case',
 'of',
 '70',
 'billion',
 'won',
 'which',
 'goes',
 'missing']

In [10]:
# Assign a part-of-speech tag to every token; the result is a list of
# (word, tag) pairs.
tagged = nltk.pos_tag(token)
# Bare trailing expression so the notebook displays the tagged pairs.
tagged

[('The', 'DT'),
 ('series', 'NN'),
 ('depicts', 'VBZ'),
 ('the', 'DT'),
 ('story', 'NN'),
 ('of', 'IN'),
 ('three', 'CD'),
 ('poor', 'JJ'),
 ('sisters', 'NNS'),
 ('who', 'WP'),
 ('are', 'VBP'),
 ('close', 'RB'),
 (',', ','),
 ('their', 'PRP$'),
 ('involvement', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('case', 'NN'),
 ('of', 'IN'),
 ('70', 'CD'),
 ('billion', 'CD'),
 ('won', 'VBD'),
 ('which', 'WDT'),
 ('goes', 'VBZ'),
 ('missing', 'VBG')]

In [11]:
# Chunk grammar with a single rule: a noun phrase (NP) is an optional
# determiner, any number of adjectives, then one noun tagged NN.
# Tokens tagged NNS do not match <NN>, so a phrase ending in a plural noun
# (e.g. "three poor sisters") is left unchunked by this rule.
grammar = '''
    NP: {<DT>?<JJ>*<NN>}
'''

In [12]:
# Build a regular-expression chunker from the grammar and apply it to the
# POS-tagged tokens.
chunkParser = nltk.RegexpParser(grammar)
tree = chunkParser.parse(tagged)
# Print every subtree on its own line: the full sentence tree comes first,
# followed by each NP chunk that was found.
print(*tree.subtrees(), sep='\n')

(S
  (NP The/DT series/NN)
  depicts/VBZ
  (NP the/DT story/NN)
  of/IN
  three/CD
  poor/JJ
  sisters/NNS
  who/WP
  are/VBP
  close/RB
  ,/,
  their/PRP$
  (NP involvement/NN)
  in/IN
  (NP the/DT case/NN)
  of/IN
  70/CD
  billion/CD
  won/VBD
  which/WDT
  goes/VBZ
  missing/VBG)
(NP The/DT series/NN)
(NP the/DT story/NN)
(NP involvement/NN)
(NP the/DT case/NN)


## POS Tagging with spaCy

In [16]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
!python -m spacy download en

2022-10-17 06:22:00.811691: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 14.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [18]:
import spacy

In [20]:
# Load the English pipeline package that the download step reports as
# installed ('en_core_web_sm').
nlp = spacy.load('en_core_web_sm')

In [21]:
# Run the spaCy pipeline over the same example sentence.
# NOTE(review): this rebinds `sentence` from the plain string used by the NLTK
# cells to the object returned by nlp(), so re-running word_tokenize(sentence)
# after this cell would no longer receive a string. Consider a distinct name
# (e.g. `doc`) and updating the cells that read it.
sentence = nlp("The series depicts the story of three poor sisters who are close, their involvement in the case of 70 billion won which goes missing")

In [22]:
# Print each token's text, its coarse part-of-speech label (pos_) and its
# fine-grained tag (tag_), separated by spaces.
# The loop variable is intentionally not called `token`: that name holds the
# NLTK token list created earlier, and rebinding it here would break
# nltk.pos_tag(token) if that cell were run again afterwards.
for tok in sentence:
    print(tok.text, tok.pos_, tok.tag_)

The DET DT
series NOUN NN
depicts VERB VBZ
the DET DT
story NOUN NN
of ADP IN
three NUM CD
poor ADJ JJ
sisters NOUN NNS
who PRON WP
are AUX VBP
close ADJ JJ
, PUNCT ,
their PRON PRP$
involvement NOUN NN
in ADP IN
the DET DT
case NOUN NN
of ADP IN
70 NUM CD
billion NUM CD
won NOUN NN
which PRON WDT
goes VERB VBZ
missing VERB VBG


In [24]:
# List every noun chunk found by spaCy together with its label and the text
# of its root token, separated by single spaces.
for chunk in sentence.noun_chunks:
    print(f"{chunk.text} {chunk.label_} {chunk.root.text}")

The series NP series
the story NP story
three poor sisters NP sisters
who NP who
their involvement NP involvement
the case NP case
70 billion won NP won
which NP which


In [26]:
# Dependency parsing: print one line per token in the form
#   text/tag <-- relation <-- head text/head tag
# The loop variable is named `tok` rather than `token` so that the NLTK token
# list stored under `token` is not overwritten when this cell runs.
for tok in sentence:
    print(f"{tok.text}/{tok.tag_} <-- {tok.dep_} <-- {tok.head.text}/{tok.head.tag_}")

The/DT <-- det <-- series/NN
series/NN <-- nsubj <-- depicts/VBZ
depicts/VBZ <-- ROOT <-- depicts/VBZ
the/DT <-- det <-- story/NN
story/NN <-- dobj <-- depicts/VBZ
of/IN <-- prep <-- story/NN
three/CD <-- nummod <-- sisters/NNS
poor/JJ <-- amod <-- sisters/NNS
sisters/NNS <-- pobj <-- of/IN
who/WP <-- nsubj <-- are/VBP
are/VBP <-- relcl <-- sisters/NNS
close/JJ <-- acomp <-- are/VBP
,/, <-- punct <-- depicts/VBZ
their/PRP$ <-- poss <-- involvement/NN
involvement/NN <-- nsubj <-- missing/VBG
in/IN <-- prep <-- involvement/NN
the/DT <-- det <-- case/NN
case/NN <-- pobj <-- in/IN
of/IN <-- prep <-- case/NN
70/CD <-- compound <-- billion/CD
billion/CD <-- nummod <-- won/NN
won/NN <-- pobj <-- of/IN
which/WDT <-- nsubj <-- goes/VBZ
goes/VBZ <-- relcl <-- won/NN
missing/VBG <-- advcl <-- depicts/VBZ


In [28]:
# Dependency parsing visualization
from spacy import displacy

# Draw the dependency arcs inline in the notebook. The 'distance' option
# presumably controls the horizontal spacing between words -- see the
# displaCy options documentation to confirm.
displacy.render(
    sentence,
    style='dep',
    jupyter=True,
    options={'distance': 85},
)