# Advanced Preprocessing in NLP

## Part-of-Speech Tagging, Named Entity Recognition and Parsing

In [1]:
# Report the installed spaCy version, install location, platform, Python version
# and available pipelines by running spaCy's `info` command in a shell.
!python -m spacy info

[1m

spaCy version    3.7.6                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-6.1.85+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.7.1)        



In [2]:
# importing the spacy library and its displacy visualizer
import spacy
from spacy import displacy

In [3]:
# Load the small English pipeline; `nlp` is the callable used to process text below.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Sample text: a quote followed by its attribution.
corpus = (
    "Energy cannot be created or destroyed, "
    "it can only be changed from one form to another. "
    "- Albert Einstein"
)

# Run the loaded pipeline over the text; the resulting Doc object holds the
# tokens that the following cells inspect.
doc = nlp(corpus)

## Part of Speech tagging (POS)

Part-of-speech (POS) tagging assigns each word in a sentence its grammatical category, such as noun, verb, or adjective.

In [5]:
# POS tagging: pair every token's text with its pos_ attribute.
print(
    [(token.text, token.pos_) for token in doc]
)

[('Energy', 'NOUN'), ('can', 'AUX'), ('not', 'PART'), ('be', 'AUX'), ('created', 'VERB'), ('or', 'CCONJ'), ('destroyed', 'VERB'), (',', 'PUNCT'), ('it', 'PRON'), ('can', 'AUX'), ('only', 'ADV'), ('be', 'AUX'), ('changed', 'VERB'), ('from', 'ADP'), ('one', 'NUM'), ('form', 'NOUN'), ('to', 'ADP'), ('another', 'PRON'), ('.', 'PUNCT'), ('-', 'PUNCT'), ('Albert', 'PROPN'), ('Einstein', 'PROPN')]


In [6]:
# The tag_ attribute gives a more detailed tag than pos_, e.g. it encodes verb
# tense or marks a word as a pronoun.
print(
    [(token.text, token.tag_) for token in doc]
)

[('Energy', 'NN'), ('can', 'MD'), ('not', 'RB'), ('be', 'VB'), ('created', 'VBN'), ('or', 'CC'), ('destroyed', 'VBN'), (',', ','), ('it', 'PRP'), ('can', 'MD'), ('only', 'RB'), ('be', 'VB'), ('changed', 'VBN'), ('from', 'IN'), ('one', 'CD'), ('form', 'NN'), ('to', 'IN'), ('another', 'DT'), ('.', '.'), ('-', ':'), ('Albert', 'NNP'), ('Einstein', 'NNP')]


## Named Entity Recognition (NER)

NER detects named entities, which are spans of text that refer to things such as people or numbers, and assigns each one a category label.

In [7]:
# Named entities are exposed through doc.ents; each entity carries its text
# and a category in label_.
print(
    [(entity.text, entity.label_) for entity in doc.ents]
)

[('one', 'CARDINAL'), ('Albert Einstein', 'PERSON')]


## Parsing

To parse a sentence means to break it down into its component parts and the grammatical relationships between them, so that the meaning of the sentence can be understood.

In [8]:
# Visualize the dependency parse with displacy (imported alongside spacy in the
# imports cell at the top of the notebook).
s = "Deadpool is from the Marvel Universe."

# NOTE: this rebinds `doc`, replacing the Einstein sentence parsed earlier;
# the cells below operate on this new sentence.
doc = nlp(s)

# style='dep' draws the dependency arcs between tokens; jupyter=True renders the
# figure inline in the notebook instead of returning the HTML markup.
displacy.render(doc, style='dep', jupyter=True)

In [9]:
# For each token, show its dependency label (dep_) together with the text of its
# head token, which makes it clear how the tokens attach to one another.
print(
    [(token.text, token.dep_, token.head.text) for token in doc]
)

[('Deadpool', 'nsubj', 'is'), ('is', 'ROOT', 'is'), ('from', 'prep', 'is'), ('the', 'det', 'Universe'), ('Marvel', 'compound', 'Universe'), ('Universe', 'pobj', 'from'), ('.', 'punct', 'is')]


In [10]:
# spacy.explain returns a short human-readable description of an annotation
# label; here it is asked about the 'nsubj' dependency label.
spacy.explain("nsubj")

'nominal subject'