# Text Processing Pipelines

In [3]:
import spacy

# Load the large English pipeline; `nlp` is the callable that turns text into a Doc.
nlp = spacy.load("en_core_web_lg")

doc = nlp("Hello world!")

# Iterating a Doc yields its tokens; emit each token's text on its own line.
print(*(token.text for token in doc), sep="\n")

Hello
world
!


## Tokenizer

In [5]:
import spacy

nlp = spacy.load("en_core_web_lg")

# Sample paragraph to tokenize. The line breaks inside the triple-quoted
# literal are part of the text, so they come back as whitespace tokens
# (visible as blank lines in the printed output).
sample_text = """Artificial intelligence (AI) is the intelligence of machines or software, 
as opposed to the intelligence of humans or animals. It is also the field of study in 
computer science that develops and studies intelligent machines. AI may also refer to 
the machines themselves. AI technology is widely used throughout industry, government 
and science. Some high-profile applications are: advanced web search engines (e.g., 
Google Search), recommendation systems (used by YouTube, Amazon, and Netflix), understanding 
human speech (such as Siri and Alexa), self-driving cars (e.g., Waymo), generative or 
creative tools (ChatGPT and AI art), and competing at the highest level in strategic 
games (such as chess and Go)"""
doc = nlp(sample_text)

# One token text per output line.
print(*(token.text for token in doc), sep="\n")

Artificial
intelligence
(
AI
)
is
the
intelligence
of
machines
or
software
,


as
opposed
to
the
intelligence
of
humans
or
animals
.
It
is
also
the
field
of
study
in


computer
science
that
develops
and
studies
intelligent
machines
.
AI
may
also
refer
to


the
machines
themselves
.
AI
technology
is
widely
used
throughout
industry
,
government


and
science
.
Some
high
-
profile
applications
are
:
advanced
web
search
engines
(
e.g.
,


Google
Search
)
,
recommendation
systems
(
used
by
YouTube
,
Amazon
,
and
Netflix
)
,
understanding


human
speech
(
such
as
Siri
and
Alexa
)
,
self
-
driving
cars
(
e.g.
,
Waymo
)
,
generative
or


creative
tools
(
ChatGPT
and
AI
art
)
,
and
competing
at
the
highest
level
in
strategic


games
(
such
as
chess
and
Go
)


In [6]:
import spacy

texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

# Components switched off for this pass; the loop below only reads doc.ents.
skipped_components = [
    "tok2vec",
    "tagger",
    "parser",
    "attribute_ruler",
    "lemmatizer",
]

nlp = spacy.load("en_core_web_lg")

# nlp.pipe processes the texts as a stream and yields one Doc per input text.
for doc in nlp.pipe(texts, disable=skipped_components):
    # Print each recognized entity as a (text, label) pair.
    entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
    print(entity_pairs)

[('$9.4 million', 'MONEY'), ('the prior year', 'DATE'), ('$2.7 million', 'MONEY')]
[('twelve billion dollars', 'MONEY'), ('1b', 'MONEY')]


## Lemmatization

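Lemmatization reduces each word to its lemma, i.e. its base (dictionary) form — for example, "integrates" becomes "integrate" and "libraries" becomes "library". It is often a first step toward intent recognition.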

In [7]:
import spacy

nlp = spacy.load("en_core_web_lg")
doc = nlp('this product integrates both libraries for downloading and applying patches')

# Show every token next to the lemma the pipeline assigned to it.
for token in doc:
    print(f"{token.text} {token.lemma_}")

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


In [8]:
import spacy

# The LOWER / LEMMA symbol constants are imported here, but this cell does not
# reference them: the rule below is written with plain string keys.
from spacy.symbols import LOWER, LEMMA

nlp = spacy.load("en_core_web_lg")

# Custom attribute-ruler rule: a token whose lowercase form is "frisco"
# gets its lemma overridden with "San Francisco".
frisco_patterns = [[{"LOWER": "frisco"}]]
frisco_attrs = {"LEMMA": "San Francisco"}
nlp.get_pipe("attribute_ruler").add(frisco_patterns, frisco_attrs)

doc = nlp('I am flying to Frisco')

# Print the whole list at once so the override is visible on the last token.
print([f"token:{t.text} lemma:{t.lemma_}" for t in doc])

['token:I lemma:I', 'token:am lemma:be', 'token:flying lemma:fly', 'token:to lemma:to', 'token:Frisco lemma:San Francisco']


## Tagger

The tagger assigns a part-of-speech (PoS) tag to each token: a coarse-grained label (`pos_`) and a fine-grained one (`tag_`).

In [9]:
import spacy

# The LOWER / LEMMA symbol constants are imported here, but this cell does not
# reference them: the rule below is written with plain string keys.
from spacy.symbols import LOWER, LEMMA

nlp = spacy.load("en_core_web_lg")

# Custom attribute-ruler rule: a token whose lowercase form is "frisco"
# gets its lemma overridden with "San Francisco".
frisco_patterns = [[{"LOWER": "frisco"}]]
frisco_attrs = {"LEMMA": "San Francisco"}
nlp.get_pipe("attribute_ruler").add(frisco_patterns, frisco_attrs)

doc = nlp('I have flown to LA. Now I am flying to Frisco')

# For every token: its text, lemma, coarse part of speech (pos_) and
# fine-grained tag (tag_).
for tok in doc:
    print(f"token:{tok.text} lemma:{tok.lemma_} pos:{tok.pos_} tag:{tok.tag_}")

token:I lemma:I pos:PRON tag:PRP
token:have lemma:have pos:AUX tag:VBP
token:flown lemma:fly pos:VERB tag:VBN
token:to lemma:to pos:ADP tag:IN
token:LA lemma:LA pos:PROPN tag:NNP
token:. lemma:. pos:PUNCT tag:.
token:Now lemma:now pos:ADV tag:RB
token:I lemma:I pos:PRON tag:PRP
token:am lemma:be pos:AUX tag:VBP
token:flying lemma:fly pos:VERB tag:VBG
token:to lemma:to pos:ADP tag:IN
token:Frisco lemma:San Francisco pos:PROPN tag:NNP


In [12]:
import spacy

# The pos_ / tag_ labels that appear in the tagger output, listed in the
# order in which their descriptions are printed.
labels_to_explain = (
    "PRON", "PRP",
    "AUX", "VBP",
    "VBN",
    "ADP", "IN",
    "PROPN", "NNP",
    "PUNCT",
    "ADV", "RB",
    "VERB", "VBG",
)

# spacy.explain returns a short human-readable description for a label.
for label in labels_to_explain:
    print(spacy.explain(label))

pronoun
pronoun, personal
auxiliary
verb, non-3rd person singular present
verb, past participle
adposition
conjunction, subordinating or preposition
proper noun
noun, proper singular
punctuation
adverb
adverb
verb
verb, gerund or present participle


## Dependency Parser

In [13]:
import spacy

# The LOWER / LEMMA symbol constants are imported here, but this cell does not
# reference them: the rule below is written with plain string keys.
from spacy.symbols import LOWER, LEMMA

nlp = spacy.load("en_core_web_lg")

# Custom attribute-ruler rule: a token whose lowercase form is "frisco"
# gets its lemma overridden with "San Francisco".
frisco_patterns = [[{"LOWER": "frisco"}]]
frisco_attrs = {"LEMMA": "San Francisco"}
nlp.get_pipe("attribute_ruler").add(frisco_patterns, frisco_attrs)

doc = nlp('I have flown to LA. Now I am flying to Frisco')

# Pass 1: part-of-speech view (coarse pos_ and fine-grained tag_).
for tok in doc:
    print(f"token:{tok.text} lemma:{tok.lemma_} pos:{tok.pos_} tag:{tok.tag_}")

# Pass 2: same tokens, now with the dependency label instead of the tag.
for tok in doc:
    print(f"token:{tok.text} lemma:{tok.lemma_} pos:{tok.pos_} dep:{tok.dep_}")

# Pass 3: each dependency arc, read as head --dep--> token.
for tok in doc:
    print(f"token head text:{tok.head.text} dependency:{tok.dep_} text:{tok.text}")

# Per sentence, keep only the root token and the objects of prepositions.
kept_deps = ('ROOT', 'pobj')
for sent in doc.sents:
    print([word.text for word in sent if word.dep_ in kept_deps])

token:I lemma:I pos:PRON tag:PRP
token:have lemma:have pos:AUX tag:VBP
token:flown lemma:fly pos:VERB tag:VBN
token:to lemma:to pos:ADP tag:IN
token:LA lemma:LA pos:PROPN tag:NNP
token:. lemma:. pos:PUNCT tag:.
token:Now lemma:now pos:ADV tag:RB
token:I lemma:I pos:PRON tag:PRP
token:am lemma:be pos:AUX tag:VBP
token:flying lemma:fly pos:VERB tag:VBG
token:to lemma:to pos:ADP tag:IN
token:Frisco lemma:San Francisco pos:PROPN tag:NNP
token:I lemma:I pos:PRON dep:nsubj
token:have lemma:have pos:AUX dep:aux
token:flown lemma:fly pos:VERB dep:ROOT
token:to lemma:to pos:ADP dep:prep
token:LA lemma:LA pos:PROPN dep:pobj
token:. lemma:. pos:PUNCT dep:punct
token:Now lemma:now pos:ADV dep:advmod
token:I lemma:I pos:PRON dep:nsubj
token:am lemma:be pos:AUX dep:aux
token:flying lemma:fly pos:VERB dep:ROOT
token:to lemma:to pos:ADP dep:prep
token:Frisco lemma:San Francisco pos:PROPN dep:pobj
token head text:flown dependency:nsubj text:I
token head text:flown dependency:aux text:have
token head te

## NER (Named Entity Recognition)

In [14]:
import spacy

# The LOWER / LEMMA symbol constants are imported here, but this cell does not
# reference them: the rule below is written with plain string keys.
from spacy.symbols import LOWER, LEMMA

nlp = spacy.load("en_core_web_lg")

# Custom attribute-ruler rule: a token whose lowercase form is "frisco"
# gets its lemma overridden with "San Francisco".
frisco_patterns = [[{"LOWER": "frisco"}]]
frisco_attrs = {"LEMMA": "San Francisco"}
nlp.get_pipe("attribute_ruler").add(frisco_patterns, frisco_attrs)

doc = nlp('I have flown to LA. Now I am flying to Frisco')

for token in doc:
    # ent_type is 0 for tokens that are not part of any named entity.
    if not token.ent_type:
        continue
    # Print the lemma (not the raw text) so the lemma override shows up
    # alongside the entity label.
    print(f"{token.lemma_} {token.ent_type_}")

LA GPE
San Francisco GPE
