## Finding words, phrases, names and concepts

In [1]:
from spacy.lang.pt import Portuguese

# Instantiate the Portuguese language class; later cells call this object on text
nlp = Portuguese()
# Bare last expression so the notebook displays the object's repr
nlp

<spacy.lang.pt.Portuguese at 0x7f4815690a90>

In [2]:
# Process a text by calling the nlp object on a string; the result is kept in `doc`
doc = nlp("Olá mundo!")

In [3]:
# A Doc is iterable: each step of the loop yields one token,
# and .text gives that token's text.
for word in doc:
    print(word.text)

Olá
mundo
!


In [4]:
# Build the Doc, then pick out a single Token by its position
doc = nlp("Oi mundo!")
token = doc[1]

# The .text attribute holds the token's text
print(token.text)

mundo


In [5]:
doc = nlp("Oi mundo!")

# A slice from the Doc is a Span object.
# The Doc has three tokens ("Oi", "mundo", "!"), so the exclusive end index
# is 3; the previous end of 4 pointed past the last token.
span = doc[1:3]

# Get the span text via the .text attribute
print(span.text)

mundo!


In [6]:
# Lexical attributes of tokens: collect each attribute into a list, then print
# the lists side by side so they can be compared position by position.

doc = nlp("O ingresso custa $50.")

indices = [tok.i for tok in doc]
texts = [tok.text for tok in doc]
alpha_flags = [tok.is_alpha for tok in doc]
punct_flags = [tok.is_punct for tok in doc]
number_flags = [tok.like_num for tok in doc]

print('Index:   ', indices)
print('Text:    ', texts)

print('is_alpha:', alpha_flags)
print('is_punct:', punct_flags)
print('like_num:', number_flags)

Index:    [0, 1, 2, 3, 4, 5]
Text:     ['O', 'ingresso', 'custa', '$', '50', '.']
is_alpha: [True, True, True, False, False, False]
is_punct: [False, False, False, False, False, True]
like_num: [False, False, False, False, True, False]


### Statistical Models

Statistical models enable spaCy to make predictions in context.

In [7]:
import spacy

# Small English model, loaded by package name
nlp = spacy.load('en_core_web_sm')

# Run the pipeline over one sentence
doc = nlp("She ate the pizza")

# One output line per token: its text, then the predicted part-of-speech tag
for word in doc:
    print(word.text, word.pos_)

She PRON
ate VERB
the DET
pizza NOUN


In [8]:
# Small Portuguese model, loaded by package name
nlp = spacy.load('pt_core_news_sm')

# Run the pipeline over a Portuguese sentence
doc = nlp("Ela comeu uma pizza enorme")

# One output line per token: its text, then the predicted part-of-speech tag
for word in doc:
    print(word.text, word.pos_)

Ela PRON
comeu VERB
uma DET
pizza NOUN
enorme ADJ


### Predicting part of speech tags
For each token in the Doc, we can print the text and the `pos_` attribute, which holds the predicted part-of-speech tag.

In spaCy, attributes that return strings usually end with an underscore – attributes without the underscore return an ID.

In [9]:
# Point nlp at the small English model
nlp = spacy.load("en_core_web_sm")

doc = nlp("She ate the pizza")

# pos_ (with the trailing underscore) is the string form of the predicted tag
for word in doc:
    print(word.text, word.pos_)

She PRON
ate VERB
the DET
pizza NOUN


In [10]:
# Point nlp at the small Portuguese model
nlp = spacy.load('pt_core_news_sm')

# This Doc is also what the dependency cell that follows iterates over
doc = nlp("Posso comprar ingresso com cartao de débito")

# Token text followed by its predicted part-of-speech tag
for word in doc:
    print(word.text, word.pos_)

Posso AUX
comprar VERB
ingresso NOUN
com ADP
cartao NOUN
de ADP
débito NOUN


### Predicting Syntactic Dependencies
In addition to the part-of-speech tags, we can also predict how the words are related. For example, whether a word is the subject of the sentence or an object.

The `dep_` attribute returns the predicted dependency label.

The `head` attribute returns the syntactic head token. You can also think of it as the parent token this word is attached to.

In [11]:
# Uses the `doc` left behind by the preceding cell.
# Per token: text, POS tag, dependency label, and the text of its head token.
for word in doc:
    fields = (word.text, word.pos_, word.dep_, word.head.text)
    print(*fields)

Posso AUX aux comprar
comprar VERB ROOT comprar
ingresso NOUN obj comprar
com ADP case cartao
cartao NOUN nmod ingresso
de ADP case débito
débito NOUN nmod cartao


### Predicting Named Entities
Named entities are "real world objects" that are assigned a name – for example, a person, an organization or a country.

The `doc.ents` property lets you access the named entities predicted by the model.

It gives you a sequence of Span objects, so we can loop over it and print the entity text and the entity label using the `label_` attribute.

In [12]:
# Small English model for the entity example
nlp = spacy.load("en_core_web_sm")

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Each predicted entity exposes its text and a string label
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [13]:
# Small Portuguese model for the same entity example
nlp = spacy.load('pt_core_news_sm')

doc = nlp("A empresa Apple está buscando comprar uma empresa na Inglaterra por R$1 bilhão de reais")

# Each predicted entity exposes its text and a string label
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
Inglaterra LOC
R$ PER


### The `spacy.explain` helper
A quick tip: to get definitions for the most common tags and labels, you can use the `spacy.explain` helper function.

In [14]:
# Look up the description of the 'ORG' entity label; the notebook displays the returned string
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

### Predicting Linguistic Annotations

In [15]:
nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

doc = nlp(text)

# Left-aligned columns: token text (12 wide), POS tag (10 wide), dependency label (10 wide)
row_template = "{:<12}{:<10}{:<10}"
for word in doc:
    print(row_template.format(word.text, word.pos_, word.dep_))

It          PRON      nsubj     
’s          VERB      punct     
official    NOUN      ccomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [16]:
# Look up the description of the 'dobj' dependency label seen in the table above
spacy.explain('dobj')

'direct object'

In [17]:
nlp = spacy.load("pt_core_news_sm")

text = "É oficial: A Apple é a primeira empresa americana a atingir o valor de mercado de 1 bilhão de reais"

doc = nlp(text)

# Left-aligned columns: token text (12 wide), POS tag (10 wide), dependency label (10 wide)
row_template = "{:<12}{:<10}{:<10}"
for word in doc:
    print(row_template.format(word.text, word.pos_, word.dep_))

É           VERB      cop       
oficial     ADJ       ROOT      
:           PUNCT     punct     
A           DET       det       
Apple       PROPN     nsubj     
é           VERB      cop       
a           DET       det       
primeira    ADJ       amod      
empresa     NOUN      ROOT      
americana   ADJ       amod      
a           ADP       mark      
atingir     VERB      acl       
o           DET       det       
valor       NOUN      obj       
de          ADP       case      
mercado     NOUN      nmod      
de          ADP       case      
1           NUM       nummod    
bilhão      NOUN      nmod      
de          ADP       case      
reais       NOUN      nmod      


In [18]:
# Look up the description of the 'cop' dependency label seen in the table above
spacy.explain('cop')

'copula'

In [19]:
# Back to the small English model for the entity predictions
nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

doc = nlp(text)

# Print every predicted entity with its label
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


### Predicting named entities in context

In [20]:
nlp = spacy.load("en_core_web_sm")

text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

doc = nlp(text)

# doc.ents yields entity spans, not tokens, so the loop variable is named
# `ent` (matching the other entity cells) instead of the misleading `token`.
for ent in doc.ents:
    print(ent.text, ent.label_)

# Build the span by hand: tokens 1 and 2 are "iPhone" and "X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

New iPhone EVENT
Apple ORG
Missing entity: iPhone X


### Rule based matching
Now we'll take a look at spaCy's matcher, which lets you write rules to find words and phrases in text.

Compared to regular expressions, the matcher works with Doc and Token objects instead of only strings.

It's also more flexible: you can search for texts but also other lexical attributes.

You can even write rules that use the model's predictions.

In [21]:
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

# Two-token pattern: a token whose exact text is "iPhone", then one whose exact text is "X"
pattern = [{'ORTH': 'iPhone'}, {'ORTH': 'X'}]

# The matcher is created from the pipeline's vocab
matcher = Matcher(nlp.vocab)

# Register the pattern under the ID 'IPHONE_PATTERN'
# NOTE(review): this positional (key, None, pattern) form looks like the spaCy v2
# signature, where the second argument is the optional on-match callback; spaCy v3
# expects matcher.add('IPHONE_PATTERN', [pattern]) — confirm the installed version.
matcher.add('IPHONE_PATTERN', None, pattern)

doc = nlp("New iPhone X release date leaked")

# Calling the matcher on a Doc returns the matches
matches = matcher(doc)

In [22]:
# Display the raw result: a list of (match_id, start, end) tuples
matches

[(9528407286733565721, 1, 3)]

In [23]:
doc = nlp("New iPhone X release date leaked")
matches = matcher(doc)

# Each match is (match_id, start, end); slicing the Doc with start:end
# gives the matched span, whose text is printed.
for _match_id, first, stop in matches:
    print(doc[first:stop].text)

iPhone X
