# Advanced NLP with spaCy
## Intro to spaCy

In [13]:
# Import the English language class
from spacy.lang.en import English

# Instantiate the language class to obtain the nlp object
nlp = English()

# Run a short sample sentence through the nlp object
sentence = "This is a sentence."
doc = nlp(sentence)

# Show the text held by the resulting document
print(doc.text)

This is a sentence.


In [14]:
# Import the Spanish language class
from spacy.lang.es import Spanish

# Instantiate the language class to obtain the nlp object
nlp = Spanish()

# Run a Spanish sentence through the nlp object
# (the text is Spanish for: "Brother, how are you?")
spanish_text = "Hermano ¿Cómo estás?"
doc = nlp(spanish_text)

# Show the text held by the resulting document
print(doc.text)

Hermano ¿Cómo estás?


In [15]:
# Import the English language class and build the nlp object from it
from spacy.lang.en import English
nlp = English()

# Turn the sample sentence into a Doc
doc = nlp("I like tree kangaroos and narwhals.")

# Token slices of the Doc:
#   [2:4] -> "tree kangaroos"
#   [2:6] -> "tree kangaroos and narwhals" (stops before the final ".")
tree_kangaroos = doc[2:4]
tree_kangaroos_and_narwhals = doc[2:6]

# Print the text of each slice, shortest first
for span in (tree_kangaroos, tree_kangaroos_and_narwhals):
    print(span.text)

tree kangaroos
tree kangaroos and narwhals


In [16]:
# Process the text (uses the `nlp` object created in an earlier cell)
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are.")

# Report every number-like token that is immediately followed by a '%' token
for token in doc:
    # Only tokens that resemble a number are candidates
    if not token.like_num:
        continue
    # A number-like token at the very end of the doc has no successor;
    # indexing doc[token.i + 1] there would raise IndexError, so skip it.
    if token.i + 1 >= len(doc):
        continue
    # Get the next token in the document
    next_token = doc[token.i + 1]
    # Check if the next token's text equals '%'
    if next_token.text == '%':
        print('Percentage found:', token.text)

Percentage found: 60
Percentage found: 4


### Named Entity Recognition (NER)

In [17]:
import spacy

# Load the 'en_core_web_sm' model
nlp = spacy.load('en_core_web_sm')

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Run the loaded model over the text
doc = nlp(text)

# Show the document text, followed by a separator line
print(doc.text)
print('-----------------------------------------------------------------------')

# One left-aligned row per token: text, part-of-speech tag,
# dependency label, and the text of the token's syntactic head
row_template = '{:<12}{:<10}{:<10}{:<10}'
for token in doc:
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    print(row_template.format(token_text, token_pos, token_dep, token.head.text))

print('---------------------------------------------------------------------------')

# List each predicted entity together with its label
for ent in doc.ents:
    print(ent.text, ent.label_)

It’s official: Apple is the first U.S. public company to reach a $1 trillion market value
-----------------------------------------------------------------------
It          PRON      nsubj     ’s        
’s          VERB      ccomp     is        
official    ADJ       acomp     ’s        
:           PUNCT     punct     is        
Apple       PROPN     nsubj     is        
is          AUX       ROOT      is        
the         DET       det       company   
first       ADJ       amod      company   
U.S.        PROPN     nmod      company   
public      ADJ       amod      company   
company     NOUN      attr      is        
to          PART      aux       reach     
reach       VERB      relcl     company   
a           DET       det       value     
$           SYM       quantmod  trillion  
1           NUM       compound  trillion  
trillion    NUM       nummod    value     
market      NOUN      compound  value     
value       NOUN      dobj      reach     
---------------------

In [18]:
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Run the text through the nlp object from the earlier cell
doc = nlp(text)

# List each predicted entity together with its label
for ent in doc.ents:
    print(ent.text, ent.label_)

# "iPhone X" is not among the entities printed above, so take its
# span (tokens 1 and 2) by hand and show its text
iphone_x = doc[1:3]
print('Missing entity:', iphone_x.text)

Apple ORG
Missing entity: iPhone X


In [33]:
# Look up spaCy's descriptions of the 'AUX' tag and the 'ORG' entity label
aux_description = spacy.explain('AUX')
org_description = spacy.explain('ORG')
print('AUX:', aux_description, '///', 'ORG:', org_description)

AUX: auxiliary /// ORG: Companies, agencies, institutions, etc.


In [38]:
# Import the Matcher class
from spacy.matcher import Matcher

# Build a matcher on the vocabulary of the current nlp object
matcher = Matcher(nlp.vocab)

# Two-token pattern: exact text "iPhone" followed by exact text "X"
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]

# Register the pattern under a name
matcher.add('IPHONE_X_PATTERN', [pattern])

# Run the matcher on `doc` (the Doc created in an earlier cell) and
# collect the text of every matched span
matches = matcher(doc)
matched_texts = []
for match_id, start, end in matches:
    matched_texts.append(doc[start:end].text)
print('Matches:', matched_texts)

Matches: ['iPhone X']
