# Intro

In [61]:
# Import the English language class
from spacy.lang.en import English
# Instantiate the class directly (no spacy.load here); the resulting `nlp`
# object is what the following cells call on raw text to obtain a Doc
nlp = English()

In [66]:
# Display the pipeline's metadata dictionary.
# NOTE(review): the output recorded for this cell reports name 'core_web_sm'
# and trained components, while the cell above builds `nlp` with English()
# rather than spacy.load — presumably this ran while a loaded pipeline was
# still bound to `nlp` in the kernel. Re-run from a fresh kernel to confirm.
nlp.meta

{'lang': 'en',
 'name': 'core_web_sm',
 'version': '3.4.1',
 'description': 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.',
 'author': 'Explosion',
 'email': 'contact@explosion.ai',
 'url': 'https://explosion.ai',
 'license': 'MIT',
 'spacy_version': '>=3.4.0,<3.5.0',
 'spacy_git_version': '2b5f955c2',
 'vectors': {'width': 0,
  'vectors': 0,
  'keys': 0,
  'name': None,
  'mode': 'default'},
 'labels': {'tok2vec': [],
  'tagger': ['$',
   "''",
   ',',
   '-LRB-',
   '-RRB-',
   '.',
   ':',
   'ADD',
   'AFX',
   'CC',
   'CD',
   'DT',
   'EX',
   'FW',
   'HYPH',
   'IN',
   'JJ',
   'JJR',
   'JJS',
   'LS',
   'MD',
   'NFP',
   'NN',
   'NNP',
   'NNPS',
   'NNS',
   'PDT',
   'POS',
   'PRP',
   'PRP$',
   'RB',
   'RBR',
   'RBS',
   'RP',
   'SYM',
   'TO',
   'UH',
   'VB',
   'VBD',
   'VBG',
   'VBN',
   'VBP',
   'VBZ',
   'WDT',
   'WP',
   'WP$',
   'WRB',
   'XX',
   '_SP',
   '``'],
  'parser': ['RO

In [3]:
# Calling the nlp object on a string returns a Doc
doc = nlp("Hello world!")
# A Doc is iterable token by token; emit each token's text on its own line
print(*(token.text for token in doc), sep='\n')

Hello
world
!


In [4]:
doc = nlp("Hello world!")
# Index into the Doc to get a single Token; positions are zero-based,
# so index 1 is the second token
token = doc[1]
# Get the token text via the .text attribute
print(token.text)

world


In [8]:
doc = nlp("Hello world!")
# A slice from the Doc is a Span object; the end index is exclusive,
# so 1:3 covers the tokens at positions 1 and 2
span = doc[1:3]
# Get the span text via the .text attribute
print(span.text)

world!


In [9]:
doc = nlp("It costs $5.")

# (printed label, Token attribute name) pairs, in display order
attribute_rows = [
    ('Index: ', 'i'),
    ('Text: ', 'text'),
    ('is_alpha:', 'is_alpha'),
    ('is_punct:', 'is_punct'),
    ('like_num:', 'like_num'),
]

# One line per attribute, listing its value for every token in the Doc
for label, attribute in attribute_rows:
    print(label, [getattr(token, attribute) for token in doc])

Index:  [0, 1, 2, 3, 4]
Text:  ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


In [10]:
# Import the English language class
from spacy.lang.en import English
# Create the nlp object (rebinds `nlp`, replacing whatever an earlier cell left there)
nlp = English()
# Process a text
doc = nlp("This is a sentence.")
# Print the document text
print(doc.text)

This is a sentence.


In [11]:
# Import the German language class
from spacy.lang.de import German
# Create the nlp object (from here on `nlp` is the German pipeline until a later cell rebinds it)
nlp = German()
# Process a text (this is German for: "Kind regards!")
doc = nlp("Liebe Grüße!")
# Print the document text
print(doc.text)

Liebe Grüße!


In [12]:
# Import the Spanish language class
from spacy.lang.es import Spanish
# Create the nlp object (from here on `nlp` is the Spanish pipeline until a later cell rebinds it)
nlp = Spanish()
# Process a text (this is Spanish for: "How are you?")
doc = nlp("¿Cómo estás?")
# Print the document text
print(doc.text)

¿Cómo estás?


In [16]:
# Import the English language class and create the nlp object
# (rebinds `nlp` to an English pipeline for this exercise)
from spacy.lang.en import English
nlp = English()

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# Select the first token (Doc indices are zero-based)
first_token = doc[0]

# Print the first token's text
print(first_token.text)

I


In [17]:
# Import the English language class and create the nlp object
from spacy.lang.en import English
nlp = English()

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# A slice of the Doc for "tree kangaroos"
# (start index inclusive, end index exclusive: tokens 2 and 3)
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
# (tokens 2 through 5; stopping at 6 leaves the final "." out)
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)

tree kangaroos
tree kangaroos and narwhals


In [15]:
# Process the text
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are.")

# Index of the final token; a number sitting there has no successor to inspect
last_index = len(doc) - 1

# Iterate over the tokens in the doc
for token in doc:
    # Only number-like tokens that are followed by another token are candidates.
    # Without the bounds check, a text ending in a number (e.g. "It costs 5")
    # would make doc[token.i + 1] raise IndexError.
    if token.like_num and token.i < last_index:
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals '%'
        if next_token.text == '%':
            print('Percentage found:', token.text)

Percentage found: 60
Percentage found: 4


# Model Packages

* Binary weights that enable spaCy to make predictions.
* Vocabulary
* Meta information (language, pipeline) to tell spaCy which language class to use and how to configure the processing pipeline.

In [67]:
import spacy
# Load the installed 'en_core_web_sm' package and bind the resulting pipeline to `nlp`
nlp = spacy.load('en_core_web_sm')

In [68]:
# Display the metadata of the pipeline loaded in the previous cell
# (language, package name/version, component list, label sets per component)
nlp.meta

{'lang': 'en',
 'name': 'core_web_sm',
 'version': '3.4.1',
 'description': 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.',
 'author': 'Explosion',
 'email': 'contact@explosion.ai',
 'url': 'https://explosion.ai',
 'license': 'MIT',
 'spacy_version': '>=3.4.0,<3.5.0',
 'spacy_git_version': '2b5f955c2',
 'vectors': {'width': 0,
  'vectors': 0,
  'keys': 0,
  'name': None,
  'mode': 'default'},
 'labels': {'tok2vec': [],
  'tagger': ['$',
   "''",
   ',',
   '-LRB-',
   '-RRB-',
   '.',
   ':',
   'ADD',
   'AFX',
   'CC',
   'CD',
   'DT',
   'EX',
   'FW',
   'HYPH',
   'IN',
   'JJ',
   'JJR',
   'JJS',
   'LS',
   'MD',
   'NFP',
   'NN',
   'NNP',
   'NNPS',
   'NNS',
   'PDT',
   'POS',
   'PRP',
   'PRP$',
   'RB',
   'RBR',
   'RBS',
   'RP',
   'SYM',
   'TO',
   'UH',
   'VB',
   'VBD',
   'VBG',
   'VBN',
   'VBP',
   'VBZ',
   'WDT',
   'WP',
   'WP$',
   'WRB',
   'XX',
   '_SP',
   '``'],
  'parser': ['RO

# Predicting Part-of-speech Tags

In [113]:
import spacy

# Load the small English model and run it over a short sentence
nlp = spacy.load('en_core_web_sm')
doc = nlp("She ate the pizza")

# For every token show its text next to (coarse POS tag, upper-cased explanation)
for token in doc:
    pos = token.pos_
    pos_description = spacy.explain(pos).upper()
    print(token.text, f'{pos, pos_description}')

She ('PRON', 'PRONOUN')
ate ('VERB', 'VERB')
the ('DET', 'DETERMINER')
pizza ('NOUN', 'NOUN')


# Predicting Syntactic Dependencies

In [47]:
# Per token: text, (POS tag, explanation), (dependency label, explanation), head token text
for token in doc:
    pos_info = (token.pos_, spacy.explain(token.pos_).upper())
    dep_info = (token.dep_, spacy.explain(token.dep_).upper())
    # print() renders each tuple with str(), the same text the f-string form produces
    print(token.text, pos_info, dep_info, token.head.text)

She ('PRON', 'PRONOUN') ('nsubj', 'NOMINAL SUBJECT') ate
ate ('VERB', 'VERB') ('ROOT', 'ROOT') ate
the ('DET', 'DETERMINER') ('det', 'DETERMINER') pizza
pizza ('NOUN', 'NOUN') ('dobj', 'DIRECT OBJECT') ate


# Predicting Named Entities

In [48]:
# Run the pipeline over a sentence containing several entity mentions
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

# doc.ents holds the predicted entity spans; show text, [label] and the label's explanation
for ent in doc.ents:
    label = ent.label_
    print(ent.text, f'[{label}]', ':', spacy.explain(label))

Apple [ORG] : Companies, agencies, institutions, etc.
U.K. [GPE] : Countries, cities, states
$1 billion [MONEY] : Monetary values, including unit


# Missing Entities with a token span

In [70]:
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Run the pipeline over the headline
doc = nlp(text)

# List whatever entities the model predicted, as "<text> <label>"
for ent in doc.ents:
    print(f'{ent.text} {ent.label_}')

# "iPhone X" occupies token positions 1 and 2, so slice it out by hand
iphone_x = doc[1:3]

# Show the manually constructed span
print('Missing entity:', iphone_x.text)

Apple ORG
Missing entity: iPhone X


# Rule-based matching

## Using the Matcher

In [95]:
import spacy
# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create the nlp object
nlp = spacy.load('en_core_web_sm')

# One dict per token: exact-text ('ORTH') match on "iPhone" followed by "X"
pattern = [{'ORTH': word} for word in ('iPhone', 'X')]

# The matcher is initialised with the pipeline's shared vocab,
# then the pattern is registered under a name
matcher = Matcher(nlp.vocab)
matcher.add('IPHONE_PATTERN', [pattern])

# Process some text and call the matcher on the resulting doc
doc = nlp("New iPhone X release date leaked")
matches = matcher(doc)

In [96]:
# Each match is a (match_id, start, end) triple; slicing the doc with
# start:end gives the matched span, whose text is printed after the id
for match_id, start, end in matches:
    print(f'id {match_id}:', doc[start:end].text)

id 9528407286733565721: iPhone X


* match_id : hash value of the pattern name
* start : start index of matched span
* end : end index of matched span

## Matching lexical attributes

In [97]:
# Five-token pattern: a digit token, then "fifa" "world" "cup" compared on
# lower-cased text, then a punctuation token
lowercase_words = ('fifa', 'world', 'cup')
pattern = (
    [{'IS_DIGIT': True}]
    + [{'LOWER': word} for word in lowercase_words]
    + [{'IS_PUNCT': True}]
)

# Fresh matcher holding only this pattern
matcher = Matcher(nlp.vocab)
matcher.add('FIFA_PATTERN', [pattern])

doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)

# Print the id and text of every matched span
for match_id, start, end in matches:
    print(f'id {match_id}:', doc[start:end].text)

id 851579294197118795: 2018 FIFA World Cup:


## Matching other token attributes

In [99]:
# First token: a verb whose lemma is "love"; second token: any noun
love_verb = {'LEMMA': 'love', 'POS': 'VERB'}
any_noun = {'POS': 'NOUN'}
pattern = [love_verb, any_noun]

# Fresh matcher holding only this pattern
matcher = Matcher(nlp.vocab)
matcher.add('PETS', [pattern])

doc = nlp("I loved dogs but now I love cats more.")
matches = matcher(doc)

# Print the id and text of every matched span
for match_id, start, end in matches:
    print(f'id {match_id}:', doc[start:end].text)

id 17032343184651558613: loved dogs
id 17032343184651558613: love cats


## Using operators ("OP" key) and quantifiers

    {'OP': '!'} Negation: match 0 times
    {'OP': '?'} Optional: match 0 or 1 times
    {'OP': '+'} Match 1 or more times
    {'OP': '*'} Match 0 or more times

In [100]:
# A form of "buy", then an optional determiner, then a noun
buy_form = {'LEMMA': 'buy'}
optional_determiner = {'POS': 'DET', 'OP': '?'}  # '?' = match 0 or 1 times
noun = {'POS': 'NOUN'}
pattern = [buy_form, optional_determiner, noun]

# Fresh matcher holding only this pattern
matcher = Matcher(nlp.vocab)
matcher.add('OPERATORS', [pattern])

doc = nlp("I bought a smartphone. Now I'm buying apps.")
matches = matcher(doc)

# Print the id and text of every matched span
for match_id, start, end in matches:
    print(f'id {match_id}:', doc[start:end].text)

id 225095504979660923: bought a smartphone
id 225095504979660923: buying apps


In [102]:
# Build a fresh matcher on the shared vocab and register the pattern
matcher = Matcher(nlp.vocab)
pattern = [{'ORTH': 'iPhone'}, {'ORTH': 'X'}]
matcher.add('IPHONE_PATTERN', [pattern])

# Process the text exactly once. The cell previously also parsed a longer
# headline into `doc` first, but that result was overwritten before anything
# read it, so the extra pipeline run was dead work.
doc = nlp("New iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(f'id {match_id}:', matched_span.text)

id 9528407286733565721: iPhone X


In [106]:
# Text of every matched span; the match id is not needed here
[doc[start:end].text for _, start, end in matches]

['iPhone X']

## Writing more complex match patterns

Write one pattern that only matches mentions of the full iOS versions: "iOS 7", "iOS 11" and "iOS 10".

In [109]:
doc = nlp("After making the iOS update you won't notice a radical system-wide redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper.")

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10"):
# the exact token "iOS" followed by a token made of digits
pattern = [{'TEXT': 'iOS'}, {'IS_DIGIT': True}]

# Start from a fresh matcher so that only this pattern is active. Adding to a
# matcher left over from an earlier cell would keep that cell's patterns
# registered too, and re-running this cell would add the pattern again.
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('IOS_VERSION_PATTERN', [pattern])
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


Write one pattern that only matches forms of "download" (tokens with the lemma "download"), followed by a token with the part-of-speech tag 'PROPN' (proper noun).

In [114]:
doc = nlp("i downloaded Fortnite on my laptop and can't open the game at all. Help? so when I was downloading Minecraft, I got the Windows version where it is the '.zip' folder and I used the default program to unpack it... do I also need to download Winzip?")

# Write a pattern that matches a form of "download" plus proper noun:
# a token whose lemma is "download" followed by a token tagged PROPN
pattern = [{'LEMMA': 'download'}, {'POS': 'PROPN'}]

# Start from a fresh matcher so that only this pattern is active. Adding to a
# matcher left over from an earlier cell would keep that cell's patterns
# registered too, and re-running this cell would add the pattern again.
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('DOWNLOAD_THINGS_PATTERN', [pattern])
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


Write one pattern that matches adjectives ('ADJ') followed by one or two 'NOUN's (one noun and one optional noun).

In [116]:
doc = nlp("Features of the app include a beautiful design, smart search, automatic labels and optional voice responses.")

# Write a pattern for adjective plus one or two nouns:
# an ADJ token, a NOUN token, then an optional second NOUN ('?' = 0 or 1 times)
pattern = [{'POS': 'ADJ'}, {'POS': 'NOUN'}, {'POS': 'NOUN', 'OP': '?'}]

# Start from a fresh matcher so that only this pattern is active. Adding to a
# matcher left over from an earlier cell would keep that cell's patterns
# registered too, and re-running this cell would add the pattern again.
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('ADJ_NOUN_PATTERN', [pattern])
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
# (the optional noun yields both the shorter and the longer span when present)
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses
