In [1]:
# https://course.spacy.io/en/chapter1
import spacy
from spacy.matcher import Matcher
# Blank English pipeline: tokenizer and lexical attributes only, no trained components
nlp = spacy.blank("en")
# Small pretrained English pipeline; used where tags, lemmas and dependencies are needed
nlp_s = spacy.load('en_core_web_sm')
# The matcher is built on the vocab of the pipeline whose docs it will be run on
matcher = Matcher(nlp_s.vocab)

### Token properties

In [2]:
# Tokenize the example text (the two adjacent literals form a single string)
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Report every number-like token that is directly followed by a "%" token
for token in doc:
    # Skip tokens that are not number-like, and the final token (nothing follows it)
    if not token.like_num or token.i >= len(doc) - 1:
        continue
    # Look at the token immediately to the right
    next_token = doc[token.i + 1]
    if next_token.text != "%":
        continue
    print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


### Part-of-speech (POS) tagging

In [3]:
doc = nlp_s("She ate the Dominos pizza slowly while her dog watched")

# One row per token:
#   text       - the token's text
#   pos_       - the predicted part-of-speech tag
#   dep_       - the dependency label, i.e. the token's role in the sentence
#   head.text  - the text of the token's syntactic head (the root is its own head)
metadata = [(tok.text, tok.pos_, tok.dep_, tok.head.text) for tok in doc]

# Show each row on its own line
for dat in metadata:
    print(dat)

('She', 'PRON', 'nsubj', 'ate')
('ate', 'VERB', 'ROOT', 'ate')
('the', 'DET', 'det', 'pizza')
('Dominos', 'PROPN', 'compound', 'pizza')
('pizza', 'NOUN', 'dobj', 'ate')
('slowly', 'ADV', 'advmod', 'ate')
('while', 'SCONJ', 'mark', 'watched')
('her', 'PRON', 'poss', 'dog')
('dog', 'NOUN', 'nsubj', 'watched')
('watched', 'VERB', 'advcl', 'ate')


In [4]:
# spacy.explain describes what a part-of-speech abbreviation stands for.
# A plain loop is used instead of a list comprehension: the prints are side
# effects, so there is no list worth building, and nothing is bound to `_`
# (which IPython otherwise uses for the previous cell's output).
for row in metadata:
    # Only the first two fields of each row are needed: token text and POS tag
    text, pos_tag = row[:2]
    print((text, spacy.explain(pos_tag)))

('She', 'pronoun')
('ate', 'verb')
('the', 'determiner')
('Dominos', 'proper noun')
('pizza', 'noun')
('slowly', 'adverb')
('while', 'subordinating conjunction')
('her', 'pronoun')
('dog', 'noun')
('watched', 'verb')


### Matcher: for assigning labels to recurrent patterns

In [5]:
# Matcher demo: a token pattern is a list of dicts, one dict per token.
# LOWER compares against the lower-cased token text, so the match ignores case.
fifa_pattern = [
    {"IS_DIGIT": True},                                       # a token made of digits
    *({"LOWER": word} for word in ("fifa", "world", "cup")),  # the three name tokens
    {"IS_PUNCT": True},                                       # a punctuation token
]

matcher.add('FIFA_PAT', [fifa_pattern])

doc = nlp_s("2018 FIFA World Cup: France won!")
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; doc[start:end] is the matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

2018 FIFA World Cup:


In [6]:
# A single token spec may constrain several attributes of the same token at once.
# The OP key sets how many times a token spec may match:
#   "!"  negation: match 0 times
#   "?"  optional: match 0 or 1 times
#   "+"  match 1 or more times
#   "*"  match 0 or more times
timer_pattern = [
    {'LEMMA': 'time', 'POS': 'VERB'},  # "time" as a verb, to avoid time as a noun
    {'POS': 'DET', 'OP': '?'},         # optional determiner
    {'POS': 'PRON', 'OP': '?'},        # optional pronoun
    {'POS': 'NOUN'},                   # a required noun
]

# Same shared matcher as before, so rules added to it earlier (FIFA_PAT) stay active
matcher.add('TIMER_PAT', [timer_pattern])

doc = nlp_s(
    'She times dogs in her time machine. '
    'I time my oven. '
    'He will time a bird.'
)
matches = matcher(doc)

# Print the text of every matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

times dogs
time my oven
time a bird
