In [1]:
import spacy

2023-01-15 13:06:56.717446: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-15 13:06:56.717477: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-15 13:06:58.812343: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-01-15 13:06:58.812938: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-01-15 13:06:58.813838: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-15 13:06:58.814299: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties

In [2]:
# Blank "en" pipeline: the following cells only use it for tokenization and
# lexical token attributes (text, is_alpha, is_punct, like_num, like_email).
nlp = spacy.blank("en")

In [3]:
# Tokenize a short text and show one token per line.
doc = nlp("Hello World!")
print("\n".join(token.text for token in doc))

Hello
World
!


In [4]:
# Indexing a Doc with an integer yields a single token (index 1 is the second token).
token = doc[1]
print(token.text)

World


In [5]:
# Slicing a Doc yields a span; the end index is exclusive, so this covers tokens 1 and 2.
span = doc[1:3]
print(span.text)

World!


In [6]:
# Lexical attributes: print one row per attribute, one column per token.
doc = nlp("It costs $5.")
for label, attr in (
    ("Index       ", "i"),
    ("Text:       ", "text"),
    ("is_alpha:   ", "is_alpha"),
    ("is_punct:   ", "is_punct"),
    ("like_num:   ", "like_num"),
):
    print(label, [getattr(token, attr) for token in doc])

Index        [0, 1, 2, 3, 4]
Text:        ['It', 'costs', '$', '5', '.']
is_alpha:    [True, True, False, False, False]
is_punct:    [False, False, False, False, True]
like_num:    [False, False, False, True, False]


In [7]:
# Lexical attributes, this time including like_email: one row per attribute.
doc = nlp("My email adress is max.mustermann@web.de")
for label, attr in (
    ("Index       ", "i"),
    ("Text:       ", "text"),
    ("is_alpha:   ", "is_alpha"),
    ("is_punct:   ", "is_punct"),
    ("like_num:   ", "like_num"),
    ("like_email: ", "like_email"),
):
    print(label, [getattr(token, attr) for token in doc])

Index        [0, 1, 2, 3, 4]
Text:        ['My', 'email', 'adress', 'is', 'max.mustermann@web.de']
is_alpha:    [True, True, True, True, False]
is_punct:    [False, False, False, False, False]
like_num:    [False, False, False, False, False]
like_email:  [False, False, False, False, True]


In [8]:
# Find numbers that are immediately followed by a percent sign.
doc = nlp("The inflation rate is 5.34%. That is 2 percentage points higher than last month (3.34%)")
for token in doc:
    # A number-like token at the very end of the doc has no successor; the
    # bounds check keeps doc[token.i + 1] from raising IndexError in that case.
    if token.like_num and token.i + 1 < len(doc):
        next_token = doc[token.i + 1]
        if next_token.text == "%":
            print("Percentage found:", token)

Percentage found: 5.34
Percentage found: 3.34


## Trained Pipeline

In [9]:
%%capture
# Setup step: download the "en_core_web_sm" pipeline that later cells load with spacy.load().
# %%capture hides the command's output in the notebook.
# NOTE(review): `python` here is whatever is first on PATH, which may not be the
# kernel's interpreter — confirm the package lands in the kernel's environment.
!python -m spacy download en_core_web_sm

In [10]:
# Switch to the trained pipeline and show each token with its part-of-speech tag.
nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza")
for token in doc:
    print(f"{token.text} {token.pos_}")

She PRON
ate VERB
the DET
pizza NOUN


In [11]:
# Per token: text, part-of-speech tag, dependency label, and the text of its head token.
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_} {token.head.text}")

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [12]:
# Show every entity span found in the text together with its label.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


In [13]:
# Print the human-readable description of several labels, one per line.
for label in ("GPE", "NNP", "ORG"):
    print(spacy.explain(label))

Countries, cities, states
noun, proper singular
Companies, agencies, institutions, etc.


In [14]:
# Description of the dependency label "det" (seen on "the" in the parse above).
print(spacy.explain("det"))

determiner


In [15]:
# Description of the dependency label "dobj" (seen on "pizza" in the parse above).
print(spacy.explain("dobj"))

direct object


In [16]:
# Description of the label "intj".
print(spacy.explain("intj"))

interjection


In [17]:
# Description of the label "amod".
print(spacy.explain("amod"))

adjectival modifier


In [18]:
# Description of the dependency label "nsubj" (seen on "She" in the parse above).
print(spacy.explain("nsubj"))

nominal subject


In [20]:
# Description of the label "nmod".
print(spacy.explain("nmod"))

modifier of nominal


## Rule-based matching

Why not just use Regex?

- match on *Doc* objects, not just strings
- match on tokens and token attributes
- use a model's predictions (e.g. match "duck" only if it is a verb, not a noun)
- more flexible: can search texts and other lexical attributes

Match patterns

- lists of dictionaries (each dictionary describes one token; the dictionary keys are names of token attributes)
- e.g. [{"LEMMA": "buy"}, {"POS": "NOUN"}]

In [22]:
from spacy.matcher import Matcher
# Load the trained pipeline again (same "en_core_web_sm" name as in the
# "Trained Pipeline" section) so this section has its own `nlp`.
nlp = spacy.load("en_core_web_sm")
# The matcher is constructed with the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [23]:
# Display the pipeline's Vocab object (the same object that was passed to the Matcher).
nlp.vocab

<spacy.vocab.Vocab at 0x7f3aa5820af0>

In [29]:
# List the text of every entry the matcher's vocabulary currently holds.
for lexeme in matcher.vocab:
    print(lexeme.text)

nuthin
ü.
p.m
Kan
Mar
When's
 
Sept.
c.
Mont.
:-}
12a.m.
e.g
Why's
it
6p.m
Jr.
Who’s
K.
Calif.
e
Ill.
O'clock
o'clock
Mich.
is
:-o
n.
Might
Nov
>.<
he's
it’s
where’s
Wash.
where
:-*
she's
g.
:()
)-:
X
S.C.
Del
Why’s
0.o
must
Goin'
4a.m.
5p.m.
Mass.
co.
━
(-_-)
Ariz
had
0
vs.
x.
><(((*>
11a.m.
-o
When’s
Calif
does
nothin’
’S
Cos
I.e
8-)
Would
do
"
Rev.
’s
N.M.
°c.
b
O.o
might
q.
this's
’’
Goin’
Has
N.J.
pm
ought
Dec
3p.m
<space>
Ore.
10p.m
h.
where's
doin’
What's
he’s
'cos
ä
That's
11a.m
these
1p.m
Tenn
;D
Miss
Ga
Must
9p.m
somethin
Nev.
What
z.
’
there's
'Cause
ü
r
9
Dec.
︵
v.s
Jun.
:-))
;_;
When
D.C.
Have
You
10
Where's
6a.m
o_o
O
'coz
'cause
co
doin
how's
i.e
._.
c’m
Prof.
:’-)
havin'
Co.
Va.
Mt
|
:((
O’clock
Let
Del.
xDD
Prof
i.e.
Ai
lovin'
when’s
Does
Minn
g
12a.m
Who's
-
(
La
11
(;
12p.m.
Mich
n’t
_
Not
1p.m.
Okla.
Okla
what's
□
:P
w
cos
:'-(
Ph.D.
It’s
Va
Nothin’
Ltd.
would
:-/
Rep
Was
;
a.
:)))
k
o.
XD
Ga.
Sha
C
Ought
Sep
>:o
:-)
<3
d
8)
Who
(¬_¬)
Pa.
f
V_V
:(((
need
That’s
]
-8

In [53]:
# Two-token pattern: each token is compared by its lowercased text, so the
# match is case-insensitive.
pattern = [{"LOWER": word} for word in ("iphone", "x")]
matcher.add("IPHONE_PATTERN", [pattern])

In [54]:
# Process a text containing two differently-cased spellings, then run the
# matcher over the resulting Doc.
doc = nlp("Upcoming iPhone X release date leaked. Iphone x")
matches = matcher(doc)

In [55]:
# Each match is a (match_id, start, end) tuple; start/end are token indices
# usable as doc[start:end] (see the next cell).
matches

[(9528407286733565721, 1, 3), (9528407286733565721, 7, 9)]

In [56]:
# Turn each match's token offsets back into a span and print its text.
for match_id, start, end in matches:
    print(doc[start:end].text)

iPhone X
Iphone x


In [37]:
# matching lexical attributes:
# a digit token, then "fifa world cup" in any casing, then a punctuation token
pattern = [
    {"IS_DIGIT": True},
    *({"LOWER": word} for word in ("fifa", "world", "cup")),
    {"IS_PUNCT": True},
]
matcher.add("FIFA_PATTERN", [pattern])
doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

2018 FIFA World Cup:


In [42]:
# A verb whose lemma is "love", directly followed by a noun.
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"POS": "NOUN"}]
matcher.add("ANIMAL_LOVE_PATTERN", [pattern])
doc = nlp("I loved dogs but now I love cats more.")
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end])

loved dogs
love cats


In [39]:
# NOTE: this looks up the *label* "POS", which spaCy describes as
# 'possessive ending' (see output). It does not explain the "POS"
# (part-of-speech) key used in the match patterns above.
spacy.explain("POS")

'possessive ending'

## Operators and Quantifiers

{"OP": "!"} 	Negation: match 0 times 

{"OP": "?"} 	Optional: match 0 or 1 times

{"OP": "+"} 	Match 1 or more times

{"OP": "*"} 	Match 0 or more times

In [44]:
# Forms of "buy", optionally followed by a determiner, followed by a noun.
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}
]
doc = nlp("I bought a smartphone. Now I'm buying apps.")
# Register under its own key: re-using "ANIMAL_LOVE_PATTERN" (as the copied
# cell did) would attach this pattern to the earlier rule, so "buy" matches
# would be reported with the love rule's match ID.
matcher.add("BUY_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end])

bought a smartphone
buying apps
