In [1]:
from spacy.lang.en import English

In [2]:
nlp = English()

In [3]:
doc = nlp("Hello, my name is Tiffany!")

In [4]:
for token in doc:
    print(token.text)

Hello
,
my
name
is
Tiffany
!


In [5]:
name = doc[5]
name

Tiffany

In [6]:
my_name = doc[2:4]
my_name

my name

In [7]:
print(my_name.text)

my name


In [8]:
print("Index:  ", [token.i for token in doc])

Index:   [0, 1, 2, 3, 4, 5, 6]


In [9]:
print("Text:  ", [token.text for token in doc])

Text:   ['Hello', ',', 'my', 'name', 'is', 'Tiffany', '!']


In [10]:
print("is_alpha:  ", [token.is_alpha for token in doc])

is_alpha:   [True, False, True, True, True, True, False]


In [11]:
print("is_punct:  ", [token.is_punct for token in doc])

is_punct:   [False, True, False, False, False, False, True]


In [12]:
print("like_num:  ", [token.like_num for token in doc])

like_num:   [False, False, False, False, False, False, False]


In [13]:
doc = nlp("This is a sentence.")
print(doc.text)

This is a sentence.


In [14]:
doc = nlp("I like tree kangaroos and narwhals.")
print(doc.text)

I like tree kangaroos and narwhals.


In [15]:
first_token = doc[0].text
first_token

'I'

In [16]:
for i in range(2,6):
    print(doc[i].text)

tree
kangaroos
and
narwhals


In [17]:
without_loop = doc[2:6].text
without_loop

'tree kangaroos and narwhals'

In [18]:
doc = nlp( "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are.")
doc[0].text

'In'

In [19]:
#if token is like_num, look behind it to check for % symbol
for token in doc:
    if token.like_num:
        next_token =doc[token.i +1] 
        if next_token.text =="%":  
            print('Percentage found:  ',token.text, next_token)

Percentage found:   60 %
Percentage found:   4 %


In [20]:
import spacy
nlp = spacy.load("en_core_web_sm") #small model package


In [21]:
doc = nlp("She ate the pizza")

In [22]:
for token in doc:
    print(token.text, token.pos_) #part of speech

She PRON
ate VERB
the DET
pizza NOUN


In [23]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text) #dep dependencies 
                                                                                        #head is parent token it's attached to

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [24]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [25]:
#iterate over the predicited entities
#print entity text and its label
for ent in doc.ents:
    print(ent.text, ent.label_) 

Apple ORG
U.K. GPE
$1 billion MONEY


In [26]:
#use spacy explain for quick definitions
spacy.explain("GPE")


'Countries, cities, states'

In [27]:
spacy.explain("NNP")

'noun, proper singular'

In [28]:
spacy.explain("nsubj")

'nominal subject'

In [29]:
text = "It's official:  Apple is the first U.S public company to reach a $1 trillion market value"

In [30]:
doc = nlp(text)
doc

It's official:  Apple is the first U.S public company to reach a $1 trillion market value

In [31]:
doc[1]

's

In [32]:
doc.text

"It's official:  Apple is the first U.S public company to reach a $1 trillion market value"

In [33]:
for token in doc:
    print(token.text)

It
's
official
:
 
Apple
is
the
first
U.S
public
company
to
reach
a
$
1
trillion
market
value


In [34]:
doc[3].text

':'

In [35]:
doc[2].text

'official'

In [36]:
for token in doc:
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    print(f"{token_text: <15} {token_pos: <12} {token_dep: <15}")

It              PRON         nsubj          
's              AUX          ROOT           
official        ADJ          acomp          
:               PUNCT        punct          
                SPACE        attr           
Apple           PROPN        nsubj          
is              AUX          ROOT           
the             DET          det            
first           ADJ          amod           
U.S             PROPN        nmod           
public          ADJ          amod           
company         NOUN         attr           
to              PART         aux            
reach           VERB         relcl          
a               DET          det            
$               SYM          quantmod       
1               NUM          compound       
trillion        NUM          nummod         
market          NOUN         compound       
value           NOUN         dobj           


In [37]:
spacy.explain("AUX")

'auxiliary'

In [38]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
first ORDINAL
U.S GPE
$1 trillion MONEY


In [39]:
spacy.explain("GPE")

'Countries, cities, states'

In [40]:
text =  [token.text for token in doc]
text

['It',
 "'s",
 'official',
 ':',
 ' ',
 'Apple',
 'is',
 'the',
 'first',
 'U.S',
 'public',
 'company',
 'to',
 'reach',
 'a',
 '$',
 '1',
 'trillion',
 'market',
 'value']

In [41]:
ent = [ent.text for ent in doc.ents]
ent

['Apple', 'first', 'U.S', '$1 trillion']

In [42]:
ent_labels =  [ent.label_ for ent in doc.ents]
ent_labels

['ORG', 'ORDINAL', 'GPE', 'MONEY']

In [43]:
dict((ent.text, ent.label_) for ent in doc.ents)
    

{'Apple': 'ORG', 'first': 'ORDINAL', 'U.S': 'GPE', '$1 trillion': 'MONEY'}

In [44]:
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

In [45]:
doc = nlp(text)
doc.text

'Upcoming iPhone X release date leaked as Apple reveals pre-orders'

In [46]:
text_list = [token.text for token in doc]

In [47]:
text_list

['Upcoming',
 'iPhone',
 'X',
 'release',
 'date',
 'leaked',
 'as',
 'Apple',
 'reveals',
 'pre',
 '-',
 'orders']

In [48]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG


In [49]:
#span for "iPhone X"
iphone_x = doc[1:3]
iphone_x

iPhone X

In [50]:
iphone_x.text

'iPhone X'

In [51]:
[{"Text": "iphone"}, {"Text": "X"}]

[{'Text': 'iphone'}, {'Text': 'X'}]

The Matcher lets you find words and phrases using rules describing their token attributes. 

Rules can refer to token annotations (like the text or part-of-speech tags), as well as lexical attributes like Token.is_punct. 

Applying the matcher to a Doc gives you access to the matched tokens in context. 

In [52]:
from spacy.matcher import Matcher


In [53]:
# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

In [54]:
doc = nlp("Upcoming iPhone X release date leaked as Apple")

In [55]:
# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

In [56]:
matcher.add("Iphone_x_pattern",[pattern])

In [57]:
# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [58]:
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

In [59]:
# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

In [60]:
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTER",[pattern])

In [61]:
matches = matcher(doc)
print("Total matches found:", len(matches))

Total matches found: 3


In [62]:
#match_id, start_location and end_location in doc
matches

[(16864847348423454655, 24, 26),
 (16864847348423454655, 29, 31),
 (16864847348423454655, 38, 40)]

In [63]:
# Iterate over the matches and print the span text
for match_id, start,end in matches:
    print("Match found: ", doc[start:end].text)

Match found:  iOS 7
Match found:  iOS 11
Match found:  iOS 10


In [64]:
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

In [65]:
# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

In [66]:
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])

In [67]:
matches = matcher(doc)
print("Total matches found: ", len(matches))

Total matches found:  3


In [68]:
for match_id, start, end in matches:
    print("Match found: ", doc[start:end].text)

Match found:  downloaded Fortnite
Match found:  downloading Minecraft
Match found:  download Winzip


In [69]:
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

In [70]:
# Write a pattern for adjective plus one or two nouns
pattern = [{"POS":"ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

In [71]:
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))


Total matches found: 5


In [72]:
for match_id, start, end in matches:
    print("Match found: ", doc[start:end].text)

Match found:  beautiful design
Match found:  smart search
Match found:  automatic labels
Match found:  optional voice
Match found:  optional voice responses


In [73]:
nlp = English()
doc = nlp("I have a cat")

In [74]:
for token in doc:
    print(token.text)

I
have
a
cat


In [75]:
# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings['cat']

In [76]:
print(cat_hash)

5439657043933447811


In [77]:
# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

cat


In [78]:
nlp = English()
doc = nlp("David Bowie is a PERSON")
doc

David Bowie is a PERSON

In [79]:
person_hash = nlp.vocab.strings['PERSON']
print(person_hash)

380


In [80]:
#upper and lower have different hash
person_hash2 = nlp.vocab.strings['person']
print(person_hash2)

14800503047316267216


In [81]:
person_string = nlp.vocab.strings[person_hash]
print(person_string)

PERSON


In [82]:
person_string2 = nlp.vocab.strings[person_hash2]
print(person_string2)

person


In [83]:
from spacy.lang.en import English
from spacy.lang.de import German

In [84]:
# Create an English and German nlp object
nlp = English()
nlp_de = German()

In [85]:
# Get the ID for the string 'Bowie'
bowie_id = nlp.vocab.strings["Bowie"]
print(bowie_id)

2644858412616767388


In [86]:
from spacy.tokens import Doc

In [87]:
#The words and spaces to create the doc from
#spaces indicate whether the word is followed by a space
words = ['Hello', 'world', '!']
spaces = [True, False, False]

In [88]:
#create doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [89]:
doc

Hello world!

In [90]:
#span is slice of doc consist of 1 or more tokens
#span(doc, start, end)
from spacy.tokens import Span

In [91]:
span = Span(doc, 0,2)
span.text

'Hello world'

In [92]:
#create span with label
span_with_label = Span(doc, 0,2, label ='GREETING')
span_with_label

Hello world

In [93]:
span_with_label.label_

'GREETING'

In [94]:
#Add span to doc.ents
doc.ents = [span_with_label]

In [95]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Hello world GREETING


In [96]:
for token in doc:
    print(token.text)

Hello
world
!


In [97]:
indices = [token.i for token in doc]
indices

[0, 1, 2]

In [98]:
token_text =[token.text for token in doc]
token_text

['Hello', 'world', '!']

In [99]:
dict = {}
for (key, value) in enumerate(token_text):
    dict[key] = value

In [100]:
dict

{0: 'Hello', 1: 'world', 2: '!'}

In [101]:
token_dictionary = [(token.text,token.i) for token in doc]
token_dictionary

[('Hello', 0), ('world', 1), ('!', 2)]

In [102]:
# Desired text: "spaCy is cool!"
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

In [103]:
# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

spaCy is cool!


In [104]:
doc_text = [doc.text for text in doc]
doc_text

['spaCy is cool!', 'spaCy is cool!', 'spaCy is cool!', 'spaCy is cool!']

In [105]:
doc_text = [token.text for token in doc]
doc_text

['spaCy', 'is', 'cool', '!']

In [106]:
doc_dict = [(token.text, token.pos_) for token in doc]
doc_dict

[('spaCy', ''), ('is', ''), ('cool', ''), ('!', '')]

In [107]:
entities = [ent.text for ent in doc.ents]
entities

[]

In [108]:
# Desired text: "Go, get started!"
words = ["Go", ",", "get", "started", "!"]
spaces =[False,True, True,False, False]

In [109]:
doc = Doc(nlp.vocab, words =words, spaces = spaces)
doc

Go, get started!

In [110]:
 #Desired text: "Oh, really?!"
words = ['Oh', ',','really','?','!']
spaces =[False, True,False,False, False]

In [111]:
doc = Doc(nlp.vocab, words =words, spaces = spaces)
print(doc.text)

Oh, really?!


In [112]:
words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

In [113]:
doc = Doc(nlp.vocab, words = words, spaces =spaces)
print(doc.text)

I like David Bowie


In [114]:
span = Span(doc, 0,len(words), label="PERSON")

In [115]:
print(span.text, span.label_)

I like David Bowie PERSON


In [116]:
doc.ents = [span]

In [117]:
#print entities text and labels
ent_dictionary =[(ent.text, ent.label_) for ent in doc.ents ]
ent_dictionary

[('I like David Bowie', 'PERSON')]

In [118]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('I like David Bowie', 'PERSON')]


In [119]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

# Get all tokens and part-of-speech tags
token_texts = [token.text for token in doc]
pos_tags = [token.pos_ for token in doc]

In [120]:
for index, pos in enumerate(pos_tags):
    # Check if the current token is a proper noun
    if pos == "PROPN":
        # Check if the next token is a verb
        if pos_tags[index + 1] == "VERB":
            result = token_texts[index]
            print("Found proper noun before a verb:", result)

Found proper noun before a verb: Berlin


In [121]:
pos_tags

['PROPN', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN']

In [122]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

In [123]:
# Get all tokens and part-of-speech tags
token_texts = [token.text for token in doc]
pos_tags = [token.pos_ for token in doc]

In [124]:
token_texts

['Berlin', 'looks', 'like', 'a', 'nice', 'city']

In [125]:
pos_tags

['PROPN', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN']

In [126]:
for token in doc:
    if token.pos_ =='PROPN':
        if doc[token.i +1].pos_=='VERB':
            print("Found proper noun before a verb: ",result)

Found proper noun before a verb:  Berlin


In [127]:
spacy.en_core_web_md.load()

AttributeError: module 'spacy' has no attribute 'en_core_web_md'

In [None]:
#compare to docs
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

In [128]:
#compare two tokens
doc =nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

0.33249244


  """


In [129]:
#compare a doc with a token
doc = nlp("I like pizza")
token = nlp("soap")[0]
print(doc.similarity(token))

0.5977857334990669


  after removing the cwd from sys.path.


In [130]:
#compare a span with a doc
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")
print(span.similarity(doc))

0.1936762129808686


  after removing the cwd from sys.path.


In [131]:
#by default spacy uses cosing similarity but can be changed
doc = nlp("I have a banana")
#access the vector via the token.vector attribute
print(doc[3].vector)

[ 1.537847    0.674669    0.81816936 -0.31876066  0.25718442 -0.07516965
 -1.3219695  -1.1152909  -0.88222015 -1.6240865   1.1105574  -0.5426203
  0.37965146  0.17833772 -0.5565369   1.4447467   0.13430384  0.18257456
  0.0947336  -0.51197755 -0.3939239   0.10124579  0.8465748   0.17065091
 -1.1758325   0.4015198  -0.18181653  0.13884822 -0.37730262  0.27910578
 -1.149448   -1.1830585  -0.1151062  -0.67754376  1.4041657   0.5072225
  0.07550949 -0.41542178  0.56449896  1.2630715  -0.7476871  -0.41521612
 -0.8185774   0.58489347 -0.37713623  0.4662792   0.4889022   0.4801488
  0.7555161  -0.3921852   0.38810757  0.27768224 -0.20140396 -0.5345619
  0.15200213 -0.75167155  0.7840755   0.3110958   0.5446518   0.16671108
 -0.6997187  -0.20940918 -1.0355877  -0.4493441  -1.0111964   0.43608385
 -0.59638846 -0.2180395   0.79975367  0.3287603   0.42465413 -0.54750097
  0.45800146 -1.3746465  -0.1122399  -0.27763984 -0.17026226 -0.34077564
  0.6465534   0.49907914 -0.50817263  0.47405154 -0.040

In [132]:
# Process a text
doc = nlp("Two bananas in pyjamas")

In [133]:
bananas_vector = doc[1].vector
print(bananas_vector)

[ 2.2125378  -0.1723251   0.14110796 -0.5951412   0.2514304   0.8077075
  0.801046   -0.13926005  0.07570773 -0.90173185  0.24337415  0.09483004
 -0.6800895  -0.896325   -0.39681435  0.73215634  0.6143893  -0.41026342
 -0.32692614  0.8910814  -0.68614787  0.05868483 -1.2385864   0.26628703
 -1.2001456   0.37203482  1.3134516   0.5492035   0.78106725 -0.77097523
 -0.96711314 -0.08746776 -0.24575046  0.31911474  0.06433463  0.3386125
 -0.30297637 -0.44616336 -0.64934415  0.6333152  -0.7792538  -1.1432259
 -1.1393194  -0.23468877  0.87287223  0.28144976  0.55673593 -0.99859244
  0.53777367  0.47276667  0.04158917  0.637307   -0.01601145 -0.56206167
  0.7756491  -0.20966733  0.46087623 -0.8117031   1.0722431   1.098269
 -0.30064604 -1.0081534  -0.7756756  -1.5946865  -0.536037    0.13774268
  0.11212985  0.75341666 -1.3371096  -0.64456064  1.218224   -0.37610805
  0.3524229   0.6260698  -0.48317793  0.5585091  -0.11791973 -1.0562748
 -0.06546868 -0.95770586  0.09769988 -0.70195216  0.91102

In [134]:
doc = nlp("TV and books")
token1, token2 = doc[0], doc[2]

In [135]:
# Get the similarity of the tokens "TV" and "books"
similarity = token1.similarity(token2)
print(similarity)

0.35458708


  


In [136]:
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

In [137]:
# Create spans for "great restaurant" and "really nice bar"
span1 = doc[3:5]
span2 = doc[12:15]

In [138]:
similarity = span1.similarity(span2)
print(similarity)

0.6904658


  """Entry point for launching an IPython kernel.


In [139]:
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}]
matcher.add("LOVE_CATS", [pattern])

In [140]:
doc = nlp("I love cats and I'm very very happy")
matches =matcher(doc)

In [141]:
matches

[(9137535031263442622, 1, 3)]

In [142]:
for match_id, start,end in matches:
    print(doc[start:end].text)

love cats


In [143]:
matcher.add("DOG",[[{"LOWER":"golden"}, {"LOWER":"retriever"}]])
doc = nlp("I have a Golden Retriever")

In [144]:
doc.text

'I have a Golden Retriever'

In [145]:
for match_id, start,end in matcher(doc):
    span = doc[start:end]
    print("Matched span: ", span.text)
    print("Root token: ", span.root.text)
    print("Root head token: ", span.root.head.text) 
    print("Previous token: ", doc[start-1].text, doc[start-1].pos_)

Matched span:  Golden Retriever
Root token:  Retriever
Root head token:  have
Previous token:  a DET


In [146]:
from spacy.matcher import PhraseMatcher

In [147]:
matcher = PhraseMatcher(nlp.vocab)

In [148]:
pattern = nlp("Golden Retriever")
matcher.add("Dog",None,pattern)
doc = nlp("I have a Golden Retriever")

In [149]:
for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span: ", span.text)

Matched span:  Golden Retriever


In [150]:
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)


In [151]:
# Create the match patterns
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"}, {"TEXT": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]


In [152]:
# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", [pattern1])
matcher.add("PATTERN2",  [pattern2])

In [153]:
# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [154]:
nlp = spacy.load("en_core_web_sm")

In [183]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [184]:
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7faabb1ea468>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7faab85ab518>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7faab859f730>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7faab859f938>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7faabb2aa548>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7faabb2c13c8>)]


In [199]:
import spacy
from spacy.language import Language

In [205]:
@Language.component("length")
# Define the custom component
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    return doc

In [206]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("length", first = True)
print(nlp.pipe_names)

['length', 'tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [207]:
doc = nlp("This is a sentence.")

This document is 5 tokens long.


In [208]:
print(nlp.pipe_names)

['length', 'tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [209]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span


In [210]:
nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]


In [213]:
@Language.component("animal")
# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc

In [214]:
# Add the component to the pipeline after the "ner" component
nlp.add_pipe("animal", after="ner")
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'animal', 'attribute_ruler', 'lemmatizer']


In [215]:
# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]
