In [1]:
import spacy
from spacy.matcher import Matcher

In [2]:
# Rule-based matching of "iPhone X": start by loading the small English
# pipeline that the following cells use to process the example texts.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Headline used to demonstrate tokenization, NER and the token Matcher.
doc = nlp(
    "Upcoming iPhone X release date leaked "
    "as Apple reveals pre-orders"
)

In [4]:
# Print every token on its own line to see how the headline was split.
for tok in doc:
    print(tok)

Upcoming
iPhone
X
release
date
leaked
as
Apple
reveals
pre
-
orders


In [5]:
# List the named entities the pipeline predicted for the headline.
for entity in doc.ents:
    print(entity)

Apple


In [6]:
# Two consecutive tokens whose exact text is "iPhone" and then "X".
pattern = [
    {"TEXT": "iPhone"},
    {"TEXT": "X"},
]

# Register the pattern under the key IPHONE_X on a matcher sharing nlp's vocab.
matcher = Matcher(nlp.vocab)
matcher.add("IPHONE_X", [pattern])

In [7]:
# Run the matcher over the headline; each result is unpacked below as a
# (match_id, start, end) tuple, with start/end used as token indices into doc.
matches = matcher(doc)
matches

[(5243344123226910127, 1, 3)]

In [8]:
# Show the numeric match ID together with the text of the matched span.
for match in matches:
    match_id, start, end = match
    print(f'Match ID: {match_id}, Matches: {doc[start:end]}')

Match ID: 5243344123226910127, Matches: iPhone X


In [9]:
# Text mentioning several iOS versions, used for the IS_DIGIT pattern below.
ios_text = (
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)
doc2 = nlp(ios_text)

In [10]:
# The token "iOS" followed by a token consisting only of digits (a version).
pattern = [
    {"TEXT": "iOS"},
    {"IS_DIGIT": True},
]

matcher2 = Matcher(nlp.vocab)
matcher2.add("IOS_VERSIONS", [pattern])

In [11]:
# Apply the iOS-version matcher to doc2 and display the raw match tuples.
matches2 = matcher2(doc2)
matches2

[(4824293791889660742, 24, 26),
 (4824293791889660742, 29, 31),
 (4824293791889660742, 38, 40)]

In [12]:
# Print each matched "iOS <number>" span next to its match ID.
for match in matches2:
    match_id, start, end = match
    print(f'Match ID: {match_id}, Matches: {doc2[start:end]}')

Match ID: 4824293791889660742, Matches: iOS 7
Match ID: 4824293791889660742, Matches: iOS 11
Match ID: 4824293791889660742, Matches: iOS 10


In [13]:
# Forum-style text containing several inflections of "download".
download_text = (
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)
doc3 = nlp(download_text)

In [14]:
# Any token whose lemma is "download", followed by a proper noun.
pattern = [
    {"LEMMA": "download"},
    {"POS": "PROPN"},
]

matcher3 = Matcher(nlp.vocab)
matcher3.add("IDENTIFY_TENSES", [pattern])
matches3 = matcher3(doc3)

In [15]:
# Print each "download* <ProperNoun>" span next to its match ID.
for match in matches3:
    match_id, start, end = match
    print(f'Match ID: {match_id}, Matches: {doc3[start:end]}')

Match ID: 7211707697680833343, Matches: downloaded Fortnite
Match ID: 7211707697680833343, Matches: downloading Minecraft
Match ID: 7211707697680833343, Matches: download Winzip


In [16]:
# Sentence with several adjective + noun phrases for the POS pattern below.
features_text = (
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)
doc4 = nlp(features_text)

In [17]:
# An adjective, a noun, and optionally a second noun ('OP': '?'), so both
# "optional voice" and "optional voice responses" are reported as matches.
pattern = [
    {"POS": "ADJ"},
    {"POS": "NOUN"},
    {"POS": "NOUN", "OP": "?"},
]

matcher4 = Matcher(nlp.vocab)
# 'Hkzj' is only the key the pattern is registered under.
matcher4.add("Hkzj", [pattern])
matches4 = matcher4(doc4)

In [18]:
# Print each adjective + noun(s) span next to its match ID.
for match in matches4:
    match_id, start, end = match
    print(f'Match ID: {match_id}, Matches: {doc4[start:end]}')

Match ID: 12861291358532660963, Matches: beautiful design
Match ID: 12861291358532660963, Matches: smart search
Match ID: 12861291358532660963, Matches: automatic labels
Match ID: 12861291358532660963, Matches: optional voice
Match ID: 12861291358532660963, Matches: optional voice responses


In [19]:
# Inspect the class of the matcher object.
type(matcher4)

spacy.matcher.matcher.Matcher

In [20]:
# Inspect the container type returned by calling the matcher.
type(matches4)

list

In [21]:
# Display the raw match tuples for doc4.
matches4

[(12861291358532660963, 6, 8),
 (12861291358532660963, 9, 11),
 (12861291358532660963, 12, 14),
 (12861291358532660963, 15, 17),
 (12861291358532660963, 15, 18)]

In [22]:
from spacy import vocab

In [23]:
# A blank English pipeline is enough to experiment with the string store.
nlp = spacy.blank("en")
doc = nlp("I have a cat")

# Look up the hash of "cat" through the doc's vocab and the pipeline's vocab.
string_hash = doc.vocab.strings["cat"]
string_hashid = nlp.vocab.strings["cat"]

In [24]:
# Show both hashes side by side for comparison.
print(string_hash, string_hashid)

5439657043933447811 5439657043933447811


In [25]:
# Reverse lookup: resolve the hash back to its string. Use the hash computed
# above instead of pasting the literal number, so the cell stays correct if
# the example word changes.
obtain_string = nlp.vocab.strings[string_hashid]
obtain_string

'cat'

In [26]:
# Process a sentence containing the word "PERSON" so it can be looked up in the vocab's string store.
sample_doc = nlp("David Bowie is a PERSON")

In [27]:
# Round trip through the doc's string store: string -> hash -> string.
strings = sample_doc.vocab.strings
person_hashid = strings["PERSON"]
person_string = strings[person_hashid]
print(person_hashid, person_string)

380 PERSON


In [28]:
import spacy

# Create an English and German nlp object
nlp = spacy.blank("en")
nlp_de = spacy.blank("de")

# Get the ID for the string 'Bowie'
bowie_id = nlp.vocab.strings["Bowie"]
print(bowie_id)

# Look up the ID for "Bowie" in the German vocab. The string was only added to
# the English vocab, so this lookup raises KeyError. The error is the point of
# the demonstration, so catch and display it rather than letting it abort a
# Restart & Run All.
try:
    print(nlp_de.vocab.strings[bowie_id])
except KeyError as err:
    print(f"KeyError: {err}")

2644858412616767388


KeyError: "[E018] Can't retrieve string for hash '2644858412616767388'. This usually refers to an issue with the `Vocab` or `StringStore`."

In [29]:
from spacy.tokens import Doc,Span
from spacy.vocab import Vocab

In [30]:
# Build a Doc by hand: one entry in `spaces` per word, True when the word is
# followed by a space.
nlp = spacy.blank("en")
words = "spaCy is cool !".split()
spaces = [True, True, False, False]
doc = Doc(nlp.vocab, spaces=spaces, words=words)

In [31]:
# Display the text reconstructed from the words and spaces.
doc.text

'spaCy is cool!'

In [32]:
# Second hand-built Doc; no space after "Go" so the comma attaches to it.
words2 = "Go , get started !".split()
spaces = [False, True, True, False, False]
doc2 = Doc(nlp.vocab, words=words2, spaces=spaces)
doc2.text

'Go, get started!'

In [33]:
# Third hand-built Doc, with consecutive punctuation tokens at the end.
words3 = ["Oh", ",", "really", "?", "!"]
spaces3 = [False, True, False, False, False]
doc3 = Doc(nlp.vocab, words=words3, spaces=spaces3)
doc3.text

'Oh, really?!'

In [34]:
# Display the entities of the hand-built doc.
doc3.ents

()

In [35]:
# Hand-built Doc for the manual entity example.
words4 = "I like David Bowie".split()
spaces4 = [True, True, True, False]
doc4 = Doc(nlp.vocab, words=words4, spaces=spaces4)
print(doc4.text)

# Label tokens 2-3 ("David Bowie") as PERSON. Creating the Span does not by
# itself register it as an entity, which is why doc4.ents is still empty here.
span = Span(doc4, 2, 4, label="PERSON")
print(span.text, span.label_)
print(doc4.ents)

I like David Bowie
David Bowie PERSON
()


In [36]:
# Register the manually created span as the doc's only entity.
doc4.ents = [span]

In [37]:
# Display the entities now that the span has been assigned.
doc4.ents

(David Bowie,)

In [38]:
# Print the text and label of each entity in the hand-built doc.
for entity in doc4.ents:
    print(entity.text, entity.label_)

David Bowie PERSON


In [39]:
# Switch back to the small English pipeline; the next cells read token.pos_.
nlp = spacy.load("en_core_web_sm")
sample_doc = nlp("Berlin looks like a nice city")

In [40]:
# Print every proper noun that is immediately followed by a verb.
for token in sample_doc:
    if token.pos_ != 'PROPN':
        continue
    next_index = token.i + 1
    # Guard the lookahead: a proper noun at the very end of the doc has no
    # following token, and indexing past the end would raise IndexError.
    if next_index < len(sample_doc) and sample_doc[next_index].pos_ == 'VERB':
        print(token.text)

Berlin


In [41]:
# Show each token next to its coarse part-of-speech tag.
for tok in sample_doc:
    print(tok, tok.pos_)

Berlin PROPN
looks VERB
like ADP
a DET
nice ADJ
city NOUN


In [42]:
# Load the medium English model for the word-vector and similarity examples.
nlp = spacy.load("en_core_web_md")
doc = nlp("Two bananas in pyjamas")

# Token 1 is "bananas"; display its word vector.
bananas = doc[1]
bananas_vector = bananas.vector
bananas_vector

array([-0.6334   ,  0.18981  , -0.53544  , -0.52658  , -0.30001  ,
        0.30559  , -0.49303  ,  0.14636  ,  0.012273 ,  0.96802  ,
        0.0040354,  0.25234  , -0.29864  , -0.014646 , -0.24905  ,
       -0.67125  , -0.053366 ,  0.59426  , -0.068034 ,  0.10315  ,
        0.66759  ,  0.024617 , -0.37548  ,  0.52557  ,  0.054449 ,
       -0.36748  , -0.28013  ,  0.090898 , -0.025687 , -0.5947   ,
       -0.24269  ,  0.28603  ,  0.686    ,  0.29737  ,  0.30422  ,
        0.69032  ,  0.042784 ,  0.023701 , -0.57165  ,  0.70581  ,
       -0.20813  , -0.03204  , -0.12494  , -0.42933  ,  0.31271  ,
        0.30352  ,  0.09421  , -0.15493  ,  0.071356 ,  0.15022  ,
       -0.41792  ,  0.066394 , -0.034546 , -0.45772  ,  0.57177  ,
       -0.82755  , -0.27885  ,  0.71801  , -0.12425  ,  0.18551  ,
        0.41342  , -0.53997  ,  0.55864  , -0.015805 , -0.1074   ,
       -0.29981  , -0.17271  ,  0.27066  ,  0.043996 ,  0.60107  ,
       -0.353    ,  0.6831   ,  0.20703  ,  0.12068  ,  0.2485

In [43]:
# `nlp` is already the en_core_web_md pipeline loaded in the previous cell, so
# it is reused here instead of reloading the same model from disk.
doc1 = nlp("It's a warm summer day")
doc2 = nlp("It's sunny outside")

# Similarity score between the two documents.
print(doc1.similarity(doc2))

0.845685397409251


In [44]:
# Compare two single tokens in both directions; the printed scores are equal.
token1, token2 = nlp("TV")[0], nlp("books")[0]
for left, right in ((token1, token2), (token2, token1)):
    print(left.similarity(right))

0.18317238986492157
0.18317238986492157


In [45]:
doc = nlp(
    "This was a great restaurant. "
    "Afterwards, we went to a really nice bar."
)

# span1 covers tokens 3-4; span2 runs from token 11 to the end of the doc.
span1 = doc[3:5]
span2 = doc[11:]
span_score = span1.similarity(span2)
print(span_score)

0.7609225511550903


In [46]:
nlp = spacy.load("en_core_web_sm")
twitch_text = (
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)
doc = nlp(twitch_text)

# pattern1: "Amazon" followed by a title-cased proper noun.
pattern1 = [
    {"TEXT": "Amazon"},
    {"IS_TITLE": True, "POS": "PROPN"},
]
# pattern2: "ad", "-", "free" (case-insensitive words) followed by a noun.
pattern2 = [
    {"LOWER": "ad"},
    {"TEXT": "-"},
    {"LOWER": "free"},
    {"POS": "NOUN"},
]

# Register each pattern under its own key on a fresh matcher.
matcher = Matcher(nlp.vocab)
for key, token_pattern in (("PATTERN1", pattern1), ("PATTERN2", pattern2)):
    matcher.add(key, [token_pattern])

# Resolve each match ID back to its key and print it with the matched text.
for match_id, start, end in matcher(doc):
    pattern_name = doc.vocab.strings[match_id]
    matched_span = doc[start:end]
    print(pattern_name, matched_span.text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [47]:
import json

In [48]:
# Parse the country names straight from the open file handle.
with open('./countries.json', encoding='utf8') as countries_file:
    COUNTRIES = json.load(countries_file)
type(COUNTRIES)

list

In [49]:
from spacy.matcher import PhraseMatcher
# A blank pipeline is used here; the PhraseMatcher example below works on its
# tokenized output.
nlp = spacy.blank("en")
headline = "Czech Republic may help Slovakia protect its airspace"
doc = nlp(headline)

In [50]:
# One pattern Doc per country name.
pattern = [country_doc for country_doc in nlp.pipe(COUNTRIES)]

# Register all country patterns under a single key and match the headline.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", pattern)
matches = matcher(doc)

In [51]:
# Print the span of text for every country found in the headline.
for match in matches:
    match_id, start, end = match
    print(doc[start:end])

Czech Republic
Slovakia


In [52]:
# Read the full article text that will be searched for country names.
with open('./country_text.txt', encoding='utf8') as text_file:
    TEXT = text_file.read()

In [53]:
# The full pipeline is needed for the article itself, because a later cell
# reads the dependency parse through span.root.head.
nlp = spacy.load("en_core_web_sm")
doc = nlp(TEXT)

matcher = PhraseMatcher(nlp.vocab)
# The PhraseMatcher compares token text by default, so the patterns only need
# to be tokenized. nlp.make_doc skips the tagger/parser/NER that nlp.pipe
# would run on every country name.
patterns = [nlp.make_doc(country) for country in COUNTRIES]
matcher.add("COUNTRY", patterns)

In [54]:
# Display the pattern Docs built from the country list.
patterns

[Afghanistan,
 Åland Islands,
 Albania,
 Algeria,
 American Samoa,
 Andorra,
 Angola,
 Anguilla,
 Antarctica,
 Antigua and Barbuda,
 Argentina,
 Armenia,
 Aruba,
 Australia,
 Austria,
 Azerbaijan,
 Bahamas,
 Bahrain,
 Bangladesh,
 Barbados,
 Belarus,
 Belgium,
 Belize,
 Benin,
 Bermuda,
 Bhutan,
 Bolivia (Plurinational State of),
 Bonaire, Sint Eustatius and Saba,
 Bosnia and Herzegovina,
 Botswana,
 Bouvet Island,
 Brazil,
 British Indian Ocean Territory,
 United States Minor Outlying Islands,
 Virgin Islands (British),
 Virgin Islands (U.S.),
 Brunei Darussalam,
 Bulgaria,
 Burkina Faso,
 Burundi,
 Cambodia,
 Cameroon,
 Canada,
 Cabo Verde,
 Cayman Islands,
 Central African Republic,
 Chad,
 Chile,
 China,
 Christmas Island,
 Cocos (Keeling) Islands,
 Colombia,
 Comoros,
 Congo,
 Congo (Democratic Republic of the),
 Cook Islands,
 Costa Rica,
 Croatia,
 Cuba,
 Curaçao,
 Cyprus,
 Czech Republic,
 Denmark,
 Djibouti,
 Dominica,
 Dominican Republic,
 Ecuador,
 Egypt,
 El Salvador,
 Equa

In [55]:
# Clear the doc's entities so the next cell starts from an empty set.
doc.ents = []

In [56]:
# Turn every country match into a GPE span and print it next to the head of
# its root token, i.e. the word the country attaches to in the parse.
country_spans = []
for match_id, start, end in matcher(doc):
    span = Span(doc, start, end, label="GPE")
    country_spans.append(span)
    span_root_head = span.root.head
    print(span_root_head.text, span.text)

# Assign the entities once instead of rebuilding doc.ents on every iteration.
# doc.ents rejects overlapping spans (e.g. "Sudan" inside "South Sudan"), so
# filter_spans keeps the longest of any overlapping or duplicate spans. That
# also makes the cell safe to re-run.
doc.ents = spacy.util.filter_spans(list(doc.ents) + country_spans)

in Namibia
in South Africa
Africa Cambodia
of Kuwait
as Somalia
Somalia Haiti
Haiti Mozambique
in Somalia
for Rwanda
Britain Singapore
War Sierra Leone
of Afghanistan
invaded Iraq
in Sudan
of Congo
earthquake Haiti


In [57]:
# Display the entities after the country spans were added.
doc.ents

(Namibia,
 South Africa,
 Cambodia,
 Kuwait,
 Somalia,
 Haiti,
 Mozambique,
 Somalia,
 Rwanda,
 Singapore,
 Sierra Leone,
 Afghanistan,
 Iraq,
 Sudan,
 Congo,
 Haiti)