In [1]:
from spacy.lang.en import English

In [2]:
nlp = English()

In [3]:
doc = nlp("Hello, my name is Tiffany!")

In [4]:
for token in doc:
    print(token.text)

Hello
,
my
name
is
Tiffany
!


In [5]:
name = doc[5]
name

Tiffany

In [6]:
my_name = doc[2:4]
my_name

my name

In [7]:
print(my_name.text)

my name


In [8]:
print("Index:  ", [token.i for token in doc])

Index:   [0, 1, 2, 3, 4, 5, 6]


In [9]:
print("Text:  ", [token.text for token in doc])

Text:   ['Hello', ',', 'my', 'name', 'is', 'Tiffany', '!']


In [10]:
print("is_alpha:  ", [token.is_alpha for token in doc])

is_alpha:   [True, False, True, True, True, True, False]


In [11]:
print("is_punct:  ", [token.is_punct for token in doc])

is_punct:   [False, True, False, False, False, False, True]


In [12]:
print("like_num:  ", [token.like_num for token in doc])

like_num:   [False, False, False, False, False, False, False]


In [13]:
doc = nlp("This is a sentence.")
print(doc.text)

This is a sentence.


In [14]:
doc = nlp("I like tree kangaroos and narwhals.")
print(doc.text)

I like tree kangaroos and narwhals.


In [15]:
first_token = doc[0].text
first_token

'I'

In [16]:
for i in range(2,6):
    print(doc[i].text)

tree
kangaroos
and
narwhals


In [17]:
without_loop = doc[2:6].text
without_loop

'tree kangaroos and narwhals'

In [18]:
doc = nlp( "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are.")
doc[0].text

'In'

In [19]:
#if token is like_num, look behind it to check for % symbol
for token in doc:
    if token.like_num:
        next_token =doc[token.i +1] 
        if next_token.text =="%":  
            print('Percentage found:  ',token.text, next_token)

Percentage found:   60 %
Percentage found:   4 %


In [20]:
import spacy
nlp = spacy.load("en_core_web_sm") #small model package


In [21]:
doc = nlp("She ate the pizza")

In [22]:
for token in doc:
    print(token.text, token.pos_) #part of speech

She PRON
ate VERB
the DET
pizza NOUN


In [23]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text) #dep dependencies 
                                                                                        #head is parent token it's attached to

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [24]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [25]:
#iterate over the predicited entities
#print entity text and its label
for ent in doc.ents:
    print(ent.text, ent.label_) 

Apple ORG
U.K. GPE
$1 billion MONEY


In [26]:
#use spacy explain for quick definitions
spacy.explain("GPE")


'Countries, cities, states'

In [27]:
spacy.explain("NNP")

'noun, proper singular'

In [28]:
spacy.explain("nsubj")

'nominal subject'

In [29]:
text = "It's official:  Apple is the first U.S public company to reach a $1 trillion market value"

In [30]:
doc = nlp(text)
doc

It's official:  Apple is the first U.S public company to reach a $1 trillion market value

In [31]:
doc[1]

's

In [32]:
doc.text

"It's official:  Apple is the first U.S public company to reach a $1 trillion market value"

In [33]:
for token in doc:
    print(token.text)

It
's
official
:
 
Apple
is
the
first
U.S
public
company
to
reach
a
$
1
trillion
market
value


In [34]:
doc[3].text

':'

In [35]:
doc[2].text

'official'

In [36]:
for token in doc:
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    print(f"{token_text: <15} {token_pos: <12} {token_dep: <15}")

It              PRON         nsubj          
's              AUX          ROOT           
official        ADJ          acomp          
:               PUNCT        punct          
                SPACE        attr           
Apple           PROPN        nsubj          
is              AUX          ROOT           
the             DET          det            
first           ADJ          amod           
U.S             PROPN        nmod           
public          ADJ          amod           
company         NOUN         attr           
to              PART         aux            
reach           VERB         relcl          
a               DET          det            
$               SYM          quantmod       
1               NUM          compound       
trillion        NUM          nummod         
market          NOUN         compound       
value           NOUN         dobj           


In [37]:
spacy.explain("AUX")

'auxiliary'

In [38]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
first ORDINAL
U.S GPE
$1 trillion MONEY


In [39]:
spacy.explain("GPE")

'Countries, cities, states'

In [40]:
text =  [token.text for token in doc]
text

['It',
 "'s",
 'official',
 ':',
 ' ',
 'Apple',
 'is',
 'the',
 'first',
 'U.S',
 'public',
 'company',
 'to',
 'reach',
 'a',
 '$',
 '1',
 'trillion',
 'market',
 'value']

In [41]:
ent = [ent.text for ent in doc.ents]
ent

['Apple', 'first', 'U.S', '$1 trillion']

In [42]:
ent_labels =  [ent.label_ for ent in doc.ents]
ent_labels

['ORG', 'ORDINAL', 'GPE', 'MONEY']

In [43]:
dict((ent.text, ent.label_) for ent in doc.ents)
    

{'Apple': 'ORG', 'first': 'ORDINAL', 'U.S': 'GPE', '$1 trillion': 'MONEY'}

In [44]:
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

In [45]:
doc = nlp(text)
doc.text

'Upcoming iPhone X release date leaked as Apple reveals pre-orders'

In [46]:
text_list = [token.text for token in doc]

In [47]:
text_list

['Upcoming',
 'iPhone',
 'X',
 'release',
 'date',
 'leaked',
 'as',
 'Apple',
 'reveals',
 'pre',
 '-',
 'orders']

In [48]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG


In [49]:
#span for "iPhone X"
iphone_x = doc[1:3]
iphone_x

iPhone X

In [50]:
iphone_x.text

'iPhone X'

In [51]:
[{"Text": "iphone"}, {"Text": "X"}]

[{'Text': 'iphone'}, {'Text': 'X'}]

The Matcher lets you find words and phrases using rules describing their token attributes. 

Rules can refer to token annotations (like the text or part-of-speech tags), as well as lexical attributes like Token.is_punct. 

Applying the matcher to a Doc gives you access to the matched tokens in context. 

In [74]:
from spacy.matcher import Matcher


In [75]:
# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

In [76]:
doc = nlp("Upcoming iPhone X release date leaked as Apple")

In [80]:
# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

In [81]:
matcher.add("Iphone_x_pattern",[pattern])

In [82]:
# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [83]:
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

In [84]:
# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

In [87]:
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTER",[pattern])

In [92]:
matches = matcher(doc)
print("Total matches found:", len(matches))

Total matches found: 3


In [93]:
matches

[(16864847348423454655, 24, 26),
 (16864847348423454655, 29, 31),
 (16864847348423454655, 38, 40)]

In [94]:
# Iterate over the matches and print the span text
for match_id, start,end in matches:
    print("Match found: ", doc[start:end].text)

Match found:  iOS 7
Match found:  iOS 11
Match found:  iOS 10


In [95]:
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

In [100]:
# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "NNP"}]

In [101]:
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])

In [102]:
matches = matcher(doc)
print("Total matches found: ", len(matches))

Total matches found:  0
