In [1]:
from spacy.lang.en import English

In [2]:
nlp = English()

In [3]:
doc = nlp("Hello, my name is Tiffany!")

In [4]:
# Iterating over a Doc yields its tokens in order; print each token's text.
for token in doc:
    print(token.text)

Hello
,
my
name
is
Tiffany
!


In [5]:
# Integer indexing picks a single token; index 5 is "Tiffany" in this doc.
name = doc[5]
name

Tiffany

In [6]:
# Slicing a Doc selects a run of tokens; the end index is exclusive,
# so 2:4 covers tokens 2 and 3 ("my name").
my_name = doc[2:4]
my_name

my name

In [7]:
print(my_name.text)

my name


In [8]:
print("Index:  ", [token.i for token in doc])

Index:   [0, 1, 2, 3, 4, 5, 6]


In [9]:
print("Text:  ", [token.text for token in doc])

Text:   ['Hello', ',', 'my', 'name', 'is', 'Tiffany', '!']


In [10]:
print("is_alpha:  ", [token.is_alpha for token in doc])

is_alpha:   [True, False, True, True, True, True, False]


In [11]:
print("is_punct:  ", [token.is_punct for token in doc])

is_punct:   [False, True, False, False, False, False, True]


In [12]:
print("like_num:  ", [token.like_num for token in doc])

like_num:   [False, False, False, False, False, False, False]


In [13]:
doc = nlp("This is a sentence.")
print(doc.text)

This is a sentence.


In [14]:
doc = nlp("I like tree kangaroos and narwhals.")
print(doc.text)

I like tree kangaroos and narwhals.


In [15]:
first_token = doc[0].text
first_token

'I'

In [16]:
# Index-based version: visit positions 2..5 (range's end is exclusive)
# and print the text of the token at each position.
for i in range(2,6):
    print(doc[i].text)

tree
kangaroos
and
narwhals


In [17]:
# Same four tokens without an explicit loop: take the slice 2:6 and read
# its .text, which comes back as one string rather than one line per token.
without_loop = doc[2:6].text
without_loop

'tree kangaroos and narwhals'

In [18]:
# The two adjacent string literals are concatenated by Python into a single
# two-sentence text before being passed to nlp().
doc = nlp( "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are.")
# Echo the first token's text as a quick check of the new doc.
doc[0].text

'In'

In [19]:
# For every token that looks like a number, inspect the token that FOLLOWS it
# and report the pair when that following token is a "%" sign.
for token in doc:
    # A number-like token at the very end of the doc has no successor;
    # indexing doc[token.i + 1] there would raise IndexError, so skip it.
    if token.like_num and token.i + 1 < len(doc):
        next_token = doc[token.i + 1]
        if next_token.text == "%":
            print('Percentage found:  ', token.text, next_token)

Percentage found:   60 %
Percentage found:   4 %


In [21]:
import spacy
# Rebind nlp: it was the blank English() pipeline created at the top of the
# notebook; from here on it is the loaded "en_core_web_sm" package, and later
# cells read token.pos_, token.dep_ and doc.ents from docs it produces.
nlp = spacy.load("en_core_web_sm") #small model package


In [22]:
doc = nlp("She ate the pizza")

In [23]:
for token in doc:
    print(token.text, token.pos_) #part of speech

She PRON
ate VERB
the DET
pizza NOUN


In [35]:
# Build the example doc in this cell so the table below does not depend on a
# later cell having been executed first (on a fresh Run All, `doc` would
# otherwise still be the "She ate the pizza" parse).
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Columns: token text, part-of-speech tag, dependency label, and the text of
# the head token (the parent token this token is attached to).
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

Apple PROPN nsubj looking
is AUX aux looking
looking VERB ROOT looking
at ADP prep looking
buying VERB pcomp at
U.K. PROPN dobj buying
startup NOUN advcl looking
for ADP prep startup
$ SYM quantmod billion
1 NUM compound billion
billion NUM pobj for


In [36]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [37]:
# Iterate over the predicted entities in doc.ents and print each
# entity's text together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_) 

Apple ORG
U.K. GPE
$1 billion MONEY


In [38]:
#use spacy explain for quick definitions
spacy.explain("GPE")


'Countries, cities, states'

In [39]:
spacy.explain("NNP")

'noun, proper singular'

In [40]:
spacy.explain("nsubj")

'nominal subject'

In [41]:
text = "It's official:  Apple is the first U.S public company to reach a $1 trillion market value"

In [43]:
doc = nlp(text)
doc

It's official:  Apple is the first U.S public company to reach a $1 trillion market value

In [44]:
doc[1]

's

In [45]:
doc.text

"It's official:  Apple is the first U.S public company to reach a $1 trillion market value"

In [46]:
for token in doc:
    print(token.text)

It
's
official
:
 
Apple
is
the
first
U.S
public
company
to
reach
a
$
1
trillion
market
value


In [48]:
doc[3].text

':'

In [49]:
doc[2].text

'official'

In [54]:
# Tabulate every token: text, part-of-speech tag and dependency label, each
# left-aligned and space-padded to a fixed column width (15 / 12 / 15).
for token in doc:
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    print(f"{token_text: <15} {token_pos: <12} {token_dep: <15}")

It              PRON         nsubj          
's              AUX          ROOT           
official        ADJ          acomp          
:               PUNCT        punct          
                SPACE        attr           
Apple           PROPN        nsubj          
is              AUX          ROOT           
the             DET          det            
first           ADJ          amod           
U.S             PROPN        nmod           
public          ADJ          amod           
company         NOUN         attr           
to              PART         aux            
reach           VERB         relcl          
a               DET          det            
$               SYM          quantmod       
1               NUM          compound       
trillion        NUM          nummod         
market          NOUN         compound       
value           NOUN         dobj           


In [56]:
spacy.explain("AUX")

'auxiliary'

In [57]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
first ORDINAL
U.S GPE
$1 trillion MONEY


In [58]:
spacy.explain("GPE")

'Countries, cities, states'

In [67]:
text =  [token.text for token in doc]
text

['It',
 "'s",
 'official',
 ':',
 ' ',
 'Apple',
 'is',
 'the',
 'first',
 'U.S',
 'public',
 'company',
 'to',
 'reach',
 'a',
 '$',
 '1',
 'trillion',
 'market',
 'value']

In [70]:
# Collect the text of every entity in doc.ents.
# NOTE(review): after this cell `ent` names a list of strings, whereas the
# earlier loops use `ent` for a single entity — consider a distinct name.
ent = [ent.text for ent in doc.ents]
ent

['Apple', 'first', 'U.S', '$1 trillion']

In [72]:
ent_labels =  [ent.label_ for ent in doc.ents]
ent_labels

['ORG', 'ORDINAL', 'GPE', 'MONEY']

In [74]:
# Map each entity's text to its label; if the same text occurs more than
# once, the last occurrence wins.
{ent.text: ent.label_ for ent in doc.ents}

{'Apple': 'ORG', 'first': 'ORDINAL', 'U.S': 'GPE', '$1 trillion': 'MONEY'}

In [76]:
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

In [79]:
doc = nlp(text)
doc.text

'Upcoming iPhone X release date leaked as Apple reveals pre-orders'

In [80]:
text_list = [token.text for token in doc]

In [81]:
text_list

['Upcoming',
 'iPhone',
 'X',
 'release',
 'date',
 'leaked',
 'as',
 'Apple',
 'reveals',
 'pre',
 '-',
 'orders']

In [82]:
# List the entities found in the headline. Only "Apple" is reported here;
# "iPhone X" is not among doc.ents, which is why the following cells build
# a span for it by hand and then set up a Matcher pattern.
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG


In [83]:
#span for "iPhone X"
iphone_x = doc[1:3]
iphone_x

iPhone X

In [84]:
iphone_x.text

'iPhone X'

In [85]:
# Draft of a two-token pattern: one dict per token to match, in order.
# NOTE(review): the doc's token is 'iPhone' (capital P) while this draft says
# "iphone" — presumably an exact-text match will not fire; confirm the casing.
[{"Text": "iphone"}, {"Text": "X"}]

[{'Text': 'iphone'}, {'Text': 'X'}]

In [91]:
from spacy.matcher import Matcher

In [92]:
doc

Upcoming iPhone X release date leaked as Apple reveals pre-orders

In [93]:
matcher = Matcher(nlp.vocab)

In [95]:
# Two-token pattern for "iPhone X": one dict per token, matched in order.
# TEXT is an exact, case-sensitive comparison against the token text, so the
# value must be spelled 'iPhone' exactly as it appears in the tokenized doc
# ("iphone" would never match). Keys use spaCy's documented uppercase form.
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

In [98]:
# Register the pattern under a match ID. In spaCy v3 the signature is
# Matcher.add(key, patterns): the second positional argument is the LIST of
# patterns. The v2-style call with a `None` callback placeholder in that slot
# is what raised "add() takes exactly 2 positional arguments (3 given)".
matcher.add("Iphone_x_pattern", [pattern])