In [2]:
import sys

In [3]:
# Show which Python interpreter this kernel is running on.
print(sys.executable)

/Users/monty/anaconda3/bin/python


In [29]:
# One-time setup, kept commented out: shell command that installs spaCy with the Anaconda interpreter.
# NOTE(review): inside a notebook `%pip install spacy` is usually preferred, since it targets the running kernel's environment.
#!~/anaconda3/bin/python -m pip install spacy

In [5]:
import spacy

In [30]:
# One-time setup, kept commented out: installs/upgrades the spacy-lookups-data package via pip.
#!~/anaconda3/bin/python -m pip install -U spacy-lookups-data

In [31]:
# One-time setup, kept commented out: downloads the en_core_web_sm model that `spacy.load` uses later in this notebook.
#!~/anaconda3/bin/python -m spacy download en_core_web_sm

In [32]:
# One-time setup, kept commented out: downloads the fr_core_news_sm model.
# NOTE(review): fr_core_news_sm is not loaded anywhere in the visible part of this notebook — confirm it is still needed.
#!~/anaconda3/bin/python -m spacy download fr_core_news_sm

In [9]:
# Report the installed spaCy version (the outputs recorded in this notebook were produced with 2.2.4).
print(spacy.__version__)

2.2.4


# 1. Tokenization

Tokenization is the process of breaking up original text into small units (tokens).

Tokenization does this by locating word boundaries. A word boundary is the point where one word ends and the next word begins.

In [10]:
import spacy  # the spaCy NLP library

# Load the English language model. `nlp` is the model instance whose
# pipeline (tagger, parser, ner) is used to process text in the cells below.
nlp = spacy.load('en_core_web_sm')

In [11]:
# Sample sentence to tokenize
text = 'Apple is looking at buying U.K. startup for $1 billion !'
print(text)

Apple is looking at buying U.K. startup for $1 billion !


In [12]:
# Run the pipeline on the sample sentence to get a Doc, then list its tokens one per line
doc = nlp(text)
print('\n'.join(tok.text for tok in doc))

# Every single unit is a token here, including the dollar symbol, the number and the exclamation mark.
# spaCy isolates punctuation that does not form an integral part of a word:
# quotation marks, commas and similar punctuation are assigned their own tokens.

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion
!


In [13]:
# Punctuation that is part of an email address, a website address or a numerical value
# is NOT split off: it stays inside the token it belongs to.
doc_2 = nlp('Hello all, We are here to help you! email support@udemy.com or visit us at http://www.udemy.com!')
print('\n'.join(tok.text for tok in doc_2))

Hello
all
,
We
are
here
to
help
you
!
email
support@udemy.com
or
visit
us
at
http://www.udemy.com
!


In [14]:
# Tokenize a sentence that contains a measurement and a price
doc_3 = nlp('10km cab ride almost costs $20 in NYC')
print('\n'.join(tok.text for tok in doc_3))

# Here the distance unit (km) and the dollar sign are each assigned their own token

10
km
cab
ride
almost
costs
$
20
in
NYC


In [16]:
# Contractions are split: "Let's" becomes the two tokens "Let" and "'s".
# The surrounding quotation marks and the final period each get their own token as well.
# (By contrast, punctuation inside a known abbreviation stays part of its token,
# as "U.K." did in the earlier example.)

doc_4 = nlp("'Let\'s watch a movie together.'")

for token in doc_4:
    print(token.text)

'
Let
's
watch
a
movie
together
.
'


# 2. Token indexing and slicing

In [19]:
# A plain Python string, for comparison with the spaCy Doc type
mystr = "Hello"
type(mystr)

str

In [17]:
# Just as Python values have types, spaCy objects do too: this one is a Doc (roughly the counterpart of a string)
type(doc_4)

spacy.tokens.doc.Doc

In [18]:
# Number of tokens in the Doc
len(doc_4)

9

In [21]:
# Just like a Python sequence, a Doc can be indexed: index 5 is the sixth token
doc_4[5]

movie

In [23]:
# Slicing works too: this selects the tokens at positions 3, 4 and 5
doc_4[3:6]

watch a movie

In [25]:
# Negative indices count from the end: -2 is the second-to-last token
doc_4[-2]

.

In [26]:
# A Doc does not support item assignment (just like a Python string), so the
# statement below raises TypeError. The error is the point of this cell, so it is
# caught and displayed instead of being left uncaught: an uncaught exception
# would stop "Restart Kernel -> Run All" at this cell.
try:
    doc_4[-2] = "nice day"
except TypeError as err:
    print('TypeError:', err)

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [28]:
# Counting vocab entries: the number of entries in the vocabulary attached to the Doc
len(doc_4.vocab)
# In this run, the vocabulary that came with 'en_core_web_sm' held 511 entries.
# NOTE(review): presumably this count can grow as more text is processed — confirm before quoting it as the model's vocabulary size.

511

# 3. Named Entities

In [33]:
# Named entity recognition draws on what the pre-compiled en_core_web_sm model knows.
# First, process the sentence and list its tokens one per line.
doc_6 = nlp('Apple is looking at buying U.K. startup for $1 billion')
print('\n'.join(tok.text for tok in doc_6))

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [34]:
# List the text of every named entity found in the sentence
for ent in doc_6.ents:
    print(ent.text)
# spaCy recognizes Apple, U.K. and $1 billion as named entities;
# named entities carry more information than plain tokens

Apple
U.K.
$1 billion


In [35]:
# For each entity, show its text, its label and a human-readable explanation of that label
for ent in doc_6.ents:
    label = ent.label_                        # entity label, e.g. ORG
    description = str(spacy.explain(label))   # entity label details
    # the final '\n' argument reproduces the blank lines that separate entities
    print(ent, label, description, '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


U.K.
GPE
Countries, cities, states


$1 billion
MONEY
Monetary values, including unit




# 4. Noun chunks

In [36]:
# Noun chunks are accessed much like doc.ents.
# Put simply, a noun chunk is a noun together with the words that describe it.
doc_7 = nlp("Autonomous cars shift insurance liability toward manufacturers.")

chunks = list(doc_7.noun_chunks)
for chunk in chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


# 5. Built-in visualizers

In [44]:
from spacy import displacy  # spaCy's built-in visualizer

# Process the sentence that the visualizations below are drawn from
viz_sentence = "Apple is looking at buying U.K. startup for $1 billion !"
doc_8 = nlp(viz_sentence)

In [45]:
# Draw the dependency parse; 'distance' is the distance between tokens
dep_options = {'distance': 80}
displacy.render(doc_8, style='dep', jupyter=True, options=dep_options)

In [46]:
# Visualize the entity recognizer's output for the same Doc
displacy.render(doc_8, style='ent', jupyter=True)

In [48]:
# Store a title in the Doc's user_data, then render the entities again.
# NOTE(review): displaCy is expected to show this title above the visualization — confirm against the displaCy docs.
doc_8.user_data.update({"title": "This is a title"})
displacy.render(docs=doc_8, style="ent", jupyter=True)