In [None]:
# You can now load the package via spacy.load('en_core_web_sm')

In [1]:
import spacy

In [2]:
# load model
nlp = spacy.load("en_core_web_sm")

In [3]:
# Create doc object
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million.')

In [5]:
for token in doc:
    print(token.text)

Tesla
is
looking
at
buying
U.S.
startup
for
$
6
million
.


In [6]:
# pos - Part of Speech - each of these numbers actually corresponds with a part of speech,
# like an adverb, a verb, a noun, a conjugation, etcetera.
for token in doc:
    print(token.text, token.pos)

Tesla 96
is 87
looking 100
at 85
buying 100
U.S. 96
startup 92
for 85
$ 99
6 93
million 93
. 97


In [7]:
# To get the raw name instead of numbers as seen above, we can do the following:
for token in doc:
    print(token.text, token.pos_)

Tesla PROPN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM
. PUNCT


In [8]:
# DEP stands for syntactic dependency
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj
. PUNCT punct


In [9]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x20177819280>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x20177819e80>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x201775af6d0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2017789c680>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2017788bc00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x201775af4a0>)]

In [10]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [11]:
# Tokens

In [12]:
doc2 = nlp(u"Tesla isn't looking into startups anymore")

In [13]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod


In [14]:
doc2[0]

Tesla

In [15]:
doc2[0].pos_

'PROPN'

In [16]:
type(doc2[0].pos_)

str

In [18]:
doc2[0].dep_

'nsubj'

|Tag|Description|doc2[0].tag|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Is the token an alpha character?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

___
## Spans
Large Doc objects can be hard to work with at times. A **span** is a slice of Doc object in the form `Doc[start:stop]`.

In [19]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [23]:
doc3[2]

attributed

In [20]:
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [21]:
type(life_quote)

spacy.tokens.span.Span

In [22]:
type(doc3)

spacy.tokens.doc.Doc

In [24]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [25]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [26]:
# Is word start of the sentence?
doc4[5].is_sent_start

False

Tokenization: 

Tokenization is the process of breaking up the original raw text into component pieces otherwise known as tokens.

-  **Prefix**:	Character(s) at the beginning &#9656; `$ ( “ ¿`
-  **Suffix**:	Character(s) at the end &#9656; `km ) , . ! ”`
-  **Infix**:	Character(s) in between &#9656; `- -- / ...`
-  **Exception**: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied &#9656; `St. U.S.`

In [27]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [28]:
mystring = '"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [29]:
doc = nlp(mystring)

In [30]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [31]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at https://www.oursite.com!")

In [32]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
https://www.oursite.com
!


In [33]:
doc3 = nlp(u"A 5km NYC cab ride cost $10.30")

In [34]:
for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
cost
$
10.30


In [35]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [36]:
for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [37]:
len(doc4)

11

In [38]:
# vocab entries
doc4.vocab

<spacy.vocab.Vocab at 0x20177841f70>

In [39]:
len(doc4.vocab)

794

In [40]:
doc5 = nlp(u"It is better to give than receive. ")

In [41]:
doc5[0]

It

In [42]:
doc5[2:5]

better to give

In [43]:
# connot reassign
# doc5[0] = "test"

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [44]:
doc8 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [47]:
for token in doc8:
    print(token.text, end=" | ")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [50]:
for entity in doc8.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print("\n")

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [51]:
# Noun chunks

doc9 = nlp(u"Autonomous cars shift insurance liability towards manufactures.")

In [52]:
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufactures


In [53]:
# Visualize tokens
from spacy import displacy

In [54]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [56]:
displacy.render(doc, style="dep", jupyter=True, options={'distance':110})

In [57]:
doc2 = nlp(u"Over the last quarter ApPle sold nearly 20 thousand iPods for a profit of $6 million.")

In [58]:
displacy.render(doc2, style="ent",
                jupyter=True, options={'distance':110})

In [59]:
# For .py scripts
doc3 = nlp(u"This is a sentence")
displacy.serve(doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


Stemming

In [60]:
import nltk

In [61]:
from nltk.stem.porter import PorterStemmer

In [73]:
p_stemmer = PorterStemmer()

In [74]:
words = ["run", "runner", "ran", "runs", "easily", "fairly", "fairness"]

In [75]:
for word in words:
    print(word + " ------> " + p_stemmer.stem(word))

run ------> run
runner ------> runner
ran ------> ran
runs ------> run
easily ------> easili
fairly ------> fairli
fairness ------> fair


In [76]:
from nltk.stem.snowball import SnowballStemmer

In [77]:
s_stemmer = SnowballStemmer(language="english")

In [78]:
for word in words:
    print(word + " -------> " + s_stemmer.stem(word))

run -------> run
runner -------> runner
ran -------> ran
runs -------> run
easily -------> easili
fairly -------> fair
fairness -------> fair


In [79]:
words = ["generous", "generation", "generously", "generate"]

In [80]:
for word in words:
    print(word + " -------> " + s_stemmer.stem(word))

generous -------> generous
generation -------> generat
generously -------> generous
generate -------> generat


Lemmatization

Lemmatization is typically seen as much more informative than simple stemming, which is why the spaCy library
has opted to only have lemmatization available instead of simple stemming.

lemmatization is a more informative way of reducing down words to, really, their true roots.

And it's also gonna take into account the way the words are actually being used in the sentence.

In contrast to stemming, lemmatization looks beyond word reduction and considers a language's full vocabulary 
to apply a morphological analysis towards.

The lemma of was is be, and the lemma of mice is mouse. So we're not just shortening words or cutting off the end of them.
Instead, we're looking at the full context of the word.

In [81]:
import spacy

In [82]:
nlp = spacy.load("en_core_web_sm")

In [83]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today.")

In [84]:
for token in doc1:
    print(token.text, "\t", token.pos_, "\t", token.lemma, "\t", token.lemma_)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [85]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [86]:
show_lemmas(doc1)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   4690420944186131903    I
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
I            PRON   4690420944186131903    I
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


In [93]:
doc2 = nlp(u"I saw 10 ten mice today!")

In [94]:
show_lemmas(doc2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
10           NUM    6572986864102252890    10
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


lemmatization is a more informative way of reducing down words to, really, their true roots.

And it's also gonna take into account the way the words are actually being used in the sentence.

**Stop Words**

In [95]:
import spacy

In [96]:
nlp = spacy.load("en_core_web_sm")

In [97]:
print(nlp.Defaults.stop_words)

{'top', 'mostly', 'an', 'full', 'were', 'something', 'five', 'sometime', 'n’t', 'four', 'besides', 'out', 'whose', 'whence', 'wherever', 'them', 'back', 'for', 'therefore', 'hereby', 'must', 'among', 'again', 'well', 'sometimes', 'ourselves', 'under', 'through', 'whether', 'whatever', 'would', 'nothing', 'been', 'whereafter', 'even', 'you', 'before', 'formerly', 'along', 'made', 'most', 'into', 'could', 'however', 'together', 'moreover', 'wherein', 'becomes', 'with', 'enough', 'everyone', 'upon', 'yours', 'same', 'our', 'six', 'quite', 'while', 'within', 'part', 'go', 'amongst', 'some', 'hence', 'over', 'namely', 'last', 'yet', 'one', 'since', "'re", 'beyond', 'had', '‘ve', 'other', 'any', 'than', 'whoever', 'we', 'using', 'former', 'me', 'which', "'s", 'is', 'ours', 'due', 'does', "'m", 'or', 'every', 'was', 'after', 'whenever', 'really', 'thereby', 'elsewhere', 'no', 'my', 'his', 'ten', 'sixty', 'anything', 'except', 'in', 'how', 'herself', 'still', 'to', 'can', 'doing', 'that', 'a',

In [98]:
len(nlp.Defaults.stop_words)

326

In [99]:
nlp.vocab['is']

<spacy.lexeme.Lexeme at 0x201004cc200>

In [100]:
nlp.vocab['is'].is_stop

True

In [101]:
nlp.vocab["mystery"].is_stop

False

In [102]:
nlp.Defaults.stop_words.add("btw")

In [103]:
nlp.vocab["btw"].is_stop = True

In [104]:
len(nlp.Defaults.stop_words)

327

In [105]:
nlp.vocab["btw"].is_stop

True

In [106]:
# to remove a stopword
nlp.Defaults.stop_words.remove("beyond")

In [107]:
nlp.vocab["beyond"].is_stop = False

In [108]:
nlp.vocab['beyond'].is_stop

False

Vocaulary Matching

In [109]:
import spacy

## Other token attributes
Besides lemmas, there are a variety of token attributes we can use to determine matching rules:
<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphanumeric characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

In [110]:
nlp = spacy.load("en_core_web_sm")

In [111]:
from spacy.matcher import Matcher

In [112]:
matcher = Matcher(nlp.vocab)

In [129]:
# SolarPower
pattern1 = [{"LOWER":"solarpower"}]
# Solar-Power
pattern2 = [{"LOWER":"solar"},{"IS_PUNCT":True},{"LOWER":"power"}]
# Solar Power
pattern3 = [{"LOWER":"solar"},{"LOWER":"power"}]


In [130]:
matcher.add("SolarPower",patterns=[pattern1,pattern2,pattern3])

In [133]:
doc = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-Power is great")

In [134]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [135]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-Power


In [136]:
# Remove a pattern
matcher.remove("SolarPower")

This found both two-word patterns, with and without the hyphen!

The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>


In [139]:
# solarpower SolarPower
pattern1 = [{"LOWER":"solarpower"}]
# solar.power
pattern2 = [{"LOWER":"solar"},{"IS_PUNCT":True,"OP":"*"},{"LOWER":"power"}]

In [140]:
matcher.add("SolarPower", patterns=[pattern1,pattern2])

In [141]:
doc2 = nlp(u"Solar--Power is solarPower!!!")

In [142]:
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


Phrase Matching

In [143]:
from spacy.matcher import PhraseMatcher

In [144]:
matcher = PhraseMatcher(nlp.vocab)

In [145]:
with open("TextFiles\\reaganomics.txt") as f:
    doc3 = nlp(f.read())

In [147]:
phrase_list =  ["voodoo Economics", "supply-side economics",
                 "trickle-down economics", "free-market economics"]

In [148]:
# convert each phrase to a document object.
phrase_patterns = [nlp(text) for text in phrase_list]

In [149]:
phrase_patterns

[voodoo Economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [151]:
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [153]:
print(*phrase_patterns)

voodoo Economics supply-side economics trickle-down economics free-market economics


In [155]:
matcher.add("EconMatcher",
            docs=[*phrase_patterns])

In [156]:
found_matches = matcher(doc3)
print(found_matches)

[(3680293220734633682, 41, 45), (3680293220734633682, 49, 53), (3680293220734633682, 61, 65), (3680293220734633682, 673, 677), (3680293220734633682, 2987, 2991)]


In [159]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] #get string representation
    # span = doc3[start:end]      # get the matched span
    span = doc3[start-2:end+3] 
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 associated with supply-side economics, referred to
3680293220734633682 EconMatcher 49 53 to as trickle-down economics or voodoo economics
3680293220734633682 EconMatcher 61 65 , and free-market economics by political advocates
3680293220734633682 EconMatcher 673 677 from the supply-side economics movement, which
3680293220734633682 EconMatcher 2987 2991 as "trickle-down economics", due
