### Spacy Basics

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm') # loading language library (model)

In [3]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million') # unicode string

In [4]:
# For every token print: its text, the part-of-speech as an integer ID (pos),
# the part-of-speech as a readable label (pos_), and the dependency label (dep_).
for token in doc:
    print(token.text, token.pos, token.pos_, token.dep_) 
    # pos  -> Part Of Speech
    # dep_ -> syntactic dependency relation

Tesla 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.S. 96 PROPN compound
startup 92 NOUN dobj
for 85 ADP prep
$ 99 SYM quantmod
6 93 NUM compound
million 93 NUM pobj


In [5]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x289087d8f40>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x289087d8d60>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x289086e4cf0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2890898c480>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x289089e6ac0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x289086e4f20>)]

In [None]:
nlp.pipe_names

In [None]:
doc2 = nlp(u"Tesla isn't        looking into startups anymore.")

In [None]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

In [None]:
doc2[0], doc2[0].pos_

In [None]:
spacy.explain('PROPN'), spacy.explain('nsubj')

###### Additional Token Attributes

|Attribute|Description|Value for doc2[0]|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [None]:
# Lemmas (the base form of the word):
print(doc2[4].text)
print(doc2[4].lemma_)

In [None]:
# Simple Parts-of-Speech & Detailed Tags:
print(doc2[4].pos_)
print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))

In [None]:
# Word Shapes:
print(doc2[0].text+': '+doc2[0].shape_)
# Note: this line reads from `doc` (the first "Tesla ... U.S. startup ..." sentence),
# not from doc2; index 5 of that doc is the token 'U.S.'.
print(doc[5].text+' : '+doc[5].shape_)

In [None]:
# Boolean Values:
print(doc2[0].is_alpha)
print(doc2[0].is_stop)

In [None]:
doc2[3], doc2[3].text

In [None]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [None]:
# span is a slice of a doc object
life_quote = doc3[16:30]
print(life_quote)

In [None]:
type(doc3), type(life_quote)

In [None]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [None]:
for sent in doc4.sents:
    print(sent)

In [None]:
doc4[6], doc4[6].is_sent_start

In [None]:
doc4[8], doc4[8].is_sent_start

### Tokenization

-  **Prefix**:	Character(s) at the beginning &#9656; `$ ( “ ¿`
-  **Suffix**:	Character(s) at the end &#9656; `km ) , . ! ”`
-  **Infix**:	Character(s) in between &#9656; `- -- / ...`
-  **Exception**: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied &#9656; `St. U.S.`

In [None]:
mystring = '"We\'re moving to L.A.!"'

In [None]:
mystring

In [None]:
doc = nlp(mystring)

In [None]:
for token in doc:
    print(token.text)

In [None]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

for t in doc2:
    print(t)

In [None]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc3:
    print(t)

In [None]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc4:
    print(t)

In [None]:
len(doc4) # number of tokens

In [None]:
# Number of entries in the vocabulary attached to doc4 (pipeline: en_core_web_sm).
# NOTE(review): an earlier note here cited 794 entries; the exact count presumably
# depends on the model/spaCy version and on the text processed so far — confirm.
len(doc4.vocab)

In [None]:
doc5 = nlp(u'It is better to give than to receive.')

# Retrieve the third token:
doc5[2]

In [None]:
# Retrieve three tokens from the middle:
doc5[2:5]

In [None]:
# Retrieve the last four tokens:
doc5[-4:]

In [None]:
doc6 = nlp(u'My dinner was horrible.')
doc7 = nlp(u'Your dinner was delicious.')

In [None]:
# Doc objects do not support item assignment, so this reassignment is expected
# to fail. Catch the error and display it so the notebook still runs top to
# bottom (Restart Kernel -> Run All) instead of stopping at this cell.
try:
    doc6[3] = doc7[3]
except TypeError as err:
    print(f'TypeError: {err}')

In [None]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

# All tokens on one line, each followed by ' | '
for token in doc8:
    print(f'{token.text} | ', end='')

print('\n----')

# Named entities: entity text, its label, and the label's explanation
for ent in doc8.ents:
    print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')

In [None]:
# Noun chunks -> Noun + the words describing the noun

doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc9.noun_chunks:
    print(chunk.text)

In [None]:
doc10 = nlp(u"Red cars do not carry higher insurance rates.")

for chunk in doc10.noun_chunks:
    print(chunk.text)

In [None]:
doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.")

for chunk in doc11.noun_chunks:
    print(chunk.text)

In [None]:
from spacy import displacy

In [None]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [None]:
displacy.render(doc, style="dep", jupyter=True, options={'distance': 125})

In [None]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [None]:
# Render the named entities inline. The 'distance' option belongs to the
# dependency ("dep") visualizer and has no effect on entity rendering,
# so it is not passed here.
displacy.render(doc, style="ent", jupyter=True)

### Stemming

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
p_stemmer = PorterStemmer()

In [None]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [None]:
# Show the Porter stem next to each word
for word in words:
    print(f'{word} ------> {p_stemmer.stem(word)}')

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
s_stemmer = SnowballStemmer(language='english')

In [None]:
for word in words:
    print(word + ' ------> ' + s_stemmer.stem(word))

In [None]:
words = ['generous', 'generously', 'generation', 'generate']
# Show the Snowball stem next to each word
for word in words:
    print(f'{word} ------> {s_stemmer.stem(word)}')

In [None]:
phrase = 'I am meeting him tomorrow at the meeting'
# Stem the phrase word by word (split on whitespace)
for word in phrase.split():
    print(f'{word} ------> {p_stemmer.stem(word)}')

### Lemmatization

In [None]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

# For each token print its text, part-of-speech label, and its lemma in two forms:
# token.lemma (presumably the integer ID of the lemma — TODO confirm) and
# token.lemma_ (the lemma as text, i.e. the base form of the word).
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

In [None]:
def show_lemmas(text):
    """Print one aligned row per token: text, POS label, `lemma` value, lemma string.

    `text` is any iterable of token-like objects exposing `text`, `pos_`,
    `lemma` and `lemma_` (the notebook passes processed docs). The first three
    columns are padded to widths 12, 6 and 22; the last is printed as-is.
    """
    row_format = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_format.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [None]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

In [None]:
doc3 = nlp(u"I am meeting him tomorrow at the meeting.")

show_lemmas(doc3)

In [None]:
doc4 = nlp(u"That's an enormous automobile")

show_lemmas(doc4)

### Stop Words

In [None]:
print(nlp.Defaults.stop_words)

In [None]:
len(nlp.Defaults.stop_words)

In [None]:
nlp.vocab['myself'].is_stop

In [None]:
nlp.vocab['mystery'].is_stop

In [None]:
nlp.vocab['is']

In [None]:
nlp.Defaults.stop_words.add('btw')

In [None]:
nlp.vocab['btw'].is_stop = True

In [None]:
# discard() instead of remove(): re-running this cell must not raise KeyError
# once 'beyond' is already gone from the stop-word set.
nlp.Defaults.stop_words.discard('beyond')

In [None]:
nlp.vocab['beyond'].is_stop = False

### Phrase Matching and  Vocabulary 

In [15]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [4]:
# SolarPower
pattern1 = [{'LOWER': 'solarpower'}] # a single token whose lowercase form is 'solarpower'

# Solar Power
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

# Solar-Power
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

# Register all three patterns under the single match name 'SolarPower'
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [5]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity. Solar--power is solarpower yay!')

In [6]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16), (8656102463236116519, 21, 24), (8656102463236116519, 25, 26)]


In [7]:
# One line per match: numeric id, its name, token offsets, and the matched text
for match_id, start, end in found_matches:
    span = doc[start:end]                    # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # name the match was registered under
    print(f'{match_id} {string_id} {start} {end} {span.text}')

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power
8656102463236116519 SolarPower 21 24 Solar--power
8656102463236116519 SolarPower 25 26 solarpower


The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>


In [8]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

matcher.remove('SolarPower')

matcher.add('SolarPower', [pattern1, pattern2])

In [9]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16), (8656102463236116519, 21, 24), (8656102463236116519, 25, 26)]


In [10]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')
found_matches = matcher(doc2)
print(found_matches)

[]


Besides lemmas, there are a variety of token attributes we can use to determine matching rules:
<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphabetic characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

In [16]:
doc = nlp(u'Tweet #100DaysOfCode')
# A '#' token followed by any one token ({} is a pattern entry with no constraints)
pattern1 = [{'ORTH': '#'}, {}]
matcher.add('HashTags', [pattern1])
found_matches = matcher(doc)
print(found_matches)

[(7125267737722935896, 1, 3)]


In [17]:
from spacy.matcher import PhraseMatcher

In [18]:
matcher = PhraseMatcher(nlp.vocab)

In [19]:
# Read the text with an explicit encoding so the result does not depend on the
# platform's default codec (NOTE(review): assumes the file is UTF-8 — confirm).
with open('reaganomics.txt', encoding='utf8') as f:
    doc3 = nlp(f.read())

In [20]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [21]:
# Phrase patterns only need to be tokenized, so build them with nlp.make_doc()
# instead of running the full pipeline (tagger, parser, NER, ...) on each phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [22]:
# Register every phrase pattern under the single match name 'EconMatcher'
matcher.add('EconMatcher', list(phrase_patterns))

In [24]:
found_matches = matcher(doc3)

In [26]:
# One line per match: numeric id, its name, token offsets, and the matched text
for match_id, start, end in found_matches:
    span = doc3[start:end]                    # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]   # name the match was registered under
    print(f'{match_id} {string_id} {start} {end} {span.text}')

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2987 2991 trickle-down economics


In [27]:
doc3[:70]

REAGANOMICS
https://en.wikipedia.org/wiki/Reaganomics

Reaganomics (a portmanteau of [Ronald] Reagan and economics attributed to Paul Harvey)[1] refers to the economic policies promoted by U.S. President Ronald Reagan during the 1980s. These policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-market economics by political advocates.


In [28]:
doc3[665:685]

same time he attracted a following from the supply-side economics movement, which formed in opposition to Keynesian