In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
doc = nlp(u"The quick brown fox jumped over the layz dog's back.")

In [4]:
print(doc.text)

The quick brown fox jumped over the layz dog's back.


In [5]:
print(doc[4])

jumped


In [6]:
print(doc[4].pos_)

VERB


In [7]:
print(doc[4].tag_) #fine-grained tag

VBD


## View token tags
Recall that you can obtain a particular token by its index position.
* To view the coarse POS tag use `token.pos_`
* To view the fine-grained tag use `token.tag_`
* To view the description of either type of tag use `spacy.explain(tag)`

<div class="alert alert-success">Note that `token.pos` and `token.tag` return integer hash values; by adding the underscores we get the text equivalent that lives in **doc.vocab**.</div>

## Coarse-grained Part-of-speech Tags
Every token is assigned a POS Tag from the following list:


<table><tr><th>POS</th><th>DESCRIPTION</th><th>EXAMPLES</th></tr>
    
<tr><td>ADJ</td><td>adjective</td><td>*big, old, green, incomprehensible, first*</td></tr>
<tr><td>ADP</td><td>adposition</td><td>*in, to, during*</td></tr>
<tr><td>ADV</td><td>adverb</td><td>*very, tomorrow, down, where, there*</td></tr>
<tr><td>AUX</td><td>auxiliary</td><td>*is, has (done), will (do), should (do)*</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>*and, or, but*</td></tr>
<tr><td>CCONJ</td><td>coordinating conjunction</td><td>*and, or, but*</td></tr>
<tr><td>DET</td><td>determiner</td><td>*a, an, the*</td></tr>
<tr><td>INTJ</td><td>interjection</td><td>*psst, ouch, bravo, hello*</td></tr>
<tr><td>NOUN</td><td>noun</td><td>*girl, cat, tree, air, beauty*</td></tr>
<tr><td>NUM</td><td>numeral</td><td>*1, 2017, one, seventy-seven, IV, MMXIV*</td></tr>
<tr><td>PART</td><td>particle</td><td>*'s, not,*</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>*I, you, he, she, myself, themselves, somebody*</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>*Mary, John, London, NATO, HBO*</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>*., (, ), ?*</td></tr>
<tr><td>SCONJ</td><td>subordinating conjunction</td><td>*if, while, that*</td></tr>
<tr><td>SYM</td><td>symbol</td><td>*$, %, §, ©, +, −, ×, ÷, =, :), 😝*</td></tr>
<tr><td>VERB</td><td>verb</td><td>*run, runs, running, eat, ate, eating*</td></tr>
<tr><td>X</td><td>other</td><td>*sfpksdpsxmsa*</td></tr>
<tr><td>SPACE</td><td>space</td></tr>

___
## Fine-grained Part-of-speech Tags
Tokens are subsequently given a fine-grained tag as determined by morphology:
<table>
<tr><th>POS</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>
<tr><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>
<tr><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>
<tr><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>
<tr><td>ADJ</td><td></td><td>PRP\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>
<tr><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>
<tr><td>ADJ</td><td></td><td>WP\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>
<tr><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>
<tr><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>
<tr><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>
<tr><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>
<tr><td>DET</td><td>determiner</td><td>DT</td><td>determiner</td><td></td></tr>
<tr><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>
<tr><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>
<tr><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>
<tr><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>
<tr><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>
<tr><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>
<tr><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>
<tr><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sign</td></tr>
<tr><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>
<tr><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>
<tr><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>
<tr><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>""</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>
<tr><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>
<tr><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>
<tr><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>
<tr><td>SYM</td><td></td><td>\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>
<tr><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>
<tr><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary "be"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>HVS</td><td>forms of "have"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>
<tr><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>
<tr><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>
<tr><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>
<tr><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>
<tr><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>
<tr><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>
<tr><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>
<tr><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>
<tr><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>
<tr><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>
<tr><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>
<tr><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>
</table>

___
## Fine-grained POS Tag Examples
These are some grammatical examples (shown in **bold**) of specific fine-grained tags. We've removed punctuation and rarely used tags:
<table>
<tr><th>POS</th><th>TAG</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>ADJ</td><td>AFX</td><td>affix</td><td>The Flintstones were a **pre**-historic family.</td></tr>
<tr><td>ADJ</td><td>JJ</td><td>adjective</td><td>This is a **good** sentence.</td></tr>
<tr><td>ADJ</td><td>JJR</td><td>adjective, comparative</td><td>This is a **better** sentence.</td></tr>
<tr><td>ADJ</td><td>JJS</td><td>adjective, superlative</td><td>This is the **best** sentence.</td></tr>
<tr><td>ADJ</td><td>PDT</td><td>predeterminer</td><td>Waking up is **half** the battle.</td></tr>
<tr><td>ADJ</td><td>PRP\$</td><td>pronoun, possessive</td><td>**His** arm hurts.</td></tr>
<tr><td>ADJ</td><td>WDT</td><td>wh-determiner</td><td>It's blue, **which** is odd.</td></tr>
<tr><td>ADJ</td><td>WP\$</td><td>wh-pronoun, possessive</td><td>We don't know **whose** it is.</td></tr>
<tr><td>ADP</td><td>IN</td><td>conjunction, subordinating or preposition</td><td>It arrived **in** a box.</td></tr>
<tr><td>ADV</td><td>EX</td><td>existential there</td><td>**There** is cake.</td></tr>
<tr><td>ADV</td><td>RB</td><td>adverb</td><td>He ran **quickly**.</td></tr>
<tr><td>ADV</td><td>RBR</td><td>adverb, comparative</td><td>He ran **quicker**.</td></tr>
<tr><td>ADV</td><td>RBS</td><td>adverb, superlative</td><td>He ran **fastest**.</td></tr>
<tr><td>ADV</td><td>WRB</td><td>wh-adverb</td><td>**When** was that?</td></tr>
<tr><td>CONJ</td><td>CC</td><td>conjunction, coordinating</td><td>The balloon popped **and** everyone jumped.</td></tr>
<tr><td>DET</td><td>DT</td><td>determiner</td><td>**This** is **a** sentence.</td></tr>
<tr><td>INTJ</td><td>UH</td><td>interjection</td><td>**Um**, I don't know.</td></tr>
<tr><td>NOUN</td><td>NN</td><td>noun, singular or mass</td><td>This is a **sentence**.</td></tr>
<tr><td>NOUN</td><td>NNS</td><td>noun, plural</td><td>These are **words**.</td></tr>
<tr><td>NOUN</td><td>WP</td><td>wh-pronoun, personal</td><td>**Who** was that?</td></tr>
<tr><td>NUM</td><td>CD</td><td>cardinal number</td><td>I want **three** things.</td></tr>
<tr><td>PART</td><td>POS</td><td>possessive ending</td><td>Fred**'s** name is short.</td></tr>
<tr><td>PART</td><td>RP</td><td>adverb, particle</td><td>Put it **back**!</td></tr>
<tr><td>PART</td><td>TO</td><td>infinitival to</td><td>I want **to** go.</td></tr>
<tr><td>PRON</td><td>PRP</td><td>pronoun, personal</td><td>**I** want **you** to go.</td></tr>
<tr><td>PROPN</td><td>NNP</td><td>noun, proper singular</td><td>**Kilroy** was here.</td></tr>
<tr><td>PROPN</td><td>NNPS</td><td>noun, proper plural</td><td>The **Flintstones** were a pre-historic family.</td></tr>
<tr><td>VERB</td><td>MD</td><td>verb, modal auxiliary</td><td>This **could** work.</td></tr>
<tr><td>VERB</td><td>VB</td><td>verb, base form</td><td>I want to **go**.</td></tr>
<tr><td>VERB</td><td>VBD</td><td>verb, past tense</td><td>This **was** a sentence.</td></tr>
<tr><td>VERB</td><td>VBG</td><td>verb, gerund or present participle</td><td>I am **going**.</td></tr>
<tr><td>VERB</td><td>VBN</td><td>verb, past participle</td><td>The treasure was **lost**.</td></tr>
<tr><td>VERB</td><td>VBP</td><td>verb, non-3rd person singular present</td><td>I **want** to go.</td></tr>
<tr><td>VERB</td><td>VBZ</td><td>verb, 3rd person singular present</td><td>He **wants** to go.</td></tr>
</table>

In [8]:
print(doc[4].pos_)

VERB


In [9]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
brown      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
layz       PROPN      NNP        noun, proper singular
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [10]:
doc = nlp(u"I read books on NLP")

In [11]:
word = doc[1]

In [12]:
word.text

'read'

In [13]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBP        verb, non-3rd person singular present


In [14]:
doc = nlp(u"I read a book on NLP")

In [15]:
word = doc[1]
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBD        verb, past tense


In [16]:
doc = nlp(u"The quick brown fox jumped over the layz dog's back.")

In [17]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [18]:
POS_counts

{90: 2, 84: 2, 92: 3, 100: 1, 85: 1, 96: 1, 94: 1, 97: 1}

In [19]:
doc.vocab[84].text

'ADJ'

In [20]:
doc[2].pos

84

In [21]:
for k,v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

84. ADJ   2
85. ADP   1
90. DET   2
92. NOUN  3
94. PART  1
96. PROPN 1
97. PUNCT 1
100. VERB  1


In [22]:
TAG_counts= doc.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{10}} {v}")

74. POS        1
1292078113972184607. IN         1
10554686591937588953. JJ         2
12646065887601541794. .          1
15267657372422890137. DT         2
15308085513773655218. NN         3
15794550382381185553. NNP        1
17109001835818727656. VBD        1


In [23]:
len(doc.vocab)

793

In [24]:
DEP_counts= doc.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{10}} {v}")

402. amod       2
415. det        2
429. nsubj      1
439. pobj       1
440. poss       1
443. prep       1
445. punct      1
7037928807040764755. compound   1
8110129090154140942. case       1
8206900633647566924. ROOT       1


# Visualizing POS

In [25]:
doc = nlp(u"The quick brown fox jumped over the layz dog.")

In [26]:
from spacy import displacy

In [27]:
displacy.render(doc,style="dep",jupyter = True)

In [28]:
options = {"distance":110,"compact":"True","color":"yellow","bg":"#09a3d5","font":"Times"} #bg background

In [29]:
displacy.render(doc,style="dep",jupyter = True,options=options)

In [30]:
doc2 = nlp(u"This is a sentence. This is another sentence, possibly longer than other")

In [31]:
spans = list(doc2.sents)

In [32]:
# displacy.serve(spans,style="dep",options={"distance":110})

# Named Entity Recognition

In [33]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text +"-"+ ent.label_ +"-"+ str(spacy.explain(ent.label_)))
    else:
        print("no entities found")

In [34]:
doc = nlp(u"Hi how are you?")

In [35]:
show_ents(doc)

no entities found


In [36]:
doc = nlp(u"May I go to Washington, DC next May to see the Washington Manument?")

In [37]:
show_ents(doc)

Washington, DC-GPE-Countries, cities, states
next May-DATE-Absolute or relative dates or periods
the Washington Manument-ORG-Companies, agencies, institutions, etc.


In [38]:
doc = nlp(u"Can I have 500 dollars of Microsoft stock?")

In [39]:
show_ents(doc)

500 dollars-MONEY-Monetary values, including unit
Microsoft-ORG-Companies, agencies, institutions, etc.


## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [40]:
doc = nlp(u"Tesla to build a U.K. factory for $6 million")

In [41]:
show_ents(doc)

U.K.-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [42]:
from spacy.tokens import Span

In [43]:
ORG = doc.vocab.strings[u"ORG"]

In [44]:
ORG

383

In [45]:
new_ent = Span(doc,0,1,label = ORG) # tesla is a org

In [46]:
doc.ents = list(doc.ents) + [new_ent]

In [47]:
show_ents(doc)

Tesla-ORG-Companies, agencies, institutions, etc.
U.K.-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [48]:
doc = nlp(u"Our company created a brand new vacuum cleaner. "
         u"This new vacuum-cleaner is the best in show.")

In [49]:
show_ents(doc)

no entities found


In [50]:
from spacy.matcher import PhraseMatcher

In [51]:
matcher = PhraseMatcher(nlp.vocab)

In [52]:
phrase_list = ["vacuum cleaner", "vacuum-cleaner"]

In [53]:
phrase_patterns = [nlp(text) for text in phrase_list] 

In [54]:
matcher.add("newproduct",None,*phrase_patterns)

In [55]:
found_matches=matcher(doc)

In [56]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [57]:
from spacy.tokens import Span

In [58]:
# product -> prod 
PROD = doc.vocab.strings[u"PRODUCT"]

In [59]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [60]:
new_ents = [Span(doc,match[1],match[2],label = PROD) for match in found_matches]

In [61]:
doc.ents = list(doc.ents)+ new_ents

In [62]:
show_ents(doc)

vacuum cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)
vacuum-cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)


In [63]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but  now it is marked down by 10 dollars.")

In [64]:
#how many times mention about money?

In [65]:
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.95, 10 dollars]

In [66]:
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2

### Visulization NER

In [67]:
from spacy import displacy

In [68]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of 6$ million"
         u"By constrast, Sony only sold 8 thousand Walkman music players.")

In [69]:
displacy.render(doc,style="ent",jupyter= True)

In [70]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style = "ent",jupyter = True)

In [71]:
colors = {"ORG":"red"}
options = {"ents":["PRODUCT","ORG"],"colors":colors}

In [72]:
displacy.render(doc,style="ent",jupyter= True,options = options)

In [73]:
# didn't recognize walkman as a product

## Sentence Segmentation

In [74]:
doc = nlp(u"This is the first sentence. This is another sentence. This is last sentence.")

In [75]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is last sentence.


In [85]:
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [86]:
doc[0]

"

In [87]:
list(doc.sents)[0]

"Management is doing the right things; leadership is doing the right things."

In [88]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [89]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [90]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [91]:
for sent in doc.sents:
    print(sent)
    print("\n")

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




In [92]:
# Add a segmentation rule

def set_custom_boundaries(doc):
    for token in doc:
        print(token)
        print("\t")
        print(token.i)



In [93]:
set_custom_boundaries(doc)

"
	
0
Management
	
1
is
	
2
doing
	
3
the
	
4
right
	
5
things
	
6
;
	
7
leadership
	
8
is
	
9
doing
	
10
the
	
11
right
	
12
things
	
13
.
	
14
"
	
15
-Peter
	
16
Drucker
	
17


In [94]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things." -Peter

In [105]:
from spacy_langdetect import LanguageDetector

ModuleNotFoundError: No module named 'spacy_langdetect'

In [103]:
@Language.component("boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ";":
            doc[token.i+1].is_sent_start = True
    return doc

NameError: name 'Language' is not defined

In [104]:
nlp.create_pipe(boundaries,name="phildawg",before = "parser")

NameError: name 'boundaries' is not defined

In [96]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things." -Peter

In [97]:
# change segmentation rules

