In [1]:
import spacy

In [2]:
# Load the small English model. Per the original note it is roughly 12 MB;
# medium and large (~900 MB) variants exist as well.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Run a sample sentence through the pipeline and keep the processed result.
doc = nlp("Tesla is looking at buying U.S. startup for $6 Billion")

In [6]:
# Display the processed object; the output below echoes the original sentence.
doc

Tesla is looking at buying U.S. startup for $6 Billion

In [10]:
# One line per token: text, pos_ and dep_ separated by "  >  ".
for token in doc:
    print(token.text, token.pos_, token.dep_, sep="  >  ")

Tesla  >  PROPN  >  nsubj
is  >  AUX  >  aux
looking  >  VERB  >  ROOT
at  >  ADP  >  prep
buying  >  VERB  >  pcomp
U.S.  >  PROPN  >  compound
startup  >  NOUN  >  dobj
for  >  ADP  >  prep
$  >  SYM  >  quantmod
6  >  NUM  >  compound
Billion  >  NUM  >  pobj


# spaCy pipelines

Diagram of the spaCy processing pipeline: https://spacy.io/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg

<img src='https://spacy.io/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg'>

In [11]:
# Inspect the loaded pipeline; the output lists (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x125cbafd0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1237fc130>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1237fc1a0>)]

In [12]:
# Same pipeline, but only the component names.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [19]:
# A second sentence with a contraction and a run of extra spaces, to show
# how both end up as tokens in the printed listing.
doc2 = nlp("Tesla isn't   looking into startups anymore.")

for tok in doc2:
    print(" ".join((tok.text, tok.pos_, tok.dep_)))

Tesla PROPN nsubj
is AUX aux
n't PART neg
   SPACE 
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [20]:
# pos_ of the first token ("Tesla" in the listing above).
doc2[0].pos_

'PROPN'

# Dependency Parsing

Background on dependency labels such as `nsubj` and `advmod` (Stanford typed dependencies manual): https://nlp.stanford.edu/software/dependencies_manual.pdf

In [21]:
# Ask spaCy for a plain-English description of the PROPN tag.
spacy.explain("PROPN")

'proper noun'

In [23]:
# Ask spaCy for a plain-English description of the advmod label.
spacy.explain("advmod")

'adverbial modifier'

In [25]:
# Show the token at index 4 followed by its lemma, one per line.
print(doc2[4], doc2[4].lemma_, sep="\n")

looking
look


# Span

In [27]:
# A longer passage to slice into spans. The literal is split across lines
# for readability only; adjacent pieces concatenate to the same text.
doc3 = nlp(
    "The Stanford typed dependencies representation was designed to provide "
    "a simple description of the grammatical relationships in a sentence that "
    "can easily be understood and effectively used by people without "
    "linguistic expertise who want to extract textual relations. In "
    "particular, rather than the phrase structure representations that have "
    "long dominated in the computational linguistic community, it represents "
    "all sentence relationships uniformly as typed dependency relations"
)

In [28]:
# Slice doc3 from index 16 up to (not including) 30; the cells below show
# the resulting text and that the slice is a Span.
qq = doc3[16:30]

In [29]:
# Display the text covered by the slice.
qq

in a sentence that can easily be understood and effectively used by people without

In [30]:
# Check what kind of object slicing produced.
type(qq)

spacy.tokens.span.Span

# Sentences

In [31]:
# Four short sentences in one string, used to demonstrate sentence splitting.
doc4 = nlp(
    "This is the first sentence. This is another sentence. "
    "This is the last sentence. "
    "Will spacy be able to tokenize all these sentences correctly?"
)

In [32]:
# Print every detected sentence of doc4 on its own line.
print(*doc4.sents, sep="\n")

This is the first sentence.
This is another sentence.
This is the last sentence.
Will spacy be able to tokenize all these sentences correctly?


In [36]:
# Index 6 should be the "This" opening the second sentence, assuming the
# period counts as its own token; the output below confirms it starts a sentence.
doc4[6].is_sent_start

True