In [36]:
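# One-time setup: uncomment to download the spaCy models this notebook loads.
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md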

In [4]:
import spacy

In [35]:
# Load the small English pipeline and parse one sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# Print one line per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, word shape, and the is_alpha / is_stop flags.
for token in doc:
    print(*(getattr(token, field) for field in (
        'text', 'lemma_', 'pos_', 'tag_', 'dep_',
        'shape_', 'is_alpha', 'is_stop')))

Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [44]:
from spacy import displacy

# Draw the default displaCy diagram for `doc` inline in the notebook;
# 'distance' is a displaCy layout option, set to 100 here.
displacy.render(doc, options=dict(distance=100), jupyter=True)

In [47]:
# Render the same `doc` again, this time with displaCy's 'ent' style
# (presumably the named-entity view), inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Load the 'en_core_web_md' model; the cells that follow parse text with `en_model`.
en_model = spacy.load('en_core_web_md')

In [7]:
 sentence = ("In 1541 Desoto wrote in his journal that the Pascagoula people " +
"ranged as far north as the confluence of the Leaf and Chickasawhay rivers at 30.4, -88.5.")
parsed_sent = en_model(sentence)
parsed_sent.ents

(1541, Desoto, Pascagoula, Leaf, Chickasawhay, 30.4)

In [11]:
# Show every token of `parsed_sent` next to its fine-grained tag as
# "token<-TAG", space-separated on a single line.
' '.join(
    '{}<-{}'.format(tok, tok.tag_)
    for tok in parsed_sent
)

'In<-IN 1541<-CD Desoto<-NNP wrote<-VBD in<-IN his<-PRP$ journal<-NN that<-IN the<-DT Pascagoula<-NNP people<-NNS ranged<-VBD as<-RB far<-RB north<-RB as<-IN the<-DT confluence<-NN of<-IN the<-DT Leaf<-NNP and<-CC Chickasawhay<-NNP rivers<-VBZ at<-IN 30.4<-CD ,<-, -88.5<-NFP .<-.'

In [12]:
%matplotlib inline
from spacy.displacy import render
sentence = "In 1541 Desoto wrote in his journal about the Pascagoula."
parsed_sent = en_model(sentence)
with open('pascagoula.html', 'w') as f:
    f.write(render(docs=parsed_sent, page=True, options=dict(compact=True)))

In [13]:
import pandas as pd
from collections import OrderedDict

In [14]:
def token_dict(token):
    """Collect a token's annotations into an ordered mapping.

    Reads the `orth_`, `lemma_`, `pos_`, `tag_` and `dep_` attributes of
    `token` and returns them in an OrderedDict keyed, in this order, by
    ORTH, LEMMA, POS, TAG and DEP.
    """
    columns = (
        ('ORTH', 'orth_'),
        ('LEMMA', 'lemma_'),
        ('POS', 'pos_'),
        ('TAG', 'tag_'),
        ('DEP', 'dep_'),
    )
    return OrderedDict((label, getattr(token, attr)) for label, attr in columns)

def doc_dataframe(doc):
    """Build a DataFrame with one row per token of `doc`.

    Each row is the mapping produced by `token_dict` for that token, so the
    columns are that mapping's keys.
    """
    rows = list(map(token_dict, doc))
    return pd.DataFrame(rows)

# Tabulate the per-token annotations of a short example sentence.
doc_dataframe(en_model("In 1541 Desoto met the Pascagoula."))

Unnamed: 0,ORTH,LEMMA,POS,TAG,DEP
0,In,in,ADP,IN,prep
1,1541,1541,NUM,CD,pobj
2,Desoto,desoto,PROPN,NNP,nsubj
3,met,meet,VERB,VBD,ROOT
4,the,the,DET,DT,det
5,Pascagoula,pascagoula,PROPN,NNP,dobj
6,.,.,PUNCT,.,punct


In [16]:
# Token pattern: one or more NNP-tagged tokens, any number of alphabetic
# tokens, a token whose lemma is 'meet', any number of alphabetic tokens,
# then one or more NNP-tagged tokens.
pattern = [
    dict(TAG='NNP', OP='+'),
    dict(IS_ALPHA=True, OP='*'),
    dict(LEMMA='meet'),
    dict(IS_ALPHA=True, OP='*'),
    dict(TAG='NNP', OP='+'),
]

In [17]:
from spacy.matcher import Matcher
# Build a token-pattern matcher on the vocabulary of the loaded model.
matcher = Matcher(en_model.vocab)
# Register `pattern` under the key 'met'; None fills the second positional slot.
# NOTE(review): presumably that slot is the optional on_match callback of the
# spaCy v2 signature add(key, on_match, *patterns); newer spaCy releases
# expect add(key, [pattern]) instead — confirm against the installed version.
matcher.add('met', None, pattern)

[(14332210279624491740, 2, 6)]

In [21]:
# Parse a sentence and run the registered pattern(s) over it.
doc = en_model("In 1541 Desoto met the Pascagoula.")
m = matcher(doc)
# Trailing expression so the notebook displays the list of matches.
m

[(14332210279624491740, 2, 6)]

In [18]:
# Parse a longer sentence and keep only the first match.
doc = en_model("October 24: Lewis and Clark met their first Mandan Chief, Big White.")
# Indexing with [0] raises IndexError if the matcher returns an empty list.
m = matcher(doc)[0]
# Trailing expression so the notebook displays the match tuple.
m

(14332210279624491740, 3, 11)

In [19]:
# Slice `doc` with the second and third entries of the match tuple
# (presumably the start and end token offsets) to show the matched text.
doc[m[1]:m[2]]

Lewis and Clark met their first Mandan Chief

In [25]:
# Parse another sentence and display whatever the matcher finds in it.
doc = en_model("On 11 October 1986, Gorbachev and Reagan met at a house")
matcher(doc)

[(14332210279624491740, 5, 9), (14332210279624491740, 5, 9)]

In [24]:
# Adds an additional pattern without removing the previous pattern.
# Here 'met' is an arbitrary key under which the patterns are registered.
# NOTE(review): since add() keeps earlier patterns, re-running this cell
# presumably registers the same pattern again, which may explain the
# duplicated match in the recorded output — confirm.
pattern = [
    dict(TAG='NNP', OP='+'),
    dict(LEMMA='and'),
    dict(TAG='NNP', OP='+'),
    dict(IS_ALPHA=True, OP='*'),
    dict(LEMMA='meet'),
]
matcher.add('met', None, pattern)

# Re-parse the sentence and show the matches.
doc = en_model("On 11 October 1986, Gorbachev and Reagan met at a house")
m = matcher(doc)
m

[(14332210279624491740, 5, 9), (14332210279624491740, 5, 9)]

In [23]:
# Show the text covered by the last match in the list `m`.
doc[m[-1][1]:m[-1][2]]

Gorbachev and Reagan met