In [1]:
import spacy


In [2]:
nlp=spacy.load('en_core_web_sm')

In [3]:
doc=nlp("Apple is looking at buying U.K. startup for $1 billion")

In [4]:
# pos tagging

In [5]:
for tokens in doc:
    print(tokens.text, tokens.pos_, tokens.lemma_, tokens.dep_, tokens.is_alpha, tokens.is_stop, tokens.shape_)

Apple PROPN Apple nsubj True False Xxxxx
is AUX be aux True True xx
looking VERB look ROOT True False xxxx
at ADP at prep True True xx
buying VERB buy pcomp True False xxxx
U.K. PROPN U.K. dobj False False X.X.
startup NOUN startup advcl True False xxxx
for ADP for prep True True xxx
$ SYM $ quantmod False False $
1 NUM 1 compound False False d
billion NUM billion pobj True False xxxx


In [6]:
from spacy import displacy

In [7]:
import spacy

In [8]:
nlp=spacy.load('en_core_web_sm')

In [10]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [11]:
doc=nlp('I was reading the paper.')

In [14]:
for token in doc:
    print(token.morph)

Case=Nom|Number=Sing|Person=1|PronType=Prs
Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
Aspect=Prog|Tense=Pres|VerbForm=Part
Definite=Def|PronType=Art
Number=Sing
PunctType=Peri


In [15]:
for token in doc:
    print(token.morph.get('PronType'))

['Prs']
[]
[]
['Art']
[]
[]


In [16]:
import spacy

nlp = spacy.load("de_core_news_sm")
doc = nlp("Wo bist du?") # English: 'Where are you?'
print(doc[2].morph)  # 'Case=Nom|Number=Sing|Person=2|PronType=Prs'
print(doc[2].pos_) # 'PRON'

OSError: [E050] Can't find model 'de_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [17]:
doc=nlp('I was reading text book')

In [18]:
for token in doc:
    print(token.text, token.lemma_)

I I
was be
reading read
text text
book book


In [20]:
nlp=spacy.blank('sv')

In [21]:
nlp.add_pipe('lemmatizer', config={'mode':'lookup'})

<spacy.pipeline.lemmatizer.Lemmatizer at 0x1f7f34e4b40>

In [22]:
nlp=spacy.blank('de')

In [24]:
nlp.add_pipe('morphologizer')

<spacy.pipeline.morphologizer.Morphologizer at 0x1f7f379fd60>

In [27]:
nlp.add_pipe('lemmatizer', config={'mode':'rule'})

<spacy.pipeline.lemmatizer.Lemmatizer at 0x1f7f3b67c40>

In [29]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

In [30]:
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [33]:
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)

{shift}


In [32]:
from spacy.symbols import nsubj, VERB

In [34]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("bright red apples on the tree")
print([token.text for token in doc[2].lefts])  # ['bright', 'red']
print([token.text for token in doc[2].rights])  # ['on']
print(doc[2].n_lefts)  # 2
print(doc[2].n_rights)  # 1

['bright', 'red']
['on']
2
1


In [35]:
doc = nlp("Credit and mortgage account holders must submit their requests")
span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]
with doc.retokenize() as retokenizer:
    retokenizer.merge(span)
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

Credit and mortgage account holders NOUN nsubj submit
must AUX aux submit
submit VERB ROOT submit
their PRON poss requests
requests NOUN dobj submit


In [36]:
nlp = spacy.load("en_core_web_sm", disable=["parser"])

In [37]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [38]:
doc = nlp("San Francisco considers banning sidewalk delivery robots")


In [40]:
for token in doc.ents:
    print(token.text, token.start_char, token.end_char, token.label_)

San Francisco 0 13 GPE


In [44]:
import numpy 

In [45]:
from spacy.attrs import ENT_IOB, ENT_TYPE

nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc("London is a big city in the United Kingdom.")
print("Before", doc.ents)  # []

header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
attr_array[0, 0] = 3  # B
attr_array[0, 1] = doc.vocab.strings["GPE"]
doc.from_array(header, attr_array)
print("After", doc.ents)  # [London]

Before ()
After (London,)


In [46]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [47]:
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_sm")
doc = nlp("gimme that")  # phrase to tokenize
print([w.text for w in doc])  # ['gimme', 'that']

# Add special case rule
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Check new tokenization
print([w.text for w in nlp("gimme that")])  # ['gim', 'me', 'that']

['gimme', 'that']
['gim', 'me', 'that']


In [48]:
nlp = spacy.load("en_core_web_sm")
text = "I saw The Who perform. Who did you see?"
doc1 = nlp(text)
print(doc1[2].tag_, doc1[2].pos_)  # DT DET
print(doc1[3].tag_, doc1[3].pos_)  # WP PRON

# Add attribute ruler with exception for "The Who" as NNP/PROPN NNP/PROPN
ruler = nlp.get_pipe("attribute_ruler")
# Pattern to match "The Who"
patterns = [[{"LOWER": "the"}, {"TEXT": "Who"}]]
# The attributes to assign to the matched token
attrs = {"TAG": "NNP", "POS": "PROPN"}
# Add rules to the attribute ruler
ruler.add(patterns=patterns, attrs=attrs, index=0)  # "The" in "The Who"
ruler.add(patterns=patterns, attrs=attrs, index=1)  # "Who" in "The Who"

doc2 = nlp(text)
print(doc2[2].tag_, doc2[2].pos_)  # NNP PROPN
print(doc2[3].tag_, doc2[3].pos_)  # NNP PROPN
# The second "Who" remains unmodified
print(doc2[5].tag_, doc2[5].pos_)  # WP PRON

DT DET
WP PRON
NNP PROPN
NNP PROPN
. PUNCT


In [49]:
tokens = nlp("dog cat banana afskfsd")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.3104734 True
cat True 7.1956296 True
banana True 6.2522683 True
afskfsd True 7.4449363 True
