In [1]:
!pip install -U spaCy
!python -m spacy download en_core_web_lg

Collecting spaCy
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
Collecting weasel<0.4.0,>=0.1.0 (from spaCy)
  Downloading weasel-0.3.4-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting cloudpathlib<0.17.0,>=0.7.0 (from weasel<0.4.0,>=0.1.0->spaCy)
  Downloading cloudpathlib-0.16.0-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cloudpathlib, weasel, spaCy
  Attempting uninstall: spaCy
    Found existing installation: spacy 3.6.1
    Uninstalling spacy-3.6.1:
      Successfully uninstalled spacy-3.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages t

# spaCy tutorial

*Notice that the installation doesn’t automatically download the English model. We need to do that ourselves. (python -m spacy download en_core_web_lg)*

Hello World in spaCy

In [2]:
import spacy
# Load the large English pipeline once; the later cells reuse this `nlp` object.
nlp = spacy.load('en_core_web_lg')
doc = nlp('Hello World!')
# One token text per output line.
print('\n'.join(token.text for token in doc))

Hello
World
!


spaCy preserves this “link” between the word and its place in the raw text. Here’s how to get the exact index of a word:

In [3]:
# `idx` is printed next to each token text (see the output: 0, 6, 11 for 'Hello World!').
for word in doc:
    print(f"{word.text} ", word.idx)

Hello  0
World  6
!  11


The **Token** class exposes a lot of word-level attributes. Here are a few examples:

In [4]:
doc = nlp("Next week I'll be in Rome.")
for tok in doc:
    # One tab-separated row per token:
    # text, idx, lemma_, is_punct, is_space, shape_, pos_, tag_
    row = (
        tok.text, tok.idx, tok.lemma_, tok.is_punct,
        tok.is_space, tok.shape_, tok.pos_, tok.tag_,
    )
    print(*row, sep="\t")

Next	0	next	False	False	Xxxx	ADJ	JJ
week	5	week	False	False	xxxx	NOUN	NN
I	10	I	False	False	X	PRON	PRP
'll	11	will	False	False	'xx	AUX	MD
be	15	be	False	False	xx	AUX	VB
in	18	in	False	False	xx	ADP	IN
Rome	21	Rome	False	False	Xxxx	PROPN	NNP
.	25	.	True	False	.	PUNCT	.


## Sentence detection
Here’s how to achieve one of the most common NLP tasks with spaCy:

In [5]:
doc = nlp("These are apples. These are oranges.")

# `doc.sents` iterates over the detected sentences.
for sentence in doc.sents:
    print(sentence)

These are apples.
These are oranges.


## Part Of Speech Tagging
PoS-tagging of a sentence:

In [6]:
doc = nlp("Next week I'll be in Madrid.")
# Collect (text, pos_) pairs for every token, then show them as one list.
pos_pairs = []
for token in doc:
    pos_pairs.append((token.text, token.pos_))
print(pos_pairs)

[('Next', 'ADJ'), ('week', 'NOUN'), ('I', 'PRON'), ("'ll", 'AUX'), ('be', 'AUX'), ('in', 'ADP'), ('Madrid', 'PROPN'), ('.', 'PUNCT')]


## Named Entity Recognition
Doing NER with spaCy is super easy and the pretrained model performs pretty well:

In [7]:
doc = nlp("Next week I'll be in Madrid.")
# Print each recognised entity followed by its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Next week DATE
Madrid GPE


You can also view the IOB style tagging of the sentence like this:

In [8]:
def iob_label(token):
    """Return the token's IOB marker, suffixed with "-<entity type>" unless the marker is 'O'."""
    if token.ent_iob_ == 'O':
        return token.ent_iob_
    return "{0}-{1}".format(token.ent_iob_, token.ent_type_)


doc = nlp("Next week I'll be in Madrid.")
# (text, tag_, IOB label) triple for every token.
iob_tagged = [(token.text, token.tag_, iob_label(token)) for token in doc]
print(iob_tagged)

[('Next', 'JJ', 'B-DATE'), ('week', 'NN', 'I-DATE'), ('I', 'PRP', 'O'), ("'ll", 'MD', 'O'), ('be', 'VB', 'O'), ('in', 'IN', 'O'), ('Madrid', 'NNP', 'B-GPE'), ('.', '.', 'O')]


The spaCy NER also has a healthy variety of entities. You can view the full list here: https://spacy.io/usage/linguistic-features#entity-types

In [9]:
doc = nlp("I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ")
# Entity text and label, separated by a single space, one entity per line.
for span in doc.ents:
    print(f"{span.text} {span.label_}")

2 CARDINAL
9 a.m. TIME
30% PERCENT
just 2 days DATE
WSJ ORG


Let’s use displaCy to view a beautiful visualization of the Named Entity annotated sentence:

In [10]:
from spacy import displacy

doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
# Render the entity annotations directly in the notebook output.
displacy.render(
    doc,
    jupyter=True,
    style='ent',
)

## Chunking
spaCy automatically detects noun-phrases as well:

In [11]:
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for noun_phrase in doc.noun_chunks:
    # `root` is the main word of the chunk.
    head_word = noun_phrase.root
    print(noun_phrase.text, noun_phrase.label_, head_word.text)

Wall Street Journal NP Journal
an interesting piece NP piece
crypto currencies NP currencies


Notice how the chunker also computes the root of the phrase, the main word of the phrase.

## Dependency Parsing

Let’s see the dependency parser in action:

In [12]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')

# Format: token/tag <--dependency label-- head/tag
for token in doc:
    head = token.head
    print(f"{token.text}/{token.tag_} <--{token.dep_}-- {head.text}/{head.tag_}")

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/NNP <--compound-- currencies/NNS
currencies/NNS <--pobj-- on/IN


If this doesn’t help you visualize the dependency tree, displaCy comes in handy:

In [13]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
# Rendering options passed through to displaCy's dependency visualiser.
dep_options = {'distance': 90}
displacy.render(doc, style='dep', jupyter=True, options=dep_options)

Word Vectors
------------------

spaCy also ships with word vectors. They are only included in the larger models, such as the one downloaded at the top of this notebook: *(python -m spacy download en_core_web_lg)*

The vectors are attached to spaCy objects: Token, Lexeme (a sort of unattached token, part of the vocabulary), Span and Doc. The multi-token objects average their constituent vectors.

Here are a few properties word vectors have:
1. If two words are similar, they appear in similar contexts
2. Word vectors are computed taking into account the context (surrounding words)
3. Given the two previous observations, similar words should have similar word vectors
4. Using vectors we can derive relationships (relatedness) between words

Let’s see how we can access the embedding of a word in spaCy:

In [14]:
# Look up the lexeme for 'cat' in the vocabulary and show its embedding.
cat_vector = nlp.vocab['cat'].vector
print(cat_vector)

[ 3.7032e+00  4.1982e+00 -5.0002e+00 -1.1322e+01  3.1702e-02 -1.0255e+00
 -3.0870e+00 -3.7327e+00  5.3875e-01  3.5679e+00  6.9276e+00  1.5793e+00
  5.1188e-01  3.1868e+00  6.1534e+00 -4.8941e+00 -2.9959e-01 -3.6276e+00
  2.3825e+00 -1.4402e+00 -4.7577e+00  4.3607e+00 -4.9814e+00 -3.6672e+00
 -1.8052e+00 -2.1888e+00 -4.2875e+00  5.5712e+00 -5.2875e+00 -1.8346e+00
 -2.2015e+00 -7.7091e-01 -4.8260e+00  1.2464e+00 -1.7945e+00 -8.1280e+00
  1.9994e+00  1.1413e+00  3.8032e+00 -2.8783e+00 -4.2136e-01 -4.4177e+00
  7.7456e+00  4.9535e+00  1.7402e+00  1.8275e-01  2.4218e+00 -3.1496e+00
 -3.8057e-02 -2.9818e+00  8.3396e-01  1.1531e+01  3.5684e+00  2.5970e+00
 -2.8438e+00  3.2755e+00  4.5674e+00  3.2219e+00  3.4206e+00  1.1200e-01
  1.0303e-01 -5.8396e+00  4.6370e-01  2.7750e+00 -5.3713e+00 -5.0247e+00
 -2.0212e+00  5.8772e-01  1.1569e+00  1.3224e+00  4.3994e+00  2.0444e+00
  2.1343e+00 -1.9023e+00  2.1469e+00 -2.9085e+00  4.8429e-01 -3.3544e-01
  1.4484e+00 -1.5770e+00 -1.1307e+00  2.8320e+00  6

There’s a really famous example of word embedding math: "man" - "woman" + "queen" = "king". It sounds too crazy to be true, so let’s test that out:

In [15]:
from scipy import spatial

def cosine_similarity(x, y):
    """Cosine similarity of two vectors: one minus SciPy's cosine distance."""
    return 1 - spatial.distance.cosine(x, y)


man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector

# Point in vector space whose nearest vocabulary entries we want:
# "man" - "woman" + "queen"
maybe_king = man - woman + queen

# Score every vocabulary entry that has a vector against the target point.
computed_similarities = [
    (lexeme, cosine_similarity(maybe_king, lexeme.vector))
    for lexeme in nlp.vocab
    if lexeme.has_vector
]

# Highest similarity first; show the ten best matches.
computed_similarities.sort(key=lambda pair: -pair[1])
print([lexeme.text for lexeme, _score in computed_similarities[:10]])

['queen', 'man', 'king', 'woman', 'he', 'nothin’', "'cause", "'Cause", 'He', 'That']


Computing Similarity
---------------------------

Based on the word embeddings, spaCy offers a similarity interface for all of its building blocks: Token, Span, Doc and Lexeme. Here’s how to use that similarity interface:

In [16]:
# Fetch the four lexemes in one pass.
banana, dog, fruit, animal = (
    nlp.vocab[word] for word in ('banana', 'dog', 'fruit', 'animal')
)

# (label, left lexeme, right lexeme) for each comparison to print.
comparisons = (
    ("sim(dog, animal) =", dog, animal),
    ("sim(dog,fruit) =", dog, fruit),
    ("sim(banana,fruit) = ", banana, fruit),
    ("sim(banana,animal) = ", banana, animal),
)
for label, left, right in comparisons:
    print(label, left.similarity(right))

sim(dog, animal) = 0.5192115902900696
sim(dog,fruit) = 0.13643456995487213
sim(banana,fruit) =  0.6650428175926208
sim(banana,animal) =  0.18752224743366241


Let’s now use this technique on entire texts:

In [17]:
target = nlp("Cats are beautiful animals.")

doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")

# Compare the target document against each candidate in turn.
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))

0.925293344292394
0.9067517259890845
0.9037427153904276


Extending spaCy
----------------------

The entire spaCy architecture is built upon three building blocks: Document (the big encompassing container), Token (most of the time, a word) and Span (a set of consecutive Tokens). The extensions you create can add extra functionality to any one of these components. There are some examples out there of what you can do. Let’s create an extension ourselves.

### Creating Document level Extension

In [18]:
import spacy
import nltk

from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The VADER lexicon must be available before the analyzer is constructed.
nltk.download('vader_lexicon')
sentiment_analyzer = SentimentIntensityAnalyzer()


def polarity_scores(doc):
    """Getter for `Doc._.polarity_scores`: VADER polarity scores of the document's text."""
    raw_text = doc.text
    return sentiment_analyzer.polarity_scores(raw_text)


# Register the getter as a document-level extension attribute.
Doc.set_extension('polarity_scores', getter=polarity_scores, force=True)

doc = nlp("I love dogs and cats.")
print(doc._.polarity_scores)


{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.6369}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [19]:
doc = nlp("Today is a nice day!!!")
# Read the custom extension attribute registered on Doc.
scores = doc._.polarity_scores
print(scores)

{'neg': 0.0, 'neu': 0.449, 'pos': 0.551, 'compound': 0.5684}


In [20]:
doc = nlp("I love my dog, but I hate cats.")
# Read the custom extension attribute registered on Doc.
scores = doc._.polarity_scores
print(scores)

{'neg': 0.433, 'neu': 0.343, 'pos': 0.223, 'compound': -0.5346}


One can easily create extensions for every component type. Such extensions only have access to the context of that component. What happens if you need the tokenized text along with the Part-Of-Speech tags? Let’s now build a custom pipeline. Pipelines are another important abstraction of spaCy. The nlp object goes through a list of pipelines and runs them on the document. For example, the tagger is run first, then the parser and ner pipelines are applied to the already POS-annotated document. Here’s what the default nlp pipeline structure looks like:

In [21]:
# `nlp.pipeline` is a list of (name, component) pairs.
pipeline_components = nlp.pipeline
print(pipeline_components)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7dadb33c5600>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7dadb33c4c40>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7dadb34bd850>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7dadb31cd080>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7dadb31c26c0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7dadb34bd8c0>)]


### Creating a custom pipeline

Let’s build a custom pipeline that needs to be applied after the tagger pipeline is run. We need the POS tags to get the Synset from WordNet.

In [22]:
from nltk.corpus import wordnet as wn
from spacy.tokens import Token
from spacy.language import Language

def penn_to_wn(tag):
    """Map a tag to a WordNet part-of-speech letter by its first character.

    Tags starting with 'N', 'V', 'J' and 'R' map to 'n', 'v', 'a' and 'r'
    respectively; any other tag yields None.
    """
    prefix_to_wordnet = (('N', 'n'), ('V', 'v'), ('J', 'a'), ('R', 'r'))
    for prefix, wordnet_pos in prefix_to_wordnet:
        if tag.startswith(prefix):
            return wordnet_pos
    return None


class WordnetPipeline(object):
    """Pipeline component that attaches a WordNet synset to each token.

    The synset is stored in the custom token attribute `token._.synset`,
    which stays None for tokens whose tag has no WordNet equivalent or for
    which WordNet returns no synset.
    """

    def __init__(self, nlp):
        """Register the `synset` token extension (default None).

        `nlp` is accepted for the factory signature but is not used here.
        """
        Token.set_extension('synset', default=None, force=True)

    def __call__(self, doc):
        """Annotate every token of `doc` with its first synset and return `doc`."""
        for token in doc:
            wn_tag = penn_to_wn(token.tag_)
            if wn_tag is None:
                continue

            # WordNet may have no entry for this text/POS combination; indexing
            # an empty result would raise IndexError, so leave the default.
            synsets = wn.synsets(token.text, wn_tag)
            if not synsets:
                continue

            token._.set('synset', synsets[0])

        return doc

# The WordNet corpus has to be available before the component is used.
nltk.download('wordnet')


@Language.factory("wordnet_pipe")
def wordnet_pipe(nlp, name):
    """Factory registered under "wordnet_pipe"; returns a new WordnetPipeline for `nlp`."""
    component = WordnetPipeline(nlp)
    return component


[nltk_data] Downloading package wordnet to /root/nltk_data...


Set up the new pipeline.

In [23]:
# Add the registered component to the pipeline by its factory name.
nlp.add_pipe("wordnet_pipe")
doc = nlp("Paris is the awesome capital of France.")

for token in doc:
    print(f"{token.text} - {token._.synset}")

# Show the pipeline structure with the custom component added.
print(nlp.pipeline)

# Remove the component again; as the last expression, its return value is displayed.
nlp.remove_pipe("wordnet_pipe")

Paris - Synset('paris.n.01')
is - Synset('be.v.01')
the - None
awesome - Synset('amazing.s.02')
capital - Synset('capital.n.01')
of - None
France - Synset('france.n.01')
. - None
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7dadb33c5600>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7dadb33c4c40>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7dadb34bd850>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7dadb31cd080>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7dadb31c26c0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7dadb34bd8c0>), ('wordnet_pipe', <__main__.WordnetPipeline object at 0x7dadb31752a0>)]


('wordnet_pipe', <__main__.WordnetPipeline at 0x7dadb31752a0>)