In [1]:
###
# spaCy experimentation, based on the spacy-transformers demo notebook:
# https://colab.research.google.com/github/explosion/spacy-transformers/blob/master/examples/Spacy_Transformers_Demo.ipynb#scrollTo=bWDTiDUo16I5
###

In [23]:
import numpy as np
import pandas as pd
import datetime as dt
import timeit
import itertools
import matplotlib.pyplot as plt
#import cupy

import spacy
import en_trf_xlnetbasecased_lg
from spacy.pipeline import EntityRuler

In [3]:
# Ask spaCy to run on the GPU when one is available and report the outcome.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # torch and GPUtil are needed only on the GPU path, so they are imported
    # here; previously neither name was imported and this branch raised
    # NameError as soon as a GPU was detected.
    import torch

    print("Using GPU!")
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print("GPU Usage")
    try:
        import GPUtil
    except ImportError:
        # The utilization report is informational only; do not fail without it.
        print("GPUtil is not installed; skipping the utilization report")
    else:
        GPUtil.showUtilization()
else:
    print('gpu not used')

gpu not used


In [24]:
# Sample input shared by every cell below. The two sentences are separated by a
# run of 14 spaces (kept on purpose: the tokenizer reports it as its own token).
test_text = (
    "This downloads the English models for the neural pipeline."
    + " " * 14
    + "Apple is opening its first big office in San Francisco."
)

In [48]:
# Pick the pipeline to load. The transformer model needs a large amount of RAM,
# so the small core model is used for testing on a laptop.
#model = 'en_trf_xlnetbasecased_lg' # required large amount of RAM
model = 'en_core_web_sm' # for testing on laptop
nlp = spacy.load(model)
doc = nlp(test_text)
if model == 'en_trf_xlnetbasecased_lg':
    # doc.tensor has one row per spaCy token, so the expected row count is
    # len(doc); the previous hard-coded 7 only matched the demo's sentence.
    assert doc.tensor.shape == (len(doc), 768)
    print(doc._.trf_word_pieces_)  # String values of the wordpieces
    # NOTE(review): per the spacy-transformers demo, the raw transformer output
    # (one row per wordpiece) is doc._.trf_last_hidden_state - confirm.
else:
    print(doc.tensor.shape)


(22, 96)


In [49]:
# Print the transformer extension attributes of `doc`:
#   trf_word_pieces - wordpiece IDs (note: *not* spaCy's hash values!)
#   trf_alignment   - alignment between spaCy tokens and wordpieces
# Both print None in the recorded run, which used the non-transformer model.
for extension in ("trf_word_pieces", "trf_alignment"):
    print(getattr(doc._, extension))

None
None


In [50]:
# Materialise and print, in turn, the noun chunks, the entities and the
# components of doc.vector. `a` is rebound on every pass and ends up holding
# the vector components.
for attr in ("noun_chunks", "ents", "vector"):
    a = list(getattr(doc, attr))
    print(a)

[the English models, the neural pipeline, Apple, its first big office, San Francisco]
[English, Apple, first, San Francisco]
[0.5665008, -0.81509286, -0.1968553, 1.4986691, 1.6544025, 0.84154737, 0.39333984, -0.71957296, 1.5567929, 1.1163052, 0.3584819, -0.12662949, -0.60008836, -0.43438655, -1.698071, -0.94573015, -0.3395876, 0.62878084, -0.6069204, -0.9008972, -0.017370323, -0.6115438, -0.0033634792, -0.46054646, -0.78127784, 0.29187703, -1.0022246, -0.70720476, 0.10279714, -0.8063633, 0.16629168, 0.38355586, -0.36699972, -0.460046, 0.0022588617, -1.7921323, 1.2108642, -0.12025972, -1.799353, 0.04530349, 2.1237574, -0.15021712, 0.15487348, -1.6605483, 0.09130888, -0.40168554, 0.17516986, 0.047767665, -1.1016526, 0.82798314, 0.44769165, -1.1320049, -0.70517784, -0.23953885, -1.7606286, 0.8487205, 1.0371106, 1.2421204, -0.3026244, -0.6775752, 0.05937457, 0.042447697, 1.299655, 0.7147067, 0.89749414, 1.390488, 0.6627934, -1.7640376, -0.8623455, 0.90369356, 0.27044013, -0.040202726, 0.88

In [51]:
# EXAMPLE OF ADDING A PIPE TO THE SPACY PIPELINE

def test_component(doc):
    print("After tokenization, this doc has {} tokens.".format(len(doc)))
    print("The part-of-speech tags are:", [token.pos_ for token in doc])
    if len(doc) < 10:
        print("This is a pretty short document.")
    return doc

# Register the component at the end of the pipeline. add_pipe raises ValueError
# when the name is already registered, so on a re-run of this cell the existing
# entry is swapped in place instead; that also picks up an edited test_component.
if "print_info" in nlp.pipe_names:
    nlp.replace_pipe("print_info", test_component)
else:
    nlp.add_pipe(test_component, name="print_info", last=True)
print(nlp.pipe_names)  # on a fresh run: ['tagger', 'parser', 'ner', 'print_info']
doc = nlp(test_text)

['tagger', 'parser', 'ner', 'print_info']
After tokenization, this doc has 22 tokens.
The part-of-speech tags are: ['DET', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'SPACE', 'PROPN', 'AUX', 'VERB', 'DET', 'ADJ', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PUNCT']


In [52]:
# EXAMPLE OF USING THE ENTITY RULER

# Two rules: an exact-phrase match labelled ORG and a two-token,
# case-insensitive match labelled GPE.
patterns = [
    {"label": "ORG", "pattern": "Apple"},
    {
        "label": "GPE",
        "pattern": [{"LOWER": "neural"}, {"LOWER": "pipeline"}],
    },
]

# Build the ruler, load the rules, then append it to the pipeline (no position
# argument is given here).
# Note: in the recorded run this cell triggers the print_info component once
# (on a 1-token doc).
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

After tokenization, this doc has 1 tokens.
The part-of-speech tags are: ['PROPN']
This is a pretty short document.


In [53]:
# Re-parse the sample text now that the entity ruler is in the pipeline, then
# print its noun chunks followed by its entities (`a` is rebound each pass).
doc2 = nlp(test_text)

for attr in ("noun_chunks", "ents"):
    a = list(getattr(doc2, attr))
    print(a)

After tokenization, this doc has 22 tokens.
The part-of-speech tags are: ['DET', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'SPACE', 'PROPN', 'AUX', 'VERB', 'DET', 'ADJ', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PUNCT']
[the English models, the neural pipeline, Apple, its first big office, San Francisco]
[English, neural pipeline, Apple, first, San Francisco]


In [54]:
# Show (text, label) pairs for the entities of `doc` and then `doc2`, one list
# per document, so the two parses can be compared.
for parsed in (doc, doc2):
    labelled = [(ent.text, ent.label_) for ent in parsed.ents]
    print(labelled)

[('English', 'LANGUAGE'), ('Apple', 'ORG'), ('first', 'ORDINAL'), ('San Francisco', 'GPE')]
[('English', 'LANGUAGE'), ('neural pipeline', 'GPE'), ('Apple', 'ORG'), ('first', 'ORDINAL'), ('San Francisco', 'GPE')]


In [56]:
# REARRANGE THE PIPELINE SO THE ENTITY RULER RUNS BEFORE THE NER
# Take the ruler out of its current slot, then put the same object back in
# ahead of 'ner'.
nlp.remove_pipe("entity_ruler")
nlp.add_pipe(ruler, before="ner")

In [57]:
# Parse once more with the ruler placed before 'ner', then print the
# (text, label) entity pairs of all three parses for comparison.
doc3 = nlp(test_text)

for parsed in (doc, doc2, doc3):
    labelled = [(ent.text, ent.label_) for ent in parsed.ents]
    print(labelled)

After tokenization, this doc has 22 tokens.
The part-of-speech tags are: ['DET', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'SPACE', 'PROPN', 'AUX', 'VERB', 'DET', 'ADJ', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PUNCT']
[('English', 'LANGUAGE'), ('Apple', 'ORG'), ('first', 'ORDINAL'), ('San Francisco', 'GPE')]
[('English', 'LANGUAGE'), ('neural pipeline', 'GPE'), ('Apple', 'ORG'), ('first', 'ORDINAL'), ('San Francisco', 'GPE')]
[('English', 'LANGUAGE'), ('neural pipeline', 'GPE'), ('Apple', 'ORG'), ('first', 'ORDINAL'), ('San Francisco', 'GPE')]
