In [None]:
!pip install spacy-transformers
!python -m spacy download en_trf_bertbaseuncased_lg

In [8]:
import random
import numpy as np

import spacy
from spacy.util import minibatch
import torch

from numpy.testing import assert_almost_equal

In [2]:
# Ask spaCy to use a GPU if it can; the result is kept so the notebook can
# branch on it.
is_using_gpu = spacy.prefer_gpu()

# When spaCy is on the GPU, switch PyTorch's default tensor type to the CUDA
# float type as well.
if is_using_gpu:
    cuda_float_type = "torch.cuda.FloatTensor"
    torch.set_default_tensor_type(cuda_float_type)

In [4]:
# Load the pretrained pipeline by name, then run a single sentence through it.
sample_text = "Here is some text to encode."
nlp = spacy.load("en_trf_bertbaseuncased_lg")
doc = nlp(sample_text)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [None]:
# doc.tensor always has one row per token; for this sentence that is 7 x 768.
expected_shape = (7, 768)
assert doc.tensor.shape == expected_shape

# Extension attributes exposed under doc._ (evaluated only to show them).
doc._.trf_word_pieces_  # String values of the wordpieces
doc._.trf_word_pieces  # Wordpiece IDs (note: *not* spaCy's hash values!)
doc._.trf_alignment  # Alignment between spaCy tokens and wordpieces

# The raw transformer output has one row per wordpiece.
last_hidden_state = doc._.trf_last_hidden_state
assert len(last_hidden_state) == len(doc._.trf_word_pieces)

# To avoid losing information, doc.tensor is calculated such that the
# sum-pooled vectors match the raw output (apart from numeric error).
assert_almost_equal(
    doc.tensor.sum(axis=0),
    last_hidden_state.sum(axis=0),
    decimal=5,
)

# Span elements expose the matching slice of the tensor (especially helpful
# for sentences).
span = doc[2:4]
assert np.array_equal(span.tensor, doc.tensor[2:4])

In [6]:
# Three sentences that all start with the token "Apple".
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
apple3 = nlp("Apple pie is delicious.")

# Print the similarity between the first token of the first sentence and the
# first token of each of the other two, in that order.
first_apple = apple1[0]
for other_doc in (apple2, apple3):
    print(first_apple.similarity(other_doc[0]))

0.7342856
0.43365774


# Transfer Learning

In [7]:
# Training examples as (text, annotations) pairs; the "cats" mapping gives a
# score for each category label.
TRAIN_DATA = [
    (
        "text1",
        {"cats": dict(POSITIVE=1.0, NEGATIVE=0.0)},
    ),
]

In [9]:
# Reload the pretrained pipeline and show which components it contains.
nlp = spacy.load("en_trf_bertbaseuncased_lg")
print(nlp.pipe_names)  # ["sentencizer", "trf_wordpiecer", "trf_tok2vec"]

# Create the "trf_textcat" component configured for exclusive classes,
# register both labels on it, then add it to the pipeline.
textcat_config = {"exclusive_classes": True}
textcat = nlp.create_pipe("trf_textcat", config=textcat_config)
for label in ("POSITIVE", "NEGATIVE"):
    textcat.add_label(label)
nlp.add_pipe(textcat)

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']


In [10]:
# Train the pipeline on TRAIN_DATA and save the result to disk.
N_EPOCHS = 10   # passes over the training examples
BATCH_SIZE = 8  # examples per nlp.update() call

optimizer = nlp.resume_training()

# Shuffle a private copy: random.shuffle works in place, and reordering the
# shared TRAIN_DATA list would change state seen by other cells and re-runs.
train_examples = list(TRAIN_DATA)

for i in range(N_EPOCHS):
    random.shuffle(train_examples)
    losses = {}  # filled in by nlp.update(); reset every epoch
    for batch in minibatch(train_examples, size=BATCH_SIZE):
        # Split [(text, annotations), ...] into parallel tuples.
        texts, cats = zip(*batch)
        nlp.update(texts, cats, sgd=optimizer, losses=losses)
    print(i, losses)

# NOTE(review): this is an absolute path at the filesystem root — confirm it
# is writable in the environment where the notebook runs.
nlp.to_disk("/bert-textcat")

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


0 {'trf_textcat': 0.5}
1 {'trf_textcat': 0.36813753843307495}
2 {'trf_textcat': 0.26624801754951477}
3 {'trf_textcat': 0.08583438396453857}
4 {'trf_textcat': 0.01652258075773716}
5 {'trf_textcat': 0.0031398902647197247}
6 {'trf_textcat': 0.0007923889206722379}
7 {'trf_textcat': 0.0002619001315906644}
8 {'trf_textcat': 6.061859312467277e-05}
9 {'trf_textcat': 1.165224784926977e-05}
