`REMEMBER`: press Ctrl+Shift+P in a notebook to bring up the command palette (finder).

In [5]:
# !python -m spacy download en_trf_distilbertbaseuncased_lg

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import torch

# `torch.cuda.device(0)` only constructs a context manager and has no effect
# unless it is used in a `with` block; `set_device` actually selects GPU 0.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    print(torch.cuda.get_device_name(0))
else:
    # Report the fallback instead of crashing on machines without CUDA.
    print("CUDA is not available; running on CPU")
print(f'pytorch {torch.__version__}, spacy {spacy.__version__}')

%matplotlib inline

GeForce GTX 1050 Ti
pytorch 1.4.0, spacy 2.2.3


In [18]:
# Compare the two occurrences of "Apple": token 0 (first sentence) and
# token 7 (second sentence), then show the shape of the raw transformer output.
nlp = spacy.load('en_trf_distilbertbaseuncased_lg')
doc = nlp("Apple shares rose on the news. Apple pie is delicious.")
first_apple, second_apple = doc[0], doc[7]
print(first_apple.similarity(second_apple))
last_hidden_state = doc._.trf_last_hidden_state
print(last_hidden_state.shape)

0.77565545
(16, 768)


### Transfer learning
The main use case for pretrained transformer models is transfer learning. You load in a large generic model pretrained on lots of text, and start training on your smaller dataset with labels specific to your problem. 

In [23]:
import spacy
import GPUtil
import torch
import numpy
from numpy.testing import assert_almost_equal
from scipy.spatial import distance
import cupy
import numpy as np

In [24]:
# Ask spaCy to use the GPU; the returned flag drives the branch below.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    print("Using GPU!")
    # Make CUDA float tensors the default type for newly created PyTorch tensors.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print("GPU Usage")
    # Prints the per-GPU load / memory table shown in the cell output.
    GPUtil.showUtilization()

Using GPU!
GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  3% | 63% |


BERT uses wordpieces (e.g. playing -> play + ##ing) instead of whole words. This is effective in reducing the size of the vocabulary and increasing the amount of data that is available for each word.

In [34]:

# Load the DistilBERT-based pipeline and encode one short sentence.
nlp = spacy.load("en_trf_distilbertbaseuncased_lg")
doc = nlp("Here is some text to encode.")

assert doc.tensor.shape == (7, 768)  # Always has one row per token
print(doc._.trf_word_pieces_)        # String values of the wordpieces


['[CLS]', 'here', 'is', 'some', 'text', 'to', 'en', '##code', '.', '[SEP]']


The raw transformer output has one row per wordpiece.

In [27]:
# Inspect how the spaCy tokens of `doc` map onto the transformer's wordpieces.
print(doc._.trf_word_pieces)  # Wordpiece IDs (note: *not* spaCy's hash values!)
print(doc._.trf_alignment)    # Alignment between spaCy tokens and wordpieces

[101, 2182, 2003, 2070, 3793, 2000, 4372, 16044, 1012, 102]
[[1], [2], [3], [4], [5], [6, 7], [8]]


# Build a Sentiment Classifier using spaCy Transformers


In [2]:
import thinc
import random
import spacy
import GPUtil
import torch
from spacy.util import minibatch
from tqdm.auto import tqdm
import unicodedata
import wasabi
import numpy
from collections import Counter

In [3]:
# Fix the random seed (0) so repeated runs are comparable.
spacy.util.fix_random_seed(0)
# Ask spaCy to use the GPU; the returned flag drives the branch below.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Make CUDA float tensors the default type for newly created PyTorch tensors.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print("GPU Usage")
    # Prints the per-GPU load / memory table shown in the cell output.
    GPUtil.showUtilization()

GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 11% | 17% |


Load the IMDB movie review dataset for sentiment analysis.

In [4]:
def _prepare_partition(text_label_tuples, *, preprocess=False):
    texts, labels = zip(*text_label_tuples)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    return texts, cats

def load_data(*, limit=0, dev_size=2000):
    """Load data from the IMDB dataset, splitting off a held-out set.

    Args:
        limit: When non-zero, ``dev_size`` is added to it before it is passed
            to the loader, so ``limit`` examples remain for training after the
            split. ``0`` is forwarded unchanged (presumably "no limit" --
            confirm against the thinc loader).
        dev_size: Number of shuffled examples to hold out for evaluation.
            Must be non-zero and smaller than the number of loaded examples.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` where each
        ``*_cats`` entry is a ``{"POSITIVE": bool, "NEGATIVE": bool}`` dict.

    Raises:
        AssertionError: If ``dev_size`` is 0 or not smaller than the number of
            loaded examples.
    """
    # Import the submodule explicitly: a bare `import thinc` does not
    # guarantee that `thinc.extra.datasets` is reachable as an attribute.
    import thinc.extra.datasets

    if limit != 0:
        limit += dev_size
    assert dev_size != 0, "dev_size must be non-zero"
    train_data, _ = thinc.extra.datasets.imdb(limit=limit)
    assert len(train_data) > dev_size, "dev_size must be smaller than the loaded dataset"
    # Shuffle before splitting so the held-out set is a random sample.
    random.shuffle(train_data)
    dev_data = train_data[:dev_size]
    train_data = train_data[dev_size:]
    train_texts, train_labels = _prepare_partition(train_data, preprocess=False)
    dev_texts, dev_labels = _prepare_partition(dev_data, preprocess=False)
    return (train_texts, train_labels), (dev_texts, dev_labels)

In [None]:
# Uses load_data's defaults (limit=0, dev_size=2000): 2000 shuffled examples are held out for evaluation.
(train_texts, train_cats), (eval_texts, eval_cats) = load_data()

##### `Models`  
en_trf_distilbertbaseuncased_lg  
en_trf_xlnetbasecased_lg  
en_trf_bertbaseuncased_lg  

In [None]:
# !python -m spacy download en_trf_xlnetbasecased_lg

In [None]:
## TODO Try with distilbert ("en_trf_distilbertbaseuncased_lg")
# Single active selection (the earlier assignment was immediately overwritten).
# The chosen package must be installed first; otherwise spacy.load fails with
# OSError E050 ("Can't find model").
model_choice = "en_trf_xlnetbasecased_lg"

In [None]:

# Load the selected transformer pipeline.
nlp = spacy.load(model_choice)

# Create the transformer text-classification component; it is attached to the pipeline in a later cell.
textcat = nlp.create_pipe("trf_textcat", config={"architecture": "softmax_class_vector"})

In [15]:
# Every supported model uses the same text-classifier configuration, so one
# membership test replaces three identical branches.
supported_models = (
    "en_trf_xlnetbasecased_lg",
    "en_trf_bertbaseuncased_lg",
    "en_trf_distilbertbaseuncased_lg",
)
if model_choice not in supported_models:
    # Fail fast, before loading anything: merely printing here would leave
    # `textcat` unset by this cell and surface later as an unrelated error.
    raise ValueError("Choose a supported transformer model")

nlp = spacy.load(model_choice)

print(nlp.pipe_names)
print(f"Loaded model '{model_choice}'")
textcat = nlp.create_pipe(
    "trf_textcat", config={"architecture": "softmax_class_vector"}
)

OSError: [E050] Can't find model 'en_trf_xlnetbasecased_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [None]:
# Register both sentiment classes on the text classifier, positive first.
for label in ("POSITIVE", "NEGATIVE"):
    textcat.add_label(label)

In [None]:
# Show the labels registered on the classifier, then attach it as the last pipeline component.
print("Labels:", textcat.labels)
nlp.add_pipe(textcat, last=True)
# Report the sizes of the training and evaluation splits.
print(f"Using {len(train_texts)} training docs, {len(eval_texts)} evaluation")

In [None]:
# total_words = sum(len(text.split()) for text in train_texts)
# Pair every training text with its gold annotations in {"cats": ...} form.
train_data = [
    (text, {"cats": cats})
    for text, cats in zip(train_texts, train_cats)
]

In [None]:

# Training hyperparameters shared by the cells below.
n_iter=4  # NOTE(review): not referenced by any cell shown here
# NOTE(review): load_data() is called without a limit, so n_texts is not applied in the cells shown here.
n_texts=75     # Changed number of texts from 1000 to 75 to relieve pressure on GPU memory
batch_size=4   # Batch size changed from 8 to 4 to relieve pressure on GPU memory
learn_rate=2e-5  # Base rate; the training cell cycles between learn_rate / 3 and learn_rate * 3
max_wpb=1000  # NOTE(review): not referenced by any cell shown here
pos_label="POSITIVE"  # Label passed to evaluate() as the positive class


In [None]:
def cyclic_triangular_rate(min_lr, max_lr, period):
    """Yield an endless triangular learning-rate schedule.

    The rate rises linearly to ``max_lr`` (reached at step ``period``), falls
    back to ``min_lr`` (reached at step ``2 * period``) and then repeats. The
    first value yielded corresponds to step 1.

    Reference:
    https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee
    """
    amplitude = max_lr - min_lr
    step = 0
    while True:
        step += 1
        cycle_index = numpy.floor(1 + step / (2 * period))
        # Distance from the peak of the current cycle, in units of `period`.
        distance = numpy.abs(step / period - 2 * cycle_index + 1)
        remaining = 1 - distance
        scale = remaining if remaining > 0 else 0
        yield min_lr + amplitude * scale

In [None]:
def evaluate(nlp, texts, cats, pos_label):
    """Score the pipeline's predictions for ``pos_label`` on the given texts.

    Runs ``nlp.pipe`` over ``texts`` (batched with the notebook-level
    ``batch_size``), thresholds each document's ``pos_label`` score at 0.5
    and compares it with the gold value found in ``cats`` at the same index.
    A progress bar advances by the whitespace-separated word count of each
    processed document.

    Returns:
        Dict with precision, recall and F-score under the keys
        ``"textcat_p"``, ``"textcat_r"`` and ``"textcat_f"``.
    """
    true_pos = false_pos = false_neg = true_neg = 0.0
    word_total = sum(len(text.split()) for text in texts)
    with tqdm(total=word_total, leave=False) as pbar:
        for index, doc in enumerate(nlp.pipe(texts, batch_size=batch_size)):
            gold = cats[index]
            # Only the positive label is scored, and only if gold carries it.
            if pos_label in doc.cats and pos_label in gold:
                score = doc.cats[pos_label]
                truth = gold[pos_label]
                predicted_pos = score >= 0.5
                predicted_neg = score < 0.5
                if predicted_pos and truth >= 0.5:
                    true_pos += 1.0
                elif predicted_pos and truth < 0.5:
                    false_pos += 1.0
                elif predicted_neg and truth < 0.5:
                    true_neg += 1
                elif predicted_neg and truth >= 0.5:
                    false_neg += 1
            pbar.update(len(doc.text.split()))
    # The small epsilon keeps the divisions defined when a count is zero.
    precision = true_pos / (true_pos + false_pos + 1e-8)
    recall = true_pos / (true_pos + false_neg + 1e-8)
    f_score = (
        0.0
        if (precision + recall) == 0
        else 2 * (precision * recall) / (precision + recall)
    )
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [None]:

# Initialize the TextCategorizer, and create an optimizer.
optimizer = nlp.resume_training()
optimizer.alpha = 0.001
optimizer.trf_weight_decay = 0.005
optimizer.L2 = 0.0
# Cyclic schedule for the transformer learning rate, between learn_rate / 3
# and learn_rate * 3.
learn_rates = cyclic_triangular_rate(
    learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size
)
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))

results = []      # one (f_score, step, epoch) tuple per evaluation checkpoint
epoch = 0
step = 0
eval_every = 100  # evaluate after this many training batches
patience = 3      # stop after this many checkpoints without a new best score
pbar = tqdm(total=eval_every, leave=False)
while True:
    # Train and evaluate
    losses = Counter()
    random.shuffle(train_data)
    batches = minibatch(train_data, size=batch_size)
    for batch in batches:
        optimizer.trf_lr = next(learn_rates)
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
        pbar.update(1)
        if step and (step % eval_every) == 0:
            pbar.close()
            # Evaluate with the optimizer's averaged parameters.
            with nlp.use_params(optimizer.averages):
                scores = evaluate(nlp, eval_texts, eval_cats, pos_label)
            results.append((scores["textcat_f"], step, epoch))
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                    losses["trf_textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
            pbar = tqdm(total=eval_every, leave=False)
        step += 1
    epoch += 1
    print(f"epoch {epoch}")
    # Stop once `patience` checkpoints have passed without a new best score.
    if results:
        best_score, best_step, best_epoch = max(results)
        print(f"best score: {best_score}  best_step : {best_step}  best epoch : {best_epoch} ")
        print(f"break clause: {((step - best_step) // eval_every)}")
        if ((step - best_step) // eval_every) >= patience:
            break
pbar.close()

# Summarize once, after training has stopped. This code used to sit inside the
# `while` loop, where it was skipped by the final `break` and where its loop
# variables overwrote the live `step` / `epoch` counters; the row variables
# below therefore use distinct names.
msg = wasabi.Printer()
table_widths = [2, 4, 6]
msg.info("Best scoring checkpoints")
msg.row(["Epoch", "Step", "Score"], widths=table_widths)
msg.row(["-" * width for width in table_widths])
for ckpt_score, ckpt_step, ckpt_epoch in sorted(results, reverse=True)[:10]:
    msg.row([ckpt_epoch, ckpt_step, "%.2f" % (ckpt_score * 100)], widths=table_widths)

# Test the trained model
test_text = eval_texts[0]
doc = nlp(test_text)
print(test_text, doc.cats)

**Sources & More information**  
*XL-Net explanation*  
https://mlexplained.com/2019/06/30/paper-dissected-xlnet-generalized-autoregressive-pretraining-for-language-understanding-explained/  
*Attention Is All You Need*  
https://arxiv.org/abs/1706.03762  