In [None]:
# default_exp modeling.token_classification


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# modeling.token_classification

> This module contains custom models, loss functions, custom splitters, etc... for token classification tasks (e.g., Named entity recognition (NER), Part-of-speech tagging (POS), etc...). The objective of token classification is to predict the correct label for each token provided in the input. In the computer vision world, this is akin to what we do in segmentation tasks whereby we attempt to predict the class/label for each pixel in an image.

In [None]:
# export
import os, ast, inspect
from typing import Any, Callable, Dict, List, Optional, Union, Type

from fastcore.all import *
from fastai.callback.all import *
from fastai.data.block import DataBlock, ColReader, ItemGetter, ColSplitter, RandomSplitter
from fastai.data.core import DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.learner import *
from fastai.losses import CrossEntropyLossFlat
from fastai.optimizer import Adam, OptimWrapper, params
from fastai.metrics import perplexity
from fastai.torch_core import *
from fastai.torch_imports import *
from fastprogress.fastprogress import progress_bar, master_bar
from seqeval import metrics as seq_metrics
from transformers import AutoModelForTokenClassification, logging, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel

from blurr.utils import BLURR
from blurr.data.core import HF_TextBlock, BlurrDataLoader, get_blurr_tfm, first_blurr_tfm
from blurr.modeling.core import HF_PreCalculatedLoss, Blearner
from blurr.data.token_classification import (
    align_labels_with_tokens,
    align_labels_with_words,
    HF_TokenClassInput,
    HF_TokenTensorCategory,
    HF_TokenCategorize,
    HF_TokenCategoryBlock,
    HF_TokenClassBeforeBatchTransform,
)

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from datasets import concatenate_datasets, load_dataset
from fastai.data.external import untar_data, URLs
from fastcore.test import *
from nbverbose.showdoc import show_doc
from transformers import AutoConfig

from blurr.utils import print_versions
from blurr.modeling.core import HF_BaseModelWrapper, HF_BaseModelCallback, HF_PreCalculatedLoss, hf_splitter

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


In [None]:
# hide
# cuda
# Prefer GPU #1 (the original default) but fall back to GPU #0 when only one device is present,
# and skip device selection entirely when CUDA isn't available so the notebook still runs.
if torch.cuda.is_available():
    torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
    print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")
else:
    print("CUDA is not available; running on CPU")


## Setup

We'll use the training split of `conll2003` to demonstrate how to configure your blurr code for token classification.

In [None]:
# load the `conll2003` dataset with Hugging Face `datasets`
raw_datasets = load_dataset("conll2003")

# the class names of the `ner_tags` feature, read from the training split's feature metadata
labels = raw_datasets["train"].features["ner_tags"].feature.names
print(f'Labels: {labels}')

# the rest of the notebook works with the training split as a pandas DataFrame
conll2003_df = pd.DataFrame(raw_datasets["train"])
conll2003_df.head()

In [None]:
# the Hugging Face auto-model class and checkpoint used throughout the mid-level API examples
model_cls = AutoModelForTokenClassification
pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)

# size the classification head for *our* label set before the model is instantiated
config.num_labels = len(labels)
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls, config=config)
hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)

Notice above how I set the `config.num_labels` attribute to the number of labels we want *our* model to be able to predict. The model will update its last layer accordingly (this concept is essentially transfer learning).

In [None]:
test_eq(hf_config.num_labels, len(labels))


In [None]:
# the before-batch transform is built from the Hugging Face objects created above
before_batch_tfm = HF_TokenClassBeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model)
# inputs: text block typed as `HF_TokenClassInput`; targets: one category per token drawn from `labels`
blocks = (HF_TextBlock(before_batch_tfm=before_batch_tfm, input_return_type=HF_TokenClassInput), HF_TokenCategoryBlock(vocab=labels))

# read the inputs from the "tokens" column and the targets from the "ner_tags" column; split randomly
dblock = DataBlock(blocks=blocks, get_x=ColReader("tokens"), get_y=ColReader("ner_tags"), splitter=RandomSplitter())


In [None]:
dls = dblock.dataloaders(conll2003_df, bs=4)


In [None]:
dls.show_batch(dataloaders=dls, max_n=2)


## Metrics

In this section, we'll add helpful metrics for token classification tasks

In [None]:
# export
def calculate_token_class_metrics(pred_toks, targ_toks, metric_key):
    """Compute a single `seqeval` metric for the given predicted and target label sequences.

    `metric_key` picks the `seqeval.metrics` function to call: "accuracy", "precision", "recall", "f1", or
    "classification_report". The targets are handed to `seqeval` first and the predictions second.
    Any other `metric_key` returns `None`.
    """
    # (metric key, name of the `seqeval.metrics` function that computes it)
    seqeval_funcs = (
        ("accuracy", "accuracy_score"),
        ("precision", "precision_score"),
        ("recall", "recall_score"),
        ("f1", "f1_score"),
        ("classification_report", "classification_report"),
    )

    for key, func_name in seqeval_funcs:
        if metric_key == key:
            return getattr(seq_metrics, func_name)(targ_toks, pred_toks)

    # unknown metric keys are not an error; they simply produce no value
    return None


### `HF_TokenClassMetricsCallback`

In [None]:
# export
class HF_TokenClassMetricsCallback(Callback):
    """
    A fastai friendly callback that includes accuracy, precision, recall, and f1 metrics using the
    `seqeval` library.  Additionally, this metric knows how to *not* include your 'ignore_token' in its
    calculations.

    The metrics are computed once per validation run (in `after_validate`) from the predictions and targets
    collected batch by batch, and a `seqeval` classification report is stored on
    `learn.token_classification_report`.

    See [here](https://github.com/chakki-works/seqeval) for more information on `seqeval`.
    """

    def __init__(self, tok_metrics=["accuracy", "precision", "recall", "f1"], **kwargs):
        # ask fastai to order this callback ahead of the `Recorder`
        self.run_before = Recorder

        store_attr(self=self, names="tok_metrics, kwargs")
        # metric name -> latest value; filled in by `after_validate` and read back by `metric_value`
        self.custom_metrics_dict = {k: None for k in tok_metrics}

        self.do_setup = True

    def setup(self):
        """One-time setup: cache tokenizer info from the `DataLoaders` transforms and register the metrics."""
        if not self.do_setup:
            return

        # grab the hf_tokenizer from the HF_TokenClassBeforeBatchTransform
        tfm = first_blurr_tfm(self.learn.dls, before_batch_tfm_class=HF_TokenClassBeforeBatchTransform)
        # the target-side `HF_TokenCategorize` transform supplies the label id that marks ignored positions
        hf_tok_categorize_tfm = get_blurr_tfm(self.learn.dls.tfms[1], tfm_class=HF_TokenCategorize)

        self.hf_tokenizer = tfm.hf_tokenizer
        self.ignore_label_token_id = hf_tok_categorize_tfm.ignore_token_id
        self.tok_special_symbols = list(self.hf_tokenizer.special_tokens_map.values())
        self.tok_kwargs = tfm.kwargs

        # add the token classification specific metrics (one `ValueMetric` per requested metric key)
        custom_metric_keys = self.custom_metrics_dict.keys()
        custom_metrics = L([ValueMetric(partial(self.metric_value, metric_key=k), k) for k in custom_metric_keys])
        self.learn.metrics = self.learn.metrics + custom_metrics
        self.learn.token_classification_report = None

        self.do_setup = False

    def before_fit(self):
        self.setup()

    # --- batch begin/after phases ---
    def after_batch(self):
        """Collect the (predicted labels, target labels) of every item in a validation batch."""
        if self.training or self.learn.y is None:
            return

        # do this only for validation set
        preds = self.pred.argmax(dim=-1)
        targs = self.yb[0]  # first element of the targets tuple -- NOTE(review): assumed to be (batch, seq_len)

        for i in range(targs.shape[0]):
            item_targs, item_preds = [], []

            for j in range(targs.shape[1]):
                # skip positions whose target is the "ignore" id
                if targs[i, j] != self.ignore_label_token_id:
                    item_preds.append(self.dls.vocab[preds[i][j].item()])
                    item_targs.append(self.dls.vocab[targs[i][j].item()])

            # each result is stored as (predictions, targets)
            self.results.append((item_preds, item_targs))

    # --- validation begin/after phases ---
    def before_validate(self):
        self.results = []

    def after_validate(self):
        """Compute every requested metric, plus the classification report, over the collected results."""
        if len(self.results) < 1:
            return

        preds, targs = map(list, zip(*self.results))
        for k in self.custom_metrics_dict.keys():
            # `calculate_token_class_metrics` expects predictions first and targets second; passing them the
            # other way around would report precision as recall (and vice versa)
            self.custom_metrics_dict[k] = calculate_token_class_metrics(preds, targs, metric_key=k)

        try:
            self.learn.token_classification_report = calculate_token_class_metrics(preds, targs, "classification_report")
        except ZeroDivisionError as err:
            print(f"Couldn't calcualte classification report: {err}")

    # --- for ValueMetric metrics ---
    def metric_value(self, metric_key):
        """Return the most recently computed value for `metric_key` (read by the `ValueMetric`s)."""
        return self.custom_metrics_dict[metric_key]


## Mid-level API

### Training

In [None]:
# wrap the Hugging Face model so that it can be trained with a fastai `Learner`
model = HF_BaseModelWrapper(hf_model)
# callbacks attached to the `Learner` itself vs. callbacks only passed to the fit calls below
learn_cbs = [HF_BaseModelCallback]
fit_cbs = [HF_TokenClassMetricsCallback()]

learn = Learner(dls, model, opt_func=partial(Adam), cbs=learn_cbs, splitter=hf_splitter)

learn.freeze()


In [None]:
# hide_output
# learn.summary()


In [None]:
# sanity check: run the inputs of a single batch through the wrapped model and inspect what comes back
b = dls.one_batch()
preds = learn.model(b[0])
len(preds), preds[0].shape


In [None]:
len(b), len(b[0]), b[0]["input_ids"].shape, len(b[1]), b[1].shape


In [None]:
# once flattened, the model must produce exactly one row of scores per target position
print(preds[0].view(-1, preds[0].shape[-1]).shape, b[1].view(-1).shape)
test_eq(preds[0].view(-1, preds[0].shape[-1]).shape[0], b[1].view(-1).shape[0])


In [None]:
print(len(learn.opt.param_groups))


In [None]:
# unfreeze the learner, then run the learning rate finder with four suggestion functions
learn.unfreeze()
learn.lr_find(suggest_funcs=[minimum, steep, valley, slide])


In [None]:
learn.fit_one_cycle(1, lr_max=3e-5, moms=(0.8, 0.7, 0.8), cbs=fit_cbs)


In [None]:
print(learn.token_classification_report)


### `show_results`

Below we'll add in additional functionality to more intuitively show the results of our model.

In [None]:
# export
@typedispatch
def show_results(
    # This typedispatched `show_results` will be called for `HF_TokenClassInput` typed inputs
    x: HF_TokenClassInput,
    # This typedispatched `show_results` will be called for `HF_TokenTensorCategory` typed targets
    y: HF_TokenTensorCategory,
    # Your raw inputs/targets
    samples,
    # The model's predictions
    outs,
    # Your `Learner`. This is required so as to get at the Hugging Face objects for decoding them into
    # something understandable
    learner,
    # Your `show_results` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_results`
    **kwargs,
):
    """Display a one-column DataFrame of (word, target label, predicted label) triples, one row per item."""
    batch_tfm = first_blurr_tfm(learner.dls, before_batch_tfm_class=HF_TokenClassBeforeBatchTransform)
    tokenizer, ignore_id, label_vocab = batch_tfm.hf_tokenizer, batch_tfm.ignore_token_id, learner.dls.vocab

    rows = L()
    for inp, trg, sample, pred in zip(x, y, samples, outs):
        # labels per (sub)token first, then collapsed to labels per word
        tok_labels = align_labels_with_tokens(tokenizer, inp, trg, label_vocab)
        word_labels = align_labels_with_words(tokenizer, tok_labels)

        # the predictions arrive as the string form of a list; keep only those whose target isn't ignored
        word_pred_labels = []
        for lbl_id, pred_lbl in zip(trg, ast.literal_eval(pred[0])):
            if lbl_id != ignore_id:
                word_pred_labels.append(pred_lbl)

        # (word, target label, predicted label), limited to the first `trunc_at` entries when it is set
        triples = []
        for idx, (word_targ, pred_targ) in enumerate(zip(word_labels, word_pred_labels)):
            if trunc_at is None or idx < trunc_at:
                triples.append((word_targ[0], word_targ[1], pred_targ))

        # each row holds the stringified list of triples for one example
        rows.append([f"{triples}"])

    results_df = pd.DataFrame(rows, columns=["token / target label / predicted label"])
    display_df(results_df[:max_n])
    return ctxs


In [None]:
learn.show_results(learner=learn, max_n=2, trunc_at=10)


In [None]:
res = learn.blurr_predict("My name is Wayde and I live in San Diego".split())
print(res[0][0])


### `blurr_predict_tokens`

The default `Learner.predict` method returns a prediction per subtoken, including the special tokens for each architecture's tokenizer.

In [None]:
# export
def _blurr_predict_tokens(
    # The function to do the base predictions (default: self.blurr_predict)
    predict_func: Callable,
    # The str (or list of strings) you want to get token classification predictions for
    items: Union[str, List[str]],
    # The Blurr Transform with information about the Hugging Face objects used in your training
    tfm: Transform,
):
    """Remove all the unnecessary predicted tokens after calling `Learner.blurr_predict` or `blurrONNX.predict`.
    Aligns the predicted labels, label ids, and probabilities with what you passed in excluding subword tokens.

    Returns a list with one `(raw tokens, predicted labels, predicted label ids, probabilities)` tuple per
    input sequence. An empty `items` returns an empty list.
    """
    # nothing to predict (this also avoids an `IndexError` from `items[0]` below)
    if len(items) == 0:
        return []

    # grab the Hugging Face tokenizer and tokenization kwargs from the transform
    hf_tokenizer = tfm.hf_tokenizer
    tok_kwargs = tfm.tok_kwargs

    # a single sequence of raw tokens was passed in; wrap it so we always work with a list of sequences
    # NOTE(review): an un-split `str` also lands here and is then iterated character by character --
    # confirm whether callers are expected to split their text beforehand
    if isinstance(items[0], str):
        items = [items]

    outs = []
    for inp, pred_res in zip(items, predict_func(items)):
        # the prediction for each input holds lists of labels, label ids and probabilities; we handle one
        # input at a time so take the first element of each
        pred_lbls, pred_lbl_ids, probs = pred_res[0][0], pred_res[1][0], pred_res[2][0]

        # calculate the number of subtokens per raw/input token so that we can determine what predictions to return
        subtoks_per_raw_tok = [(entity, len(hf_tokenizer.tokenize(str(entity)))) for entity in inp]

        # tokenize with the transform's settings; we need the `special_tokens_mask` to get rid of the
        # irrelevant predictions for any special tokens (e.g., [CLS], [SEP], etc...)
        tok_res = hf_tokenizer(
            inp,
            None,
            max_length=tfm.max_length,
            padding=tfm.padding,
            truncation=tfm.truncation,
            is_split_into_words=tfm.is_split_into_words,
            **tok_kwargs
        )

        special_toks_msk = L(tok_res["special_tokens_mask"])
        actual_tok_idxs = special_toks_msk.argwhere(lambda el: el != 1)

        # using the indexes to the actual tokens, get that info from the results returned above
        pred_lbls_list = ast.literal_eval(pred_lbls)
        actual_pred_lbls = L(pred_lbls_list)[actual_tok_idxs]
        actual_pred_lbl_ids = pred_lbl_ids[actual_tok_idxs]
        actual_probs = probs[actual_tok_idxs]

        # now, because a raw token can be mapped to multiple subtokens, we need to build a list of indexes composed
        # of the *first* subtoken used to represent each raw token (that is where the prediction is)
        raw_trg_idxs, next_idx = [], 0
        for _, sub_tok_count in subtoks_per_raw_tok:
            raw_trg_idxs.append(next_idx)
            # every raw token advances the index by at least one position
            next_idx += max(sub_tok_count, 1)

        outs.append((inp, actual_pred_lbls[raw_trg_idxs], actual_pred_lbl_ids[raw_trg_idxs], actual_probs[raw_trg_idxs]))

    return outs


In [None]:
# export
@patch
def blurr_predict_tokens(
    self: Learner,
    # The str (or list of strings) you want to get token classification predictions for
    items: Union[str, List[str]],
    # Keyword arguments for `blurr_predict_tokens`
    **kwargs
):
    """Get token classification predictions for `items`, aligned to the raw tokens you passed in.

    `kwargs` are accepted but are not forwarded to anything by this method.
    """
    # the before-batch transform is handed to `_blurr_predict_tokens`, which reads the tokenizer and
    # tokenization settings from it
    before_batch_tfm = first_blurr_tfm(self.dls, before_batch_tfm_class=HF_TokenClassBeforeBatchTransform)
    return _blurr_predict_tokens(predict_func=self.blurr_predict, items=items, tfm=before_batch_tfm)


In [None]:
show_doc(Learner.blurr_predict_tokens)


In [None]:
txt = "Hi! My name is Wayde Gilliam from ohmeow.com. I live in California."
txt2 = "I wish covid was over so I could go to Germany and watch Bayern Munich play in the Bundesliga."


In [None]:
res = learn.blurr_predict_tokens(txt.split())
for r in res:
    print(f"{[(tok, lbl) for tok,lbl in zip(r[0],r[1]) ]}\n")


In [None]:
res = learn.blurr_predict_tokens([txt.split(), txt2.split()])
for r in res:
    print(f"{[(tok, lbl) for tok,lbl in zip(r[0],r[1]) ]}\n")


### Inference

In [None]:
export_fname = "tok_class_learn_export"


In [None]:
# export the trained learner and load it back as a separate inference learner
learn.export(fname=f"{export_fname}.pkl")
inf_learn = load_learner(fname=f"{export_fname}.pkl")

# predict with the *re-loaded* learner so that this cell actually exercises the exported artifact
res = inf_learn.blurr_predict_tokens([txt.split(), txt2.split()])
for r in res:
    print(f"{[(tok, lbl) for tok,lbl in zip(r[0],r[1]) ]}\n")


## High-level API

### `BLearnerForTokenClassification`

In [None]:
# hide
# drop each learner independently so that one missing name doesn't skip the rest of the cleanup
for _name in ("learn", "inf_learn"):
    globals().pop(_name, None)

# release cached GPU memory now that the learners are no longer referenced here
if torch.cuda.is_available():
    torch.cuda.empty_cache()


In [None]:
# export
@delegates(Blearner.__init__)
class BlearnerForTokenClassification(Blearner):
    """
    A `Blearner` for token classification tasks. The `from_dataframe`, `from_csv`, and `from_dictionaries`
    factory methods build the Hugging Face objects, the `DataBlock`, and the `DataLoaders` for you and return
    an instance of this class.
    """

    def __init__(self, dls: DataLoaders, hf_model: PreTrainedModel, **kwargs):
        super().__init__(dls, hf_model, **kwargs)

    @classmethod
    def get_model_cls(cls):
        """The Hugging Face auto-model class used to load the pretrained model."""
        return AutoModelForTokenClassification

    @classmethod
    def get_metrics_cb(cls):
        """A new `HF_TokenClassMetricsCallback` (with its default metrics) that you can pass to your fit calls."""
        return HF_TokenClassMetricsCallback()

    @classmethod
    def _create_learner(
        cls,
        # Your raw dataset
        data,
        # The name or path of the pretrained model you want to fine-tune
        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
        # A function to perform any preprocessing required for your Dataset
        preprocess_func: Optional[Callable] = None,
        # The attribute in your dataset that contains a list of your tokens
        tokens_attr: str = "tokens",
        # The attribute in your dataset that contains the entity labels for each token in your raw text
        token_labels_attr: str = "token_labels",
        # The unique entity labels (or vocab) available in your dataset
        labels: Optional[List[str]] = None,
        # A function that will split your Dataset into a training and validation set
        # See [here](https://docs.fast.ai/data.transforms.html#Split) for a list of fast.ai splitters
        dblock_splitter: Callable = RandomSplitter(),
        # Any kwargs to pass to your `DataLoaders`
        dl_kwargs={},
        # Any kwargs to pass to your task specific `Blearner`
        learner_kwargs={},
    ):
        """Build the Hugging Face objects and `DataLoaders` for `data` and return a `cls` instance around them.

        `labels` must be provided here (its length sets the model's `num_labels`); `from_dataframe` and
        `from_dictionaries` infer it from the data when it isn't supplied.
        """
        # get our hf objects
        n_labels = len(labels)
        hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
            pretrained_model_name_or_path, model_cls=cls.get_model_cls(), config_kwargs={"num_labels": n_labels}
        )

        # if we need to preprocess the raw data before creating our DataLoaders
        if preprocess_func:
            data = preprocess_func(data, hf_arch, hf_config, hf_tokenizer, hf_model, tokens_attr, token_labels_attr, labels)

        # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
        if hf_tokenizer.pad_token is None:
            hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
            hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
            hf_model.resize_token_embeddings(len(hf_tokenizer))

        # build getters: DataFrames are read by column, anything else by item key
        if isinstance(data, pd.DataFrame):
            get_x = ColReader(tokens_attr)
            get_y = ColReader(token_labels_attr)
        else:
            get_x = ItemGetter(tokens_attr)
            get_y = ItemGetter(token_labels_attr)

        before_batch_tfm = HF_TokenClassBeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model)

        blocks = (
            HF_TextBlock(before_batch_tfm=before_batch_tfm, input_return_type=HF_TokenClassInput),
            HF_TokenCategoryBlock(vocab=labels),
        )

        dblock = DataBlock(blocks=blocks, get_x=get_x, get_y=get_y, splitter=dblock_splitter)

        # the kwargs dicts are copied so the caller's (or the shared default) dict is never handed over directly
        dls = dblock.dataloaders(data, **dl_kwargs.copy())

        # return BLearner instance
        return cls(dls, hf_model, **learner_kwargs.copy())

    @classmethod
    def from_dataframe(
        cls,
        # Your pandas DataFrame
        df: pd.DataFrame,
        # The name or path of the pretrained model you want to fine-tune
        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
        # A function to perform any preprocessing required for your Dataset
        preprocess_func: Optional[Callable] = None,
        # The attribute in your dataset that contains a list of your tokens
        tokens_attr: str = "tokens",
        # The attribute in your dataset that contains the entity labels for each token in your raw text
        token_labels_attr: str = "token_labels",
        # The unique entity labels (or vocab) available in your dataset
        labels: Optional[List[str]] = None,
        # A function that will split your Dataset into a training and validation set
        # See [here](https://docs.fast.ai/data.transforms.html#Split) for a list of fast.ai splitters
        dblock_splitter: Callable = ColSplitter(),
        # Any kwargs to pass to your `DataLoaders`
        dl_kwargs={},
        # Any kwargs to pass to your task specific `Blearner`
        learner_kwargs={},
    ):
        """Create a learner from a DataFrame whose `tokens_attr` / `token_labels_attr` columns hold, per row,
        the tokens and their labels. When `labels` is `None` it is the sorted set of all labels in the data."""
        # we need to tell transformer how many labels/classes to expect
        if labels is None:
            labels = sorted({lbl for row_labels in df[token_labels_attr].tolist() for lbl in row_labels})

        return cls._create_learner(
            df,
            pretrained_model_name_or_path,
            preprocess_func,
            tokens_attr,
            token_labels_attr,
            labels,
            dblock_splitter,
            dl_kwargs,
            learner_kwargs,
        )

    @classmethod
    def from_csv(
        cls,
        # The path to your csv file
        csv_file: Union[Path, str],
        # The name or path of the pretrained model you want to fine-tune
        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
        # A function to perform any preprocessing required for your Dataset
        preprocess_func: Optional[Callable] = None,
        # The attribute in your dataset that contains a list of your tokens
        tokens_attr: str = "tokens",
        # The attribute in your dataset that contains the entity labels for each token in your raw text
        token_labels_attr: str = "token_labels",
        # The unique entity labels (or vocab) available in your dataset
        labels: Optional[List[str]] = None,
        # A function that will split your Dataset into a training and validation set
        # See [here](https://docs.fast.ai/data.transforms.html#Split) for a list of fast.ai splitters
        dblock_splitter: Callable = ColSplitter(),
        # Any kwargs to pass to your `DataLoaders`
        dl_kwargs={},
        # Any kwargs to pass to your task specific `Blearner`
        learner_kwargs={},
    ):
        """Create a learner from a csv file by reading it with `pd.read_csv` and delegating to `from_dataframe`."""
        # NOTE(review): `pd.read_csv` is called with its defaults, so list-valued token/label columns will
        # presumably arrive as plain strings -- confirm callers pass `labels` and a `preprocess_func` that
        # parses them, otherwise the label inference in `from_dataframe` iterates over characters
        df = pd.read_csv(csv_file)

        return cls.from_dataframe(
            df,
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            preprocess_func=preprocess_func,
            tokens_attr=tokens_attr,
            token_labels_attr=token_labels_attr,
            labels=labels,
            dblock_splitter=dblock_splitter,
            dl_kwargs=dl_kwargs,
            learner_kwargs=learner_kwargs,
        )

    @classmethod
    def from_dictionaries(
        cls,
        # A list of dictionaries
        ds: List[Dict],
        # The name or path of the pretrained model you want to fine-tune
        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
        # A function to perform any preprocessing required for your Dataset
        preprocess_func: Optional[Callable] = None,
        # The attribute in your dataset that contains a list of your tokens
        tokens_attr: str = "tokens",
        # The attribute in your dataset that contains the entity labels for each token in your raw text
        token_labels_attr: str = "token_labels",
        # The unique entity labels (or vocab) available in your dataset
        labels: Optional[List[str]] = None,
        # A function that will split your Dataset into a training and validation set
        # See [here](https://docs.fast.ai/data.transforms.html#Split) for a list of fast.ai splitters
        dblock_splitter: Callable = RandomSplitter(),
        # Any kwargs to pass to your `DataLoaders`
        dl_kwargs={},
        # Any kwargs to pass to your task specific `Blearner`
        learner_kwargs={},
    ):
        """Create a learner from a list of dictionaries keyed by `tokens_attr` / `token_labels_attr`.
        When `labels` is `None` it is the sorted set of all labels found in the dictionaries."""
        # we need to tell transformer how many labels/classes to expect
        if labels is None:
            labels = sorted({lbl for item in ds for lbl in item[token_labels_attr]})

        return cls._create_learner(
            ds,
            pretrained_model_name_or_path,
            preprocess_func,
            tokens_attr,
            token_labels_attr,
            labels,
            dblock_splitter,
            dl_kwargs,
            learner_kwargs,
        )


In [None]:
# build the learner straight from the DataFrame; a random splitter is passed explicitly, replacing
# `from_dataframe`'s `ColSplitter()` default
learn = BlearnerForTokenClassification.from_dataframe(
    conll2003_df,
    "roberta-base",
    tokens_attr="tokens",
    token_labels_attr="ner_tags",
    labels=labels,
    dblock_splitter=RandomSplitter(),
    dl_kwargs={"bs": 2},
)

learn.unfreeze()


In [None]:
learn.dls.show_batch(dataloaders=learn.dls, max_n=2)


In [None]:
# slow
learn.fit_one_cycle(1, lr_max=3e-5, moms=(0.8, 0.7, 0.8), cbs=[BlearnerForTokenClassification.get_metrics_cb()])


In [None]:
learn.show_results(learner=learn, max_n=2, trunc_at=10)


In [None]:
# slow
print(learn.token_classification_report)


In [None]:
txt = "Hi! My name is Wayde Gilliam from ohmeow.com. I live in California."
txt2 = "I wish covid was over so I could watch Lewandowski score some more goals for Bayern Munich in the Bundesliga."


In [None]:
res = learn.blurr_predict_tokens(txt.split())
for r in res:
    print(f"{[(tok, lbl) for tok,lbl in zip(r[0],r[1]) ]}\n")


In [None]:
res = learn.blurr_predict_tokens([txt.split(), txt2.split()])
for r in res:
    print(f"{[(tok, lbl) for tok,lbl in zip(r[0],r[1]) ]}\n")


## Tests

The tests below ensure that the token classification training code above works for **all** pretrained token classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would need to be downloaded.

**Note**: Feel free to modify the code below to test whatever pretrained token classification models you are working with ... and if any of your pretrained token classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
# drop the learner if it exists (a missing name is fine) and release cached GPU memory
globals().pop("learn", None)
if torch.cuda.is_available():
    torch.cuda.empty_cache()


In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="TokenClassification") if (not model_type.startswith("TF"))]


In [None]:
# hide
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "camembert-base",
    "google/canine-s",                                  # word_ids
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "microsoft/deberta-v2-xlarge",                      # word_ids
    "sshleifer/tiny-distilbert-base-cased",
    "hf-internal-testing/tiny-electra",
    # "google/fnet-base",                               # forward() got an unexpected keyword argument 'output_attentions'
    "flaubert/flaubert_small_cased",                    # word_ids 
    "huggingface/funnel-small-base",
    "sshleifer/tiny-gpt2",
    "hf-internal-testing/tiny-layoutlm",
    "allenai/longformer-base-4096",
    "microsoft/mpnet-base",
    "kssteven/ibert-roberta-base",
    # "nvidia/megatron-bert-cased-345m",                # could not test           
    "google/mobilebert-uncased",
    'google/rembert',
    "junnyu/roformer_chinese_sim_char_ft_small",                 
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    # "xlm-mlm-en-2048",                                  # word_ids
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
# same dataset, labels and DataFrame as in the Setup section, re-created here
# (presumably so the tests below can be run without the earlier cells -- TODO confirm)
raw_datasets = load_dataset("conll2003")
labels = raw_datasets["train"].features["ner_tags"].feature.names
conll2003_df = pd.DataFrame(raw_datasets["train"])

In [None]:
# hide
model_cls = AutoModelForTokenClassification
bsz = 4
seq_sz = 64

# one (arch, tokenizer, model, result, error) row per pretrained model
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    tok_kwargs = {"add_prefix_space": True} if 'deberta' in model_name else {}

    config = AutoConfig.from_pretrained(model_name)
    config.num_labels = len(labels)

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, config=config, tokenizer_kwargs=tok_kwargs)

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    learn = None
    try:
        before_batch_tfm = HF_TokenClassBeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz)
        blocks = (HF_TextBlock(before_batch_tfm=before_batch_tfm, input_return_type=HF_TokenClassInput), HF_TokenCategoryBlock(vocab=labels))
        dblock = DataBlock(blocks=blocks, get_x=ColReader("tokens"), get_y=ColReader("ner_tags"), splitter=RandomSplitter())

        dls = dblock.dataloaders(conll2003_df, bs=bsz)

        model = HF_BaseModelWrapper(hf_model)
        learn = Learner(dls, model, opt_func=partial(Adam), cbs=[HF_BaseModelCallback], splitter=hf_splitter).to_fp16()

        learn.create_opt()  # -> will create your layer groups based on your "splitter" function
        learn.freeze()

        b = dls.one_batch()

        print("*** TESTING DataLoaders ***")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        print("*** TESTING Training/Results ***")
        learn.fit_one_cycle(1, lr_max=3e-5, moms=(0.8, 0.7, 0.8), cbs=[ShortEpochCallback(pct=0.1, short_valid=True), HF_TokenClassMetricsCallback(tok_metrics=["accuracy"])])

        # show the results *before* recording the pass, so that a failure here can't leave both a
        # "PASSED" and a "FAILED" row for the same model
        learn.show_results(learner=learn, max_n=2, trunc_at=10)
        test_results.append((hf_arch, type(hf_tokenizer).__name__, type(hf_model).__name__, "PASSED", ""))

    except Exception as err:
        # keep only a description of the error: holding on to the exception object would also keep its
        # traceback (and everything the traceback's frames reference) alive, working against the cleanup below
        test_results.append((hf_arch, type(hf_tokenizer).__name__, type(hf_model).__name__, "FAILED", repr(err)))

    finally:
        # cleanup
        if learn is not None:
            del learn
        torch.cuda.empty_cache()


In [None]:
# hide_input
test_results_df = pd.DataFrame(test_results, columns=["arch", "tokenizer", "model_name", "result", "error"])
display_df(test_results_df)


## Summary

This module includes all the low, mid, and high-level API bits for training and running inference on token classification tasks.

In [None]:
# hide
from nbdev.export import notebook2script

notebook2script()
