In [None]:
# default_exp data.token_classification


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.token_classification

> This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for token classification tasks (e.g., Named entity recognition (NER), Part-of-speech tagging (POS), etc...)

In [None]:
# export
import ast, os
from typing import Callable, List, Tuple

from datasets import Dataset
from fastcore.all import *
from fastai.data.block import TransformBlock, Category, CategoryMap
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import AutoModelForTokenClassification, logging, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel

from blurr.utils import BLURR
from blurr.data.core import Preprocessor, TextInput, BatchTokenizeTransform, first_blurr_tfm

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from datasets import load_dataset
from fastai.data.block import DataBlock, ColReader, ColSplitter
from fastai.data.core import DataLoader, DataLoaders, TfmdDL
from fastai.data.external import untar_data, URLs
from fastai.data.transforms import *
from fastcore.test import *
from nbdev.showdoc import show_doc
from transformers import AutoTokenizer

from blurr.utils import print_versions
from blurr.data.core import TextBlock

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.16.2


In [None]:
# hide
# cuda
# Prefer GPU #1 (the device used when this documentation was generated), but fall back to GPU #0
# on single-GPU machines where index 1 is not a valid device
gpu_idx = 1 if torch.cuda.device_count() > 1 else 0
torch.cuda.set_device(gpu_idx)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `conll2003` to demonstrate how to configure your blurr code for token classification

In [None]:
# Load the `conll2003` dataset via Hugging Face `datasets`; the last expression displays the resulting object
raw_datasets = load_dataset("conll2003")
raw_datasets


Reusing dataset conll2003 (/home/wgilliam/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 3453
    })
})

We need to get a list of the distinct entities we want to predict. If they are represented as lists in their raw/readable form in another attribute/column of our dataset, we could build a sorted list of the distinct values like this: `labels = sorted(list(set([lbls for sublist in germ_eval_df.labels.tolist() for lbls in sublist])))`.

Fortunately, the `conll2003` dataset allows us to get at this list directly using the code below.

In [None]:
# Show (up to) the first 20 label names for each of the tag columns available in the training split
for tag_col in ("chunk_tags", "ner_tags", "pos_tags"):
    print(raw_datasets["train"].features[tag_col].feature.names[:20])


['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP']
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS']


In [None]:
# hide
# Peek at the first training example: its list of words and the label id assigned to each word
print(raw_datasets["train"][0]["tokens"])
print(raw_datasets["train"][0]["ner_tags"])


['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]


In [None]:
# The NER label names; `ner_tags` values are used as indices into this list elsewhere in the notebook
labels = raw_datasets["train"].features["ner_tags"].feature.names
labels


['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
# Materialize the training split as a DataFrame for the preprocessing examples and tests below
conll2003_df = pd.DataFrame(raw_datasets["train"])


In [None]:
# The Hugging Face model class to use for token classification
model_cls = AutoModelForTokenClassification

pretrained_model_name = "roberta-base"  # "bert-base-multilingual-cased"
# The model config is told how many labels to predict (one per NER label name)
n_labels = len(labels)

# Fetch the architecture name, config, tokenizer, and model for the chosen checkpoint
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    pretrained_model_name, model_cls=model_cls, config_kwargs={"num_labels": n_labels}
)

# Display what we got back
hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)


('roberta',
 transformers.models.roberta.configuration_roberta.RobertaConfig,
 transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
 transformers.models.roberta.modeling_roberta.RobertaForTokenClassification)

## Preprocessing

Starting with version 2.0, BLURR provides a token classification preprocessing class that can be used to preprocess DataFrames or Hugging Face Datasets. We also introduce a novel way of handling long documents for this task that ensures the tokens associated with a word are not split up across "chunked" documents. See below for an example.

In [None]:
# export
class TokenClassPreprocessor(Preprocessor):
    """
    Preprocesses DataFrames or Hugging Face Datasets for token classification. For every example it adds a
    `proc_{word_list_attr}` and a `proc_{label_list_attr}` column holding the words/labels that will actually be
    used. When `chunk_examples=True`, examples whose tokenized length exceeds `max_length` are split into several
    rows such that the tokens belonging to a single word are never split between chunks.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Set to `True` if the preprocessor should chunk examples that exceed `max_length`
        chunk_examples: bool = False,
        # Like "stride" except for words (not tokens)
        word_stride: int = 2,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The label names (if not specified, will build from DataFrame)
        label_names: Optional[List[str]] = None,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the list of words
        word_list_attr: str = "tokens",
        # The attribute holding the list of labels (one for each word in `word_list_attr`)
        label_list_attr: str = "labels",
        # The attribute that should be created if your are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # If using a slow tokenizer, users will need to prove a `slow_word_ids_func` that accepts a
        # tokenizzer, example index, and a batch encoding as arguments and in turn returnes the
        # equavlient of fast tokenizer's `word_ids`
        slow_word_ids_func: Optional[Callable] = None,
        # Tokenization kwargs that will be applied with calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # tokenizer requires this kwargs when tokenizing text (this builds a new dict, so the caller's
        # `tok_kwargs` and the shared default are never mutated)
        tok_kwargs = {**tok_kwargs, **{"is_split_into_words": True}}

        super().__init__(hf_tokenizer, batch_size, text_attr=word_list_attr, tok_kwargs=tok_kwargs)

        self.id_attr = id_attr
        self.label_list_attr = label_list_attr
        self.is_valid_attr = is_valid_attr
        self.label_names = label_names
        self.chunk_examples, self.word_stride = chunk_examples, word_stride

        self.slow_word_ids_func = slow_word_ids_func

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Returns a new DataFrame with the `proc_*` word and label columns moved to the front. Sets
        `self.label_names` from the data if it was not provided.
        """
        # NOTE(review): presumably the base class combines the training/validation frames - confirm in `Preprocessor`
        df = super().process_df(training_df, validation_df)

        # build the label names from the distinct labels found in the data if they weren't supplied
        if self.label_names is None:
            self.label_names = sorted(list(set([lbls for sublist in df[self.label_list_attr].tolist() for lbls in sublist])))

        # `max_length` is only read here (never popped) so that `self.tok_kwargs` stays intact and calling
        # `process_df` again on the same instance yields the same chunking
        max_length = self.tok_kwargs.get("max_length", self.hf_tokenizer.model_max_length)

        # when chunking, every example gets a unique Id column if no Id attribute was specified
        if self.chunk_examples and self.id_attr is None:
            df.insert(0, "_id", range(len(df)))

        # process df in mini-batches and concatenate once (`DataFrame.append` no longer exists in pandas >= 2.0)
        batches = [
            self._process_df_batch(batch_df, self.chunk_examples, max_length)
            for _, batch_df in df.groupby(np.arange(len(df)) // self.batch_size)
        ]
        final_df = pd.concat(batches) if batches else pd.DataFrame()
        final_df.reset_index(drop=True, inplace=True)

        # move the processed bits up to the front
        col = final_df.pop(f"proc_{self.text_attr}")
        final_df.insert(0, col.name, col)
        col = final_df.pop(f"proc_{self.label_list_attr}")
        final_df.insert(1, col.name, col)

        return final_df

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """Same as `process_df` but accepts and returns Hugging Face `Dataset` objects"""
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _get_word_ids(self, inputs, idx):
        """Returns the word id of every token of example `idx` in `inputs` (`None` marks special tokens)"""
        if self.hf_tokenizer.is_fast:
            return inputs.word_ids(idx)
        if self.slow_word_ids_func is None:
            raise ValueError("A `slow_word_ids_func` is required when using a slow tokenizer")
        return self.slow_word_ids_func(self.hf_tokenizer, idx, inputs)

    @staticmethod
    def _chunk_word_ids(token_word_ids, max_chunk_length, word_stride):
        """
        Splits `token_word_ids` (the word id of each non-special token, in order) into chunks without breaking a
        word between chunks. Consecutive chunks may overlap depending on `word_stride`. Returns a list of lists
        of word ids.
        """
        last_pos = len(token_word_ids) - 1
        final_word_id = max(token_word_ids)

        def last_index_of(word_id):
            # position of the last token belonging to `word_id`
            return last_pos - token_word_ids[::-1].index(word_id)

        chunks = []
        start_idx, current_word_id, current_chunk_length = 0, 0, 0
        # position of the right-most token included in any chunk so far
        last_covered_idx = -1

        while True:
            last_idx = last_index_of(current_word_id)
            current_chunk_length = len(token_word_ids[start_idx : last_idx + 1])

            if current_chunk_length >= max_chunk_length:
                # we need to add a chunk
                if current_chunk_length > max_chunk_length:
                    # only when the current chunk is > the max chunk length do we want to end the chunk at the
                    # previous word (if equal then we want to use the current value)
                    last_idx = last_index_of(max(0, current_word_id - 1))
                chunks.append(token_word_ids[start_idx : last_idx + 1])
                last_covered_idx = max(last_covered_idx, last_idx)

                # start a new chunk
                current_chunk_length = 0

                if word_stride == 0 or token_word_ids.index(max(0, current_word_id - word_stride)) <= start_idx:
                    # if "word_stride" = 0 or going back "word_stride" would go back beyond the start of the last
                    # chunk (and so never make progress), we don't stride ... we just move to the next token
                    start_idx = last_idx + 1
                else:
                    current_word_id -= word_stride - 1
                    start_idx = token_word_ids.index(current_word_id)

            current_word_id += 1

            if current_word_id >= final_word_id:
                # add whatever has not been covered yet; checking coverage (rather than the length of the
                # in-progress chunk) keeps the last word(s) when a chunk was closed in this very iteration
                if last_covered_idx < last_pos:
                    chunks.append(token_word_ids[start_idx:])
                break

        return chunks

    def _process_df_batch(self, batch_df, is_chunked, max_length):
        """Adds the `proc_*` columns to a batch; returns one row per example, or one row per chunk if `is_chunked`"""
        # work on a re-indexed copy so positional and label-based access agree
        batch_df = batch_df.reset_index(drop=True)

        # grab our inputs
        if not is_chunked:
            # token classification works with lists of words, so if not listy we resort to splitting by spaces
            batch_df[self.text_attr] = batch_df[self.text_attr].apply(lambda v: v if is_listy(v) else v.split())
            inputs = self._tokenize_function(batch_df.to_dict(orient="list"))

            proc_toks, proc_labels = [], []
            for idx in range(len(inputs["input_ids"])):
                # the distinct words represented in the tokenized example; `sorted` because set iteration
                # order is not guaranteed and the words/labels must stay in their original order
                kept_word_ids = sorted({word_id for word_id in self._get_word_ids(inputs, idx) if word_id is not None})
                example = batch_df.iloc[idx]
                proc_toks.append([example[self.text_attr][word_id] for word_id in kept_word_ids])
                proc_labels.append([example[self.label_list_attr][word_id] for word_id in kept_word_ids])

            batch_df[f"proc_{self.text_attr}"] = pd.Series(proc_toks)
            batch_df[f"proc_{self.label_list_attr}"] = pd.Series(proc_labels)
            return batch_df

        # if we get here, we need create "chunked" inputs/labels from the existing input/label ensuring that
        # words are *not* broken up between chunks. `max_length` is left out of the tokenizer kwargs here
        # because the chunking below enforces the length itself
        chunk_tok_kwargs = {k: v for k, v in self.tok_kwargs.items() if k != "max_length"}
        max_chunk_length = max_length - self.hf_tokenizer.num_special_tokens_to_add()

        proc_data = []
        for _, row in batch_df.iterrows():
            # fetch word list and words' label list (there should be 1 label per word)
            words = row[self.text_attr] if is_listy(row[self.text_attr]) else row[self.text_attr].split()
            word_labels = row[self.label_list_attr]

            inputs = self.hf_tokenizer(words, **chunk_tok_kwargs)
            token_word_ids = [word_id for word_id in self._get_word_ids(inputs, 0) if word_id is not None]

            for chunk in self._chunk_word_ids(token_word_ids, max_chunk_length, self.word_stride):
                # one output row per chunk, carrying over all of the original example's columns
                chunk_word_ids = sorted(set(chunk))
                overflow_row = row.copy()
                overflow_row[f"proc_{self.text_attr}"] = [words[word_id] for word_id in chunk_word_ids]
                overflow_row[f"proc_{self.label_list_attr}"] = [word_labels[word_id] for word_id in chunk_word_ids]
                proc_data.append(overflow_row)

        return pd.DataFrame(proc_data)


How to preprocess your data (labels are Ids)

In [None]:
# Chunk long examples: with `max_length=8` any example that doesn't fit is split into several rows,
# each new chunk re-using up to `word_stride` words from the end of the previous one
preprocessor = TokenClassPreprocessor(
    hf_tokenizer,
    chunk_examples=True,
    word_stride=2,
    label_names=labels,
    id_attr="id",
    word_list_attr="tokens",
    label_list_attr="ner_tags",
    tok_kwargs={"max_length": 8},
)
proc_df = preprocessor.process_df(conll2003_df)

# The row count can exceed the number of raw examples because of the chunking
print(len(proc_df))
print(preprocessor.label_names)
proc_df.head(4)


61298
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


Unnamed: 0,proc_tokens,proc_ner_tags,chunk_tags,id,ner_tags,pos_tags,tokens
0,"[EU, rejects, German, call, to, boycott]","[3, 0, 7, 0, 0, 0]","[11, 21, 11, 12, 21, 22, 11, 12, 0]",0,"[3, 0, 7, 0, 0, 0, 7, 0, 0]","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[EU, rejects, German, call, to, boycott, British, lamb, .]"
1,"[to, boycott, British, lamb, .]","[0, 0, 7, 0, 0]","[11, 21, 11, 12, 21, 22, 11, 12, 0]",0,"[3, 0, 7, 0, 0, 0, 7, 0, 0]","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[EU, rejects, German, call, to, boycott, British, lamb, .]"
2,"[Peter, Blackburn]","[1, 2]","[11, 12]",1,"[1, 2]","[22, 22]","[Peter, Blackburn]"
3,"[BRUSSELS, 1996-08-22]","[5, 0]","[11, 12]",2,"[5, 0]","[22, 11]","[BRUSSELS, 1996-08-22]"


How to preprocess your data (labels are entity names)

In [None]:
# Build a copy of the raw DataFrame in which every NER label id is replaced by its label name
conll2003_labeled_df = conll2003_df.assign(
    ner_tags=conll2003_df["ner_tags"].map(lambda tag_ids: [labels[tag_id] for tag_id in tag_ids])
)
conll2003_labeled_df.head(5)


Unnamed: 0,chunk_tags,id,ner_tags,pos_tags,tokens
0,"[11, 21, 11, 12, 21, 22, 11, 12, 0]",0,"[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[EU, rejects, German, call, to, boycott, British, lamb, .]"
1,"[11, 12]",1,"[B-PER, I-PER]","[22, 22]","[Peter, Blackburn]"
2,"[11, 12]",2,"[B-LOC, O]","[22, 11]","[BRUSSELS, 1996-08-22]"
3,"[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17, 11, 21, 17, 11, 12, 12, 21, 22, 22, 13, 11, 0]",3,"[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O, O, O, O, O, B-MISC, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 16, 21, 21, 20, 37, 40, 35, 21, 7]","[The, European, Commission, said, on, Thursday, it, disagreed, with, German, advice, to, consumers, to, shun, British, lamb, until, scientists, determine, whether, mad, cow, disease, can, be, transmitted, to, sheep, .]"
4,"[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 12, 21, 13, 11, 12, 21, 22, 11, 13, 11, 1, 13, 11, 17, 11, 12, 12, 21, 1, 0]",4,"[B-LOC, O, O, O, O, B-ORG, I-ORG, O, O, O, B-PER, I-PER, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O, O, O, O, O, O, O]","[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]","[Germany, 's, representative, to, the, European, Union, 's, veterinary, committee, Werner, Zwingmann, said, on, Wednesday, consumers, should, buy, sheepmeat, from, countries, other, than, Britain, until, the, scientific, advice, was, clearer, .]"


In [None]:
# No chunking this time (`chunk_examples` defaults to `False`), so we get exactly one row per raw example
preprocessor = TokenClassPreprocessor(
    hf_tokenizer, label_names=labels, id_attr="id", word_list_attr="tokens", label_list_attr="ner_tags", tok_kwargs={"max_length": 8}
)
proc_df = preprocessor.process_df(conll2003_labeled_df)

print(len(proc_df))
print(preprocessor.label_names)
proc_df.head(4)


14041
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


Unnamed: 0,proc_tokens,proc_ner_tags,chunk_tags,id,ner_tags,pos_tags,tokens
0,"[EU, rejects, German, call, to, boycott]","[B-ORG, O, B-MISC, O, O, O]","[11, 21, 11, 12, 21, 22, 11, 12, 0]",0,"[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[EU, rejects, German, call, to, boycott, British, lamb, .]"
1,"[Peter, Blackburn]","[B-PER, I-PER]","[11, 12]",1,"[B-PER, I-PER]","[22, 22]","[Peter, Blackburn]"
2,"[BRUSSELS, 1996-08-22]","[B-LOC, O]","[11, 12]",2,"[B-LOC, O]","[22, 11]","[BRUSSELS, 1996-08-22]"
3,"[The, European, Commission, said, on, Thursday]","[O, B-ORG, I-ORG, O, O, O]","[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17, 11, 21, 17, 11, 12, 12, 21, 22, 22, 13, 11, 0]",3,"[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O, O, O, O, O, B-MISC, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 16, 21, 21, 20, 37, 40, 35, 21, 7]","[The, European, Commission, said, on, Thursday, it, disagreed, with, German, advice, to, consumers, to, shun, British, lamb, until, scientists, determine, whether, mad, cow, disease, can, be, transmitted, to, sheep, .]"


## Labeling strategies

### `BaseLabelingStrategy` and implementations

Here we include a `BaseLabelingStrategy` abstract class and several different strategies for assigning labels to your tokenized inputs. The "only first token" and "B/I" labeling strategies are discussed in the ["Token Classification"](https://huggingface.co/course/chapter7/2?fw=pt) section in chapter 7 of the Hugging Face course.

In [None]:
# export
class BaseLabelingStrategy:
    """
    Base class for the strategies that decide which label every token of a tokenized input receives.
    Subclasses must implement `align_labels_with_tokens`.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The label names (used by subclasses to convert between label names and label Ids)
        label_names: Optional[List[str]],
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
    ) -> None:
        self.hf_tokenizer, self.label_names, self.ignore_token_id = hf_tokenizer, label_names, ignore_token_id

    def align_labels_with_tokens(self, word_ids, word_labels):
        """Return one label per entry of `word_ids` given the per-word `word_labels` (implemented by subclasses)"""
        raise NotImplementedError()


In [None]:
# export
class OnlyFirstTokenLabelingStrategy(BaseLabelingStrategy):
    """
    Only the first token of word is associated with the label (all other subtokens with the `ignore_index_id`). Works where labels
    are Ids or strings (in the later case we'll use the `label_names` to look up it's Id)
    """

    def align_labels_with_tokens(self, word_ids, word_labels):
        """
        `word_ids` holds the word index of every token (`None` for special tokens) and `word_labels` one label
        per word. Returns a list with one label Id per token.
        """
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id == current_word:
                # special token or another subtoken of current word
                new_labels.append(self.ignore_token_id)
                continue

            # start of a new word
            current_word = word_id
            label = self.ignore_token_id if word_id is None else word_labels[word_id]
            # integer labels (including NumPy integers, e.g., when the labels come from an array) are already
            # Ids; anything else is treated as a label name
            is_label_id = isinstance(label, (int, np.integer))
            new_labels.append(int(label) if is_label_id else self.label_names.index(label))

        return new_labels


class SameLabelLabelingStrategy(BaseLabelingStrategy):
    """
    Every token associated with a given word is associated with the word's label. Works where labels
    are Ids or strings (in the later case we'll use the `label_names` to look up it's Id)
    """

    def align_labels_with_tokens(self, word_ids, word_labels):
        """
        `word_ids` holds the word index of every token (`None` for special tokens) and `word_labels` one label
        per word. Returns a list with one label Id per token.
        """
        new_labels = []
        for word_id in word_ids:
            if word_id is None:
                # special token
                new_labels.append(self.ignore_token_id)
                continue

            label = word_labels[word_id]
            # integer labels (including NumPy integers, e.g., when the labels come from an array) are already
            # Ids; anything else is treated as a label name
            is_label_id = isinstance(label, (int, np.integer))
            new_labels.append(int(label) if is_label_id else self.label_names.index(label))

        return new_labels


class BILabelingStrategy(BaseLabelingStrategy):
    """
    If using B/I labels, the first token assoicated to a given word gets the "B" label while all other tokens related
    to that same word get "I" labels.  If "I" labels don't exist, this strategy behaves like the `OnlyFirstTokenLabelingStrategy`.
    Works where labels are Ids or strings (in the later case we'll use the `label_names` to look up it's Id)
    """

    def align_labels_with_tokens(self, word_ids, word_labels):
        """
        `word_ids` holds the word index of every token (`None` for special tokens) and `word_labels` one label
        per word. Returns a list with one label Id per token.
        """
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id is not None:
                label = word_labels[word_id]
                # integer labels (including NumPy integers, e.g., when the labels come from an array) are
                # Ids; anything else is treated as a label name
                is_label_id = isinstance(label, (int, np.integer))

            if word_id != current_word:
                # start of a new word (or a special token following a word)
                current_word = word_id
                if word_id is None:
                    new_labels.append(self.ignore_token_id)
                else:
                    new_labels.append(int(label) if is_label_id else self.label_names.index(label))
            elif word_id is None:
                # special token
                new_labels.append(self.ignore_token_id)
            else:
                # we're in the same word
                label_name = self.label_names[label] if is_label_id else label

                # append the I-{ENTITY} Id if it exists in `label_names`, else ignore the subtoken (which is
                # what the `OnlyFirstTokenLabelingStrategy` does)
                iLabel = f"I-{label_name[2:]}"
                new_labels.append(self.label_names.index(iLabel) if iLabel in self.label_names else self.ignore_token_id)

        return new_labels


### Reconstructing inputs/labels

The utility methods below allow blurr users to reconstruct the original word/label associations from the input_ids/label associations.  For example, these are used in our token classification `show_batch` method below.

In [None]:
# export
def get_token_labels_from_input_ids(
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # List of input_ids for the tokens in a single piece of processed text
    input_ids: List[int],
    # List of label indexs for each token
    token_label_ids: List[int],
    # List of label names from witch the `label` indicies can be used to find the name of the label
    vocab: List[str],
    # The token ID that should be ignored when calculating the loss
    ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
    # The token used to identifiy ignored tokens (default: [xIGNx])
    ignore_token: str = "[xIGNx]",
) -> List[Tuple[str, str]]:
    """
    Given a list of input IDs, the label ID associated to each, and the labels vocab, this method will return a list of tuples whereby
    each tuple defines the "token" and its label name. Special tokens are left out. For example:
    [('ĠWay', 'B-PER'), ('de', 'B-PER'), ('ĠGill', 'I-PER'), ('iam', 'I-PER'), ('Ġloves', 'O'), ('ĠHug', 'B-ORG'), ('ging', 'B-ORG'), ('ĠFace', 'I-ORG')]
    """
    # convert ids to tokens
    toks = hf_tokenizer.convert_ids_to_tokens(input_ids)
    # read the special ids once instead of once per token; kept as a list (not a set) so that membership
    # is decided by `==`, which also works if the ids are not plain, hashable ints
    special_ids = hf_tokenizer.all_special_ids
    # align "tokens" with labels
    tok_labels = [
        (tok, ignore_token if label_id == ignore_token_id else vocab[label_id])
        for tok_id, tok, label_id in zip(input_ids, toks, token_label_ids)
        if tok_id not in special_ids
    ]
    return tok_labels


In [None]:
# TESTS for get_token_labels_from_input_ids()
for idx in range(3):
    raw_word_list = conll2003_df.iloc[idx]["tokens"]
    raw_label_list = conll2003_df.iloc[idx]["ner_tags"]

    be = hf_tokenizer(raw_word_list, is_split_into_words=True)
    input_ids = be["input_ids"]
    # every token gets its word's label; special tokens (no word id) get the ignore id
    targ_ids = [-100 if word_id is None else raw_label_list[word_id] for word_id in be.word_ids()]

    tok_labels = get_token_labels_from_input_ids(hf_tokenizer, input_ids, targ_ids, labels)
    expected_label_ids = [label_id for label_id in targ_ids if label_id != -100]

    # `zip` stops at the shorter input, so check the lengths explicitly to catch missing/extra tokens
    test_eq(len(tok_labels), len(expected_label_ids))
    for tok_label, targ_id in zip(tok_labels, expected_label_ids):
        test_eq(tok_label[1], labels[targ_id])


In [None]:
# Render the API documentation for `get_token_labels_from_input_ids`
show_doc(get_token_labels_from_input_ids)


<h4 id="get_token_labels_from_input_ids" class="doc_header"><code>get_token_labels_from_input_ids</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_token_labels_from_input_ids</code>(**`hf_tokenizer`**:`PreTrainedTokenizerBase`, **`input_ids`**:`List`\[`int`\], **`token_label_ids`**:`List`\[`int`\], **`vocab`**:`List`\[`str`\], **`ignore_token_id`**:`int`=*`-100`*, **`ignore_token`**:`str`=*`'[xIGNx]'`*)

Given a list of input IDs, the label ID associated to each, and the labels vocab, this method will return a list of tuples whereby
each tuple defines the "token" and its label name. For example:
[('ĠWay', B-PER), ('de', B-PER), ('ĠGill', I-PER), ('iam', I-PER), ('Ġloves'), ('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG)]

||Type|Default|Details|
|---|---|---|---|
|**`hf_tokenizer`**|`PreTrainedTokenizerBase`||A Hugging Face tokenizer|
|**`input_ids`**|`List[int]`||List of input_ids for the tokens in a single piece of processed text|
|**`token_label_ids`**|`List[int]`||List of label indexs for each token|
|**`vocab`**|`List[str]`||List of label names from witch the `label` indicies can be used to find the name of the label|
|**`ignore_token_id`**|`int`|`-100`|The token ID that should be ignored when calculating the loss|
|**`ignore_token`**|`str`|`[xIGNx]`|The token used to identifiy ignored tokens (default: [xIGNx])|


In [None]:
# export
def get_word_labels_from_token_labels(
    hf_arch: str,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A list of tuples, where each represents a token and its label (e.g., [('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG), ...])
    tok_labels,
) -> List[Tuple[str, str]]:
    """
    Collapses a list of (token, label) tuples into a list of (word, label) tuples, where each word receives the
    label of its first token. The words are recovered by converting the tokens back into a string and splitting
    it on whitespace, so the model inputs are assumed to have been a list of words.
    """
    # rebuild the text from the tokens and split it back into words
    tokens = [tok_label[0] for tok_label in tok_labels]
    words = hf_tokenizer.convert_tokens_to_string(tokens).split()

    # for the "canine" architecture a trailing space is added to every word
    word_suffix = " " if hf_arch == "canine" else ""

    word_labels = []
    tok_idx = 0
    for raw_word in words:
        word = f"{raw_word}{word_suffix}"
        word_labels.append((word, tok_labels[tok_idx][1]))
        # skip ahead by the number of tokens the tokenizer produces for this word
        tok_idx += len(hf_tokenizer.tokenize(word))

    return word_labels


In [None]:
# TESTS for get_word_labels_from_token_labels()
for idx in range(5):
    raw_word_list = conll2003_df.iloc[idx]["tokens"]
    raw_label_list = conll2003_df.iloc[idx]["ner_tags"]

    be = hf_tokenizer(raw_word_list, is_split_into_words=True)
    input_ids = be["input_ids"]
    # every token gets its word's label; special tokens (no word id) get the ignore id
    targ_ids = [-100 if word_id is None else raw_label_list[word_id] for word_id in be.word_ids()]

    tok_labels = get_token_labels_from_input_ids(hf_tokenizer, input_ids, targ_ids, labels)
    word_labels = get_word_labels_from_token_labels(hf_arch, hf_tokenizer, tok_labels)

    # `zip` stops at the shorter input, so check the lengths explicitly to catch missing/extra words
    test_eq(len(word_labels), len(raw_word_list))
    for word_label, raw_word, raw_label_id in zip(word_labels, raw_word_list, raw_label_list):
        test_eq(word_label[0], raw_word)
        test_eq(word_label[1], labels[raw_label_id])


In [None]:
# Render the API documentation for `get_word_labels_from_token_labels`
show_doc(get_word_labels_from_token_labels)


<h4 id="get_word_labels_from_token_labels" class="doc_header"><code>get_word_labels_from_token_labels</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_word_labels_from_token_labels</code>(**`hf_arch`**:`str`, **`hf_tokenizer`**:`PreTrainedTokenizerBase`, **`tok_labels`**)

Given a list of tuples where each tuple defines a token and its label, return a list of tuples whereby each tuple defines the
"word" and its label. Method assumes that model inputs are a list of words, and in conjunction with the `align_labels_with_tokens` method,
allows the user to reconstruct the orginal raw inputs and labels.

||Type|Default|Details|
|---|---|---|---|
|**`hf_arch`**|`str`||*No Content*|
|**`hf_tokenizer`**|`PreTrainedTokenizerBase`||A Hugging Face tokenizer|
|**`tok_labels`**|||A list of tuples, where each represents a token and its label (e.g., [('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG), ...])|


## Mid-level API

### Targets

#### `TokenTensorCategory`

In [None]:
# export
class TokenTensorCategory(TensorBase):
    """Tensor type for per-token category targets; `TokenCategorize.encodes` returns instances of this class"""

    pass


#### `TokenCategorize`

`TokenCategorize` modifies the fastai `Categorize` transform in a couple of ways.  

First, it allows your targets to consist of a `Category` ***per*** token, and second, it uses the idea of an `ignore_token_id` to mask subtokens that don't need a prediction.  For example, the target of special tokens (e.g., pad, cls, sep) are set to `ignore_token_id` as are subsequent sub-tokens of a given token should more than 1 sub-token make it up.

In [None]:
# export
class TokenCategorize(Transform):
    """Reversible transform of a list of category string to `vocab` id"""

    def __init__(
        self,
        # The unique list of entities (e.g., B-LOC) (default: CategoryMap(vocab))
        vocab: Optional[List[str]] = None,
        # The token used to identifiy ignored tokens (default: xIGNx)
        ignore_token: str = "[xIGNx]",
        # The token ID that should be ignored when calculating the loss (default: CrossEntropyLossFlat().ignore_index)
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
    ):
        # keep the vocab in the order given (`sort=False`) so that label Ids match their position in `vocab`
        self.vocab = None if vocab is None else CategoryMap(vocab, sort=False)
        self.ignore_token, self.ignore_token_id = ignore_token, ignore_token_id

        # the loss function skips every target equal to `ignore_token_id`
        self.loss_func, self.order = CrossEntropyLossFlat(ignore_index=self.ignore_token_id), 1

    # NOTE: `setups`/`encodes`/`decodes` are intentionally left without type annotations as they can
    # affect how a fastcore `Transform` dispatches and casts
    def setups(self, dsets):
        # build the vocab from the data only when one wasn't passed in
        if self.vocab is None and dsets is not None:
            self.vocab = CategoryMap(dsets)
        self.c = len(self.vocab)

    def encodes(self, labels):
        # integer values (including NumPy integers, e.g., when the labels come from an array) are already Ids;
        # if `val` is the label name (e.g., B-PER, I-PER, etc...), lookup the corresponding index in the vocab
        # using `self.vocab.o2i`
        ids = [int(val) if isinstance(val, (int, np.integer)) else self.vocab.o2i[val] for val in labels]
        return TokenTensorCategory(ids)

    def decodes(self, encoded_labels):
        # ignored positions are dropped, so only the label names of the remaining positions are returned
        return Category([(self.vocab[lbl_id]) for lbl_id in encoded_labels if lbl_id != self.ignore_token_id])


#### `TokenCategoryBlock`

In [None]:
# export
def TokenCategoryBlock(
    # The unique list of entities (e.g., B-LOC) (default: CategoryMap(vocab))
    vocab: Optional[List[str]] = None,
    # The token used to identifiy ignored tokens (default: xIGNx)
    ignore_token: str = "[xIGNx]",
    # The token ID that should be ignored when calculating the loss (default: CrossEntropyLossFlat().ignore_index)
    ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
):
    """`TransformBlock` for per-token categorical targets"""
    # the block consists of a single type transform: a `TokenCategorize` configured with the given arguments
    categorize_tfm = TokenCategorize(vocab=vocab, ignore_token=ignore_token, ignore_token_id=ignore_token_id)
    return TransformBlock(type_tfms=categorize_tfm)


In [None]:
# Render the API documentation for `TokenCategoryBlock`
show_doc(TokenCategoryBlock)


<h4 id="TokenCategoryBlock" class="doc_header"><code>TokenCategoryBlock</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>TokenCategoryBlock</code>(**`vocab`**:`Optional`\[`List`\[`str`\]\]=*`None`*, **`ignore_token`**:`str`=*`'[xIGNx]'`*, **`ignore_token_id`**:`int`=*`-100`*)

`TransformBlock` for per-token categorical targets

||Type|Default|Details|
|---|---|---|---|
|**`vocab`**|`List[str]]`|``|The unique list of entities (e.g., B-LOC) (default: CategoryMap(vocab))|
|**`ignore_token`**|`str`|`[xIGNx]`|The token used to identifiy ignored tokens (default: xIGNx)|
|**`ignore_token_id`**|`int`|`-100`|The token ID that should be ignored when calculating the loss (default: CrossEntropyLossFlat().ignore_index)|


### Inputs

#### `TokenClassTextInput`

Again, we define a custom class, `TokenClassTextInput`, for the `@typedispatch`ed methods to use so that we can override how token classification inputs/targets are assembled, as well as, how the data is shown via methods like `show_batch` and `show_results`.

In [None]:
# export
class TokenClassTextInput(TextInput):
    """Marker subclass of `TextInput` that `@typedispatch`ed methods (e.g., `show_batch`) use to target token classification inputs"""


#### `TokenClassBatchTokenizeTransform`

`TokenClassBatchTokenizeTransform` is used to exclude any of the target's tokens we don't want to include in the loss calculation (e.g. padding, cls, sep, etc...).

Note also that we default `is_split_into_words = True` since token classification tasks expect a list of words and labels for each word.

In [None]:
# export
class TokenClassBatchTokenizeTransform(BatchTokenizeTransform):
    """Batch-time tokenization for token classification tasks. After the parent transform tokenizes a batch,
    each example's labels are re-aligned to its tokens using the configured labeling strategy"""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The labeling strategy you want to apply when associating labels with word tokens
        labeling_strategy_cls: BaseLabelingStrategy = OnlyFirstTokenLabelingStrategy,
        # The target label names
        target_label_names: Optional[List[str]] = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: Optional[int] = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = True,
        # If using a slow tokenizer, users will need to provide a `slow_word_ids_func` that accepts a
        # tokenizer, example index, and a batch encoding as arguments and in turn returns the
        # equivalent of a fast tokenizer's `word_ids`
        slow_word_ids_func: Optional[Callable] = None,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `TokenClassBatchTokenizeTransform`
        **kwargs
    ):
        # forward a copy so that neither the shared `{}` default nor the caller's own dict can be
        # mutated through this instance (a `None` is passed through untouched)
        tok_kwargs = tok_kwargs if tok_kwargs is None else dict(tok_kwargs)

        super().__init__(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            include_labels=include_labels,
            ignore_token_id=ignore_token_id,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=is_split_into_words,
            tok_kwargs=tok_kwargs,
            **kwargs
        )

        self.labeling_strategy = labeling_strategy_cls(hf_tokenizer, label_names=target_label_names, ignore_token_id=ignore_token_id)
        self.target_label_names = target_label_names
        self.slow_word_ids_func = slow_word_ids_func

    def encodes(self, samples):
        """Tokenize `samples` with the parent transform, then replace each target with labels aligned to the tokens"""
        encoded_samples, inputs = super().encodes(samples, return_batch_encoding=True)

        # nothing to align for an empty batch or when there are no targets (e.g., when used for inference)
        if len(encoded_samples) == 0 or len(encoded_samples[0]) == 1:
            return encoded_samples

        # a slow tokenizer needs the user-supplied `slow_word_ids_func`; fail with an actionable message
        # (same exception type as the bare `None(...)` call would have raised)
        uses_fast_tokenizer = self.hf_tokenizer.is_fast
        if not uses_fast_tokenizer and self.slow_word_ids_func is None:
            raise TypeError(
                "`slow_word_ids_func` is required when `hf_tokenizer` is not a fast tokenizer "
                "because its word ids must then be computed by that function"
            )

        # get the type of our targets (by default will be TokenTensorCategory)
        target_cls = type(encoded_samples[0][1])

        updated_samples = []
        for idx, s in enumerate(encoded_samples):
            # with batch-time tokenization, we have to align each token with the correct label using the `word_ids`
            # from the batch encoding (fast tokenizers) or from `slow_word_ids_func` (slow tokenizers)
            if uses_fast_tokenizer:
                word_ids = inputs.word_ids(idx)
            else:
                word_ids = self.slow_word_ids_func(self.hf_tokenizer, idx, inputs)

            targ_ids = target_cls(self.labeling_strategy.align_labels_with_tokens(word_ids, s[1].tolist()))

            # optionally expose the aligned labels to the model through the inputs dict
            if self.include_labels and len(targ_ids) > 0:
                s[0]["labels"] = targ_ids

            updated_samples.append((s[0], targ_ids))

        return updated_samples


## Examples

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

In [None]:
pretrained_model_name = "distilroberta-base"
n_labels = len(labels)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    pretrained_model_name, model_cls=AutoModelForTokenClassification, config_kwargs={"num_labels": n_labels}
)

hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)


('roberta',
 transformers.models.roberta.configuration_roberta.RobertaConfig,
 transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
 transformers.models.roberta.modeling_roberta.RobertaForTokenClassification)

#####  Step 2: Create your `DataBlock`

In [None]:
batch_tok_tfm = TokenClassBatchTokenizeTransform(hf_arch, hf_config, hf_tokenizer, hf_model, target_label_names=labels)
blocks = (TextBlock(batch_tokenize_tfm=batch_tok_tfm, input_return_type=TokenClassTextInput), TokenCategoryBlock(vocab=labels))

dblock = DataBlock(blocks=blocks, get_x=ColReader("tokens"), get_y=ColReader("ner_tags"), splitter=RandomSplitter())


In [None]:
# hide
# dblock.summary(conll2003_df)


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(conll2003_df, bs=4)


In [None]:
b = dls.one_batch()


In [None]:
len(b), b[0]["input_ids"].shape, b[1].shape


(2, torch.Size([4, 156]), torch.Size([4, 156]))

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `TokenClassTextInput` typed inputs
    x: TokenClassTextInput,
    # The targets paired with `x`
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Display up to `max_n` examples as a one-column table of stringified (word, label) pairs"""
    # the blurr transform carries the Hugging Face architecture name and tokenizer used for decoding
    blurr_tfm = first_blurr_tfm(dataloaders, tfms=[TokenClassBatchTokenizeTransform])
    arch = blurr_tfm.hf_arch
    tokenizer = blurr_tfm.hf_tokenizer
    label_vocab = dataloaders.vocab

    rows = L()
    # `samples` is not read, but it stays in the zip so the number of iterations is unchanged
    for input_ids, targets, _ in zip(x, y, samples):
        # token-level pairs first, then collapse them to word-level pairs
        token_labels = get_token_labels_from_input_ids(tokenizer, input_ids, targets, label_vocab)
        word_labels = get_word_labels_from_token_labels(arch, tokenizer, token_labels)

        # keep only the first `trunc_at` pairs (all of them when `trunc_at` is None)
        shown_pairs = [pair for pos, pair in enumerate(word_labels) if trunc_at is None or pos < trunc_at]
        rows.append([str(shown_pairs)])

    results_df = pd.DataFrame(rows, columns=["word / target label"])
    display_df(results_df[:max_n])
    return ctxs


In [None]:
dls.show_batch(dataloaders=dls, max_n=5, trunc_at=20)


Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('""', 'O'), ('There', 'O'), ('are', 'O'), ('no', 'O'), ('parks', 'O'), ('or', 'O'), ('empty', 'O'), ('areas', 'O'), ('of', 'O'), ('land', 'O'), ('around', 'O'), ('here', 'O'), (',', 'O'), ('so', 'O'), ('when', 'O'), ('we', 'O'), ('want', 'O'), ('to', 'O'), ('play', 'O'), ('a', 'O')]"
2,"[('The', 'B-ORG'), ('Statesman', 'I-ORG'), ('newspaper', 'O'), ('quoted', 'O'), ('40-year-old', 'O'), ('Mangala', 'B-PER'), ('Das', 'I-PER'), (',', 'O'), ('paralysed', 'O'), ('from', 'O'), ('her', 'O'), ('waist', 'O'), ('down', 'O'), ('and', 'O'), ('a', 'O'), ('resident', 'O'), ('of', 'O'), ('the', 'O'), ('Prem', 'B-ORG'), ('Dan', 'I-ORG')]"
3,"[('Provision', 'O'), ('of', 'O'), ('overland', 'O'), ('transport', 'O'), ('services', 'O'), ('for', 'O'), ('material', 'O'), ('and', 'O'), ('equipment', 'O'), ('for', 'O'), ('European', 'B-ORG'), ('Commission', 'I-ORG'), ('delegations', 'O'), ('in', 'O'), ('European', 'B-MISC'), ('Third', 'I-MISC'), ('Countries', 'I-MISC'), ('and', 'O'), ('in', 'O'), ('the', 'O')]"


#### Passing extra information

##### Step 1a: Get your Hugging Face objects.

In [None]:
pretrained_model_name = "distilroberta-base"
n_labels = len(labels)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    pretrained_model_name, model_cls=AutoModelForTokenClassification, config_kwargs={"num_labels": n_labels}
)

hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)


('roberta',
 transformers.models.roberta.configuration_roberta.RobertaConfig,
 transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
 transformers.models.roberta.modeling_roberta.RobertaForTokenClassification)

##### Step 1b: Preprocess dataset

In [None]:
preprocessor = TokenClassPreprocessor(
    hf_tokenizer,
    label_names=labels,
    id_attr="id",
    word_list_attr="tokens",
    label_list_attr="ner_tags",
    tok_kwargs={"max_length": 128},
)
proc_df = preprocessor.process_df(conll2003_df)
proc_df.head(2)


Unnamed: 0,proc_tokens,proc_ner_tags,chunk_tags,id,ner_tags,pos_tags,tokens
0,"[EU, rejects, German, call, to, boycott, British, lamb, .]","[3, 0, 7, 0, 0, 0, 7, 0, 0]","[11, 21, 11, 12, 21, 22, 11, 12, 0]",0,"[3, 0, 7, 0, 0, 0, 7, 0, 0]","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[EU, rejects, German, call, to, boycott, British, lamb, .]"
1,"[Peter, Blackburn]","[1, 2]","[11, 12]",1,"[1, 2]","[22, 22]","[Peter, Blackburn]"


##### Step 2: Create your `DataBlock`

In [None]:
batch_tok_tfm = TokenClassBatchTokenizeTransform(hf_arch, hf_config, hf_tokenizer, hf_model, target_label_names=labels)
blocks = (TextBlock(batch_tokenize_tfm=batch_tok_tfm, input_return_type=TokenClassTextInput), TokenCategoryBlock(vocab=labels))


def get_x(item):
    """Build the input dict for a row: its `id` plus its preprocessed tokens under the `text` key"""
    return dict(id=item.id, text=item.proc_tokens)


dblock = DataBlock(blocks=blocks, get_x=get_x, get_y=ColReader("proc_ner_tags"), splitter=RandomSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_df, bs=4)


In [None]:
b = dls.one_batch()
b[0].keys()


dict_keys(['input_ids', 'attention_mask', 'id', 'labels'])

In [None]:
len(b), b[0]["input_ids"].shape, b[1].shape


(2, torch.Size([4, 130]), torch.Size([4, 130]))

In [None]:
dls.show_batch(dataloaders=dls, max_n=5, trunc_at=20)


Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('Under', 'O'), ('the', 'O'), ('agreement', 'O'), (',', 'O'), ('IVAC', 'B-ORG'), ('and', 'O'), ('Advanced', 'B-ORG'), ('Medical', 'I-ORG'), (""'s"", 'O'), ('wholly', 'O'), ('owned', 'O'), ('subsidiary', 'O'), (',', 'O'), ('IMED', 'B-ORG'), ('Corp.', 'I-ORG'), (',', 'O'), ('will', 'O'), ('merge', 'O'), ('to', 'O'), ('form', 'O')]"
2,"[('""', 'O'), ('They', 'O'), (""'re"", 'O'), ('aiming', 'O'), ('millions', 'O'), ('and', 'O'), ('millions', 'O'), ('of', 'O'), ('missiles', 'O'), ('right', 'O'), ('at', 'O'), ('these', 'O'), ('young', 'O'), ('people', 'O'), (',', 'O'), ('whether', 'O'), ('it', 'O'), (""'s"", 'O'), ('a', 'O'), ('needle', 'O')]"
3,"[('It', 'O'), ('also', 'O'), ('cited', 'O'), ('the', 'O'), ('potential', 'O'), ('problem', 'O'), ('of', 'O'), ('a', 'O'), ('two-tiered', 'O'), ('market', 'O'), ('in', 'O'), ('which', 'O'), ('market', 'O'), ('makers', 'O'), ('quote', 'O'), ('one', 'O'), ('price', 'O'), ('to', 'O'), ('public', 'O'), ('investors', 'O')]"


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained token classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained token classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="TokenClassification") if (not model_type.startswith("TF"))]


['AlbertForTokenClassification',
 'BertForTokenClassification',
 'BigBirdForTokenClassification',
 'CamembertForTokenClassification',
 'CanineForTokenClassification',
 'ConvBertForTokenClassification',
 'DebertaForTokenClassification',
 'DebertaV2ForTokenClassification',
 'DistilBertForTokenClassification',
 'ElectraForTokenClassification',
 'FNetForTokenClassification',
 'FlaubertForTokenClassification',
 'FunnelForTokenClassification',
 'GPT2ForTokenClassification',
 'IBertForTokenClassification',
 'LayoutLMForTokenClassification',
 'LayoutLMv2ForTokenClassification',
 'LongformerForTokenClassification',
 'MPNetForTokenClassification',
 'MegatronBertForTokenClassification',
 'MobileBertForTokenClassification',
 'NystromformerForTokenClassification',
 'RemBertForTokenClassification',
 'RoFormerForTokenClassification',
 'RobertaForTokenClassification',
 'SqueezeBertForTokenClassification',
 'XLMForTokenClassification',
 'XLMRobertaForTokenClassification',
 'XLNetForTokenClassification'

In [None]:
# hide
# One checkpoint per architecture, iterated by the test loop further down. Commented-out entries are
# skipped; the trailing note on each records why.
# NOTE(review): "word_ids" presumably means the tokenizer cannot supply `word_ids` (e.g., no fast
# tokenizer available) and no `slow_word_ids_func` is provided by the test - confirm.
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "camembert-base",
    # "google/canine-s",                                  # word_ids
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    # "microsoft/deberta-v2-xlarge",                      # word_ids
    "sshleifer/tiny-distilbert-base-cased",
    "hf-internal-testing/tiny-electra",
    # "google/fnet-base",                               # forward() got an unexpected keyword argument 'output_attentions'
    # "flaubert/flaubert_small_cased",                    # word_ids
    "huggingface/funnel-small-base",
    "sshleifer/tiny-gpt2",
    "hf-internal-testing/tiny-layoutlm",
    "allenai/longformer-base-4096",
    "microsoft/mpnet-base",
    "kssteven/ibert-roberta-base",
    # "nvidia/megatron-bert-cased-345m",                # could not test
    "google/mobilebert-uncased",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    # "xlm-mlm-en-2048",                                  # word_ids
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
raw_datasets = load_dataset("conll2003")
conll2003_df = pd.DataFrame(raw_datasets["train"])

labels = raw_datasets["train"].features["ner_tags"].feature.names


Reusing dataset conll2003 (/home/wgilliam/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# hide
# Smoke-test the batch-time tokenization `DataBlock` against every checkpoint in `pretrained_model_names`:
# build `DataLoaders`, check the shape of one batch, and show a couple of decoded examples.
model_cls = AutoModelForTokenClassification
bsz = 2
seq_sz = 128

test_results = []
for model_name in pretrained_model_names:
    # NOTE(review): `error` is never read while the `except` handler at the bottom of the loop stays commented out
    error = None

    print(f"=== {model_name} ===\n")

    # only checkpoints with "deberta" in their name get `add_prefix_space` passed explicitly to the tokenizer
    tok_kwargs = {"add_prefix_space": True} if "deberta" in model_name else {}

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_kwargs=tok_kwargs)
    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        # the embedding matrix has to grow to cover the newly added token
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    # redefined on every iteration so it closes over the current `hf_arch`; canine inputs get a trailing
    # space appended to every word, all other architectures use the words as-is
    def get_x(r):
        if hf_arch == "canine":
            return [f"{word} " for word in r.tokens]
        else:
            return r.tokens

    # pad every example to `seq_sz` so the batch shape asserted below is fixed
    batch_tok_tfm = TokenClassBatchTokenizeTransform(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz)
    blocks = (TextBlock(batch_tokenize_tfm=batch_tok_tfm, input_return_type=TokenClassTextInput), TokenCategoryBlock(vocab=labels))
    dblock = DataBlock(blocks=blocks, get_x=get_x, get_y=ColReader("ner_tags"), splitter=RandomSplitter())

    dls = dblock.dataloaders(conll2003_df, bs=bsz)
    b = dls.one_batch()

    print("*** TESTING DataLoaders ***\n")
    # a batch is an (inputs, targets) pair whose inputs have shape (bsz, seq_sz)
    test_eq(len(b), 2)
    test_eq(len(b[0]["input_ids"]), bsz)
    test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
    test_eq(len(b[1]), bsz)

    # tokenizers that expose `add_prefix_space` are expected to have it enabled
    if hasattr(hf_tokenizer, "add_prefix_space"):
        test_eq(hf_tokenizer.add_prefix_space, True)

    # NOTE(review): the result is recorded as PASSED before `show_batch` runs, and with the handler below
    # commented out any failure raises and aborts the loop instead of being recorded as FAILED
    test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))
    dls.show_batch(dataloaders=dls, max_n=2, trunc_at=20)

    # except Exception as err:
    #     test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))


=== hf-internal-testing/tiny-albert ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('market', 'O'), ('talk', 'O'), ('-', 'O'), ('usda', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('august', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('with', 'O'), ('her', 'O'), ('husband', 'O'), ('james', 'B-PER'), ('sitting', 'O'), ('in', 'O'), ('a', 'O'), ('wheelchair', 'O'), ('to', 'O'), ('the', 'O'), ('side', 'O'), ('of', 'O'), ('the', 'O'), ('podium', 'O'), (',', 'O'), ('mrs.', 'O'), ('brady', 'B-PER'), ('called', 'O'), ('the', 'O'), ('handgun', 'O')]"


=== hf-internal-testing/tiny-bert ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('market', 'O'), ('talk', 'O'), ('-', 'O'), ('usda', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('august', 'O'), ('22,', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O'), ('and', 'O')]"
1,"[('he', 'O'), ('said', 'O'), ('his', 'O'), ('farm,', 'O'), ('its', 'O'), ('260', 'O'), ('workers', 'O'), ('now', 'O'), ('readying', 'O'), ('the', 'O'), ('fields', 'O'), ('for', 'O'), ('winter', 'O'), ('wheat', 'O'), ('sowing,', 'O'), ('expected', 'O'), ('the', 'O'), ('land', 'O'), ('to', 'O'), ('yield', 'O')]"


=== google/bigbird-roberta-base ===



normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('15', 'O'), ('-', 'O'), ('Christian', 'B-PER'), ('Cullen', 'I-PER'), (',', 'O'), ('14', 'O'), ('-', 'O'), ('Jeff', 'B-PER'), ('Wilson', 'I-PER'), (',', 'O'), ('13', 'O'), ('-', 'O'), ('Walter', 'B-PER'), ('Little', 'I-PER'), (',', 'O'), ('12', 'O'), ('-', 'O'), ('Frank', 'B-PER'), ('Bunce', 'I-PER'), (',', 'O')]"
1,"[('Japan', 'B-LOC'), (""'s"", 'O'), ('Finance', 'O'), ('Minister', 'O'), ('Wataru', 'B-PER'), ('Kubo', 'I-PER'), ('told', 'O'), ('a', 'O'), ('news', 'O'), ('conference', 'O'), ('on', 'O'), ('Wednesday', 'O'), ('that', 'O'), ('he', 'O'), ('believes', 'O'), ('that', 'O'), ('the', 'O'), ('Bank', 'B-ORG'), ('of', 'I-ORG'), ('Japan', 'I-ORG')]"


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('Chief', 'O'), ('Accountant', 'O'), ('Natalya', 'B-PER'), ('Sypron', 'I-PER'), ('said', 'O'), ('the', 'O'), ('collective', 'O'), ('is', 'O'), ('strapped', 'O'), ('for', 'O'), ('cash', 'O'), (',', 'O'), ('earlier', 'O'), ('this', 'O'), ('year', 'O'), ('bartering', 'O'), ('220', 'O'), ('tonnes', 'O'), ('of', 'O'), ('grain', 'O')]"


=== YituTech/conv-bert-base ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('15', 'O'), ('-', 'O'), ('christian', 'B-PER'), ('cullen,', 'I-PER'), ('14', 'O'), ('-', 'O'), ('jeff', 'B-PER'), ('wilson,', 'I-PER'), ('13', 'O'), ('-', 'O'), ('walter', 'B-PER'), ('little,', 'I-PER'), ('12', 'O'), ('-', 'O'), ('frank', 'B-PER'), ('bunce,', 'I-PER'), ('11', 'O'), ('-', 'O'), ('glen', 'B-PER'), ('osborne', 'I-PER')]"
1,"[('the', 'O'), ('catholic', 'B-ORG'), ('information', 'I-ORG'), ('office', 'I-ORG'), ('in', 'O'), ('nairobi', 'B-LOC'), ('said', 'O'), ('on', 'O'), ('monday', 'O'), ('that', 'O'), ('australian', 'B-MISC'), ('sisters', 'O'), ('moira', 'B-PER'), ('lynch,', 'I-PER'), ('73,', 'O'), ('and', 'O'), ('mary', 'B-PER'), ('batchelor,', 'I-PER'), ('68,', 'O'), ('american', 'B-MISC')]"


=== hf-internal-testing/tiny-deberta ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('It', 'O'), ('explained', 'O'), ('the', 'O'), ('""', 'O'), ('addition', 'O'), ('of', 'O'), ('a', 'O'), ('New', 'B-LOC'), ('York', 'I-LOC'), ('license', 'O'), ('will', 'O'), ('enable', 'O'), ('Penn', 'B-ORG'), ('Treaty', 'I-ORG'), ('American', 'I-ORG'), ('Corp', 'I-ORG'), ('to', 'O'), ('conduct', 'O'), ('business', 'O'), ('in', 'O')]"


=== sshleifer/tiny-distilbert-base-cased ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('Compared', 'O'), ('with', 'O'), ('the', 'O'), ('end', 'O'), ('of', 'O'), ('last', 'O'), ('year,', 'O'), ('when', 'O'), ('T', 'B-ORG'), ('&', '[xIGNx]'), ('N', '[xIGNx]'), ('predicted', 'O'), ('a', 'O'), ('sluggish', 'O'), ('first', 'O'), ('half', 'O'), ('and', 'O'), ('a', 'O'), ('rebound', 'O'), ('later', 'O')]"
1,"[('The', 'O'), ('quake', 'O'), ('struck', 'O'), ('at', 'O'), ('11.', 'O'), ('16', '[xIGNx]'), ('a.', 'O'), ('m.', '[xIGNx]'), ('(', 'O'), ('1716', 'O'), ('GMT', 'B-MISC'), (')', 'O'), ('and', 'O'), ('was', 'O'), ('centred', 'O'), ('10', 'O'), ('miles', 'O'), ('(', 'O'), ('16', 'O'), ('km', 'O')]"


=== hf-internal-testing/tiny-electra ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('15', 'O'), ('-', 'O'), ('christian', 'B-PER'), ('cullen,', 'I-PER'), ('14', 'O'), ('-', 'O'), ('jeff', 'B-PER'), ('wilson,', 'I-PER'), ('13', 'O'), ('-', 'O'), ('walter', 'B-PER'), ('little,', 'I-PER'), ('12', 'O'), ('-', 'O'), ('frank', 'B-PER'), ('bunce,', 'I-PER'), ('11', 'O'), ('-', 'O'), ('glen', 'B-PER'), ('osborne', 'I-PER')]"
1,"[('sanchez', 'B-PER'), ('vicario,', 'I-PER'), ('runner', 'O'), ('-', '[xIGNx]'), ('up', '[xIGNx]'), ('to', 'O'), ('graf', 'B-PER'), ('at', 'O'), ('the', 'O'), ('french', 'B-MISC'), ('open', 'I-MISC'), ('and', 'O'), ('wimbledon,', 'B-MISC'), ('begins', 'O'), ('play', 'O'), ('against', 'O'), ('a', 'O'), ('qualifier', 'O'), ('in', 'O'), ('a', 'O')]"


=== huggingface/funnel-small-base ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('market', 'O'), ('talk', 'O'), ('-', 'O'), ('usda', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('august', 'O'), ('22,', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O'), ('and', 'O')]"
1,"[('in', 'O'), ('kansas', 'B-LOC'), ('city,', 'I-LOC'), ('jose', 'B-PER'), ('rosado', 'I-PER'), ('came', 'O'), ('within', 'O'), ('one', 'O'), ('out', 'O'), ('of', 'O'), ('his', 'O'), ('third', 'O'), ('complete', 'O'), ('game', 'O'), ('and', 'O'), ('michael', 'B-PER'), ('tucker', 'I-PER'), ('homered', 'O'), ('and', 'O'), ('drove', 'O')]"


=== sshleifer/tiny-gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('Police', 'O'), ('said', 'O'), ('the', 'O'), ('111', 'O'), ('passengers', 'O'), ('and', 'O'), ('six', 'O'), ('crew', 'O'), ('on', 'O'), ('board', 'O'), ('the', 'O'), ('ferry', 'O'), ('Trident', 'B-MISC'), ('Seven', 'I-MISC'), (',', 'O'), ('owned', 'O'), ('by', 'O'), ('France', 'B-LOC'), (""'s"", 'O'), ('Emeraud', 'B-ORG')]"


=== hf-internal-testing/tiny-layoutlm ===

architecture:	layoutlm
tokenizer:	LayoutLMTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('market', 'O'), ('talk', 'O'), ('-', 'O'), ('usda', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('august', 'O'), ('22,', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O'), ('and', 'O')]"
1,"[('the', 'O'), ('state', 'O'), ('of', 'O'), ('health', 'O'), ('of', 'O'), ('boris', 'B-PER'), ('yeltsin,', 'I-PER'), ('who', 'O'), ('had', 'O'), ('two', 'O'), ('heart', 'O'), ('attacks', 'O'), ('last', 'O'), ('year,', 'O'), ('has', 'O'), ('been', 'O'), ('the', 'O'), ('centre', 'O'), ('of', 'O'), ('media', 'O')]"


=== allenai/longformer-base-4096 ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('Japan', 'B-LOC'), (""'s"", 'O'), ('Finance', 'O'), ('Minister', 'O'), ('Wataru', 'B-PER'), ('Kubo', 'I-PER'), ('told', 'O'), ('a', 'O'), ('news', 'O'), ('conference', 'O'), ('on', 'O'), ('Wednesday', 'O'), ('that', 'O'), ('he', 'O'), ('believes', 'O'), ('that', 'O'), ('the', 'O'), ('Bank', 'B-ORG'), ('of', 'I-ORG'), ('Japan', 'I-ORG')]"


=== microsoft/mpnet-base ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('compared', 'O'), ('with', 'O'), ('the', 'O'), ('end', 'O'), ('of', 'O'), ('last', 'O'), ('year,', 'O'), ('when', 'O'), ('t', 'B-ORG'), ('&', '[xIGNx]'), ('n', '[xIGNx]'), ('predicted', 'O'), ('a', 'O'), ('sluggish', 'O'), ('first', 'O'), ('half', 'O'), ('and', 'O'), ('a', 'O'), ('rebound', 'O'), ('later', 'O')]"
1,"[('armed', 'O'), ('hijackers', 'O'), ('believed', 'O'), ('to', 'O'), ('be', 'O'), ('iraqis', 'B-MISC'), ('released', 'O'), ('between', 'O'), ('60', 'O'), ('and', 'O'), ('70', 'O'), ('people', 'O'), ('on', 'O'), ('tuesday', 'O'), ('from', 'O'), ('a', 'O'), ('sudan', 'B-ORG'), ('airways', 'I-ORG'), ('plane', 'O'), ('carrying', 'O')]"


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('""', 'O'), ('We', 'O'), ('have', 'O'), ('no', 'O'), ('doubt', 'O'), ('that', 'O'), ('this', 'O'), ('is', 'O'), ('one', 'O'), ('of', 'O'), ('the', 'O'), ('rarest', 'O'), ('of', 'O'), ('the', 'O'), ('rare', 'O'), ('cases', 'O'), (',', 'O'), ('not', 'O'), ('merely', 'O'), ('due', 'O')]"
1,"[('The', 'O'), ('agreement', 'O'), ('resolved', 'O'), ('a', 'O'), ('dispute', 'O'), ('that', 'O'), ('arose', 'O'), ('in', 'O'), ('June', 'O'), ('when', 'O'), ('Colombia', 'B-LOC'), ('turned', 'O'), ('down', 'O'), ('American', 'B-ORG'), (""'s"", 'O'), ('request', 'O'), ('to', 'O'), ('operate', 'O'), ('flights', 'O'), ('between', 'O')]"


=== google/mobilebert-uncased ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('market', 'O'), ('talk', 'O'), ('-', 'O'), ('usda', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('august', 'O'), ('22,', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O'), ('and', 'O')]"
1,"[('a', 'O'), ('government', 'O'), ('statement,', 'O'), ('broadcast', 'O'), ('repeatedly', 'O'), ('by', 'O'), ('state', 'O'), ('radio,', 'O'), ('said', 'O'), ('the', 'O'), ('two', 'O'), ('days', 'O'), ('of', 'O'), ('prayer', 'O'), ('were', 'O'), ('""', 'O'), ('for', 'O'), ('the', 'O'), ('dead,', 'O'), ('for', 'O')]"


=== google/rembert ===

architecture:	rembert
tokenizer:	RemBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('Defenders', 'O'), ('-', 'O'), ('Frank', 'B-PER'), ('de', 'I-PER'), ('Boer', 'I-PER'), ('(', 'O'), ('Ajax', 'B-ORG'), (')', 'O'), (',', 'O'), ('John', 'B-PER'), ('Veldman', 'I-PER'), ('(', 'O'), ('Ajax', 'B-ORG'), (')', 'O'), (',', 'O'), ('Jaap', 'B-PER'), ('Stam', 'I-PER'), ('(', 'O'), ('PSV', 'B-ORG'), (')', 'O')]"


=== junnyu/roformer_chinese_sim_char_ft_small ===

architecture:	roformer
tokenizer:	RoFormerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('15', 'O'), ('-', 'O'), ('christian', 'B-PER'), ('cullen,', 'I-PER'), ('14', 'O'), ('-', 'O'), ('jeff', 'B-PER'), ('wilson,', 'I-PER'), ('13', 'O'), ('-', 'O'), ('walter', 'B-PER'), ('little,', 'I-PER'), ('12', 'O'), ('-', 'O'), ('frank', 'B-PER'), ('bunce,', 'I-PER'), ('11', 'O'), ('-', 'O'), ('glen', 'B-PER'), ('osborne', 'I-PER')]"
1,"[('nearly', 'O'), ('every', 'O'), ('african', 'B-MISC'), ('member', 'O'), ('who', 'O'), ('spoke,', 'O'), ('as', 'O'), ('well', 'O'), ('as', 'O'), ('most', 'O'), ('security', 'B-ORG'), ('council', 'I-ORG'), ('members,', 'O'), ('however,', 'O'), ('were', 'O'), ('unsympathetic', 'O'), ('towards', 'O'), ('the', 'O'), ('government', 'O'), ('of', 'O')]"


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('Shares', 'O'), ('in', 'O'), ('Slough', 'B-ORG'), (',', 'O'), ('which', 'O'), ('earlier', 'O'), ('announced', 'O'), ('a', 'O'), ('14', 'O'), ('percent', 'O'), ('rise', 'O'), ('in', 'O'), ('first-half', 'O'), ('pretax', 'O'), ('profit', 'O'), ('to', 'O'), ('37.4', 'O'), ('million', 'O'), ('stg', 'O'), (',', 'O')]"


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('market', 'O'), ('talk', 'O'), ('-', 'O'), ('usda', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('august', 'O'), ('22,', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O'), ('and', 'O')]"
1,"[('instead,', 'O'), ('the', 'O'), ('refugees', 'O'), ('were', 'O'), ('said', 'O'), ('to', 'O'), ('have', 'O'), ('been', 'O'), ('directed', 'O'), ('by', 'O'), ('their', 'O'), ('authorities', 'O'), ('to', 'O'), ('vote', 'O'), ('from', 'O'), ('strategic', 'O'), ('towns', 'O'), ('which', 'O'), ('had', 'O'), ('moslem', 'B-MISC')]"


=== xlm-roberta-base ===



Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('MARKET', 'O'), ('TALK', 'O'), ('-', 'O'), ('USDA', 'B-ORG'), ('net', 'O'), ('change', 'O'), ('in', 'O'), ('weekly', 'O'), ('export', 'O'), ('commitments', 'O'), ('for', 'O'), ('the', 'O'), ('week', 'O'), ('ended', 'O'), ('August', 'O'), ('22', 'O'), (',', 'O'), ('includes', 'O'), ('old', 'O'), ('crop', 'O')]"
1,"[('Innocent', 'B-PER'), ('Butare', 'I-PER'), (',', 'O'), ('executive', 'O'), ('secretary', 'O'), ('of', 'O'), ('the', 'O'), ('Rally', 'B-ORG'), ('for', 'I-ORG'), ('the', 'I-ORG'), ('Return', 'I-ORG'), ('of', 'I-ORG'), ('Refugees', 'I-ORG'), ('and', 'I-ORG'), ('Democracy', 'I-ORG'), ('in', 'I-ORG'), ('Rwanda', 'I-ORG'), ('(', 'O'), ('RDR', 'B-ORG'), (')', 'O')]"


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,word / target label
0,"[('15', 'O'), ('-', 'O'), ('Christian', 'B-PER'), ('Cullen', 'I-PER'), (',', 'O'), ('14', 'O'), ('-', 'O'), ('Jeff', 'B-PER'), ('Wilson', 'I-PER'), (',', 'O'), ('13', 'O'), ('-', 'O'), ('Walter', 'B-PER'), ('Little', 'I-PER'), (',', 'O'), ('12', 'O'), ('-', 'O'), ('Frank', 'B-PER'), ('Bunce', 'I-PER'), (',', 'O')]"
1,"[('Proud', 'O'), ('of', 'O'), ('its', 'O'), ('record', 'O'), ('in', 'O'), ('promptly', 'O'), ('joining', 'O'), ('both', 'O'), ('the', 'O'), ('Council', 'B-ORG'), ('of', 'I-ORG'), ('Europe', 'I-ORG'), ('and', 'O'), ('NATO', 'B-ORG'), (""'s"", 'O'), ('Partnership', 'B-MISC'), ('for', 'I-MISC'), ('Peace', 'I-MISC'), (',', 'O'), ('Ukraine', 'B-LOC')]"


In [None]:
# hide_input
# Tabulate the outcomes gathered in `test_results` (one row per tested model)
# and render them as a table.
result_columns = ["arch", "tokenizer", "model_name", "result", "error"]
test_results_df = pd.DataFrame(data=test_results, columns=result_columns)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,hf-internal-testing/tiny-albert,PASSED,
1,bert,BertTokenizerFast,hf-internal-testing/tiny-bert,PASSED,
2,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
3,camembert,CamembertTokenizerFast,camembert-base,PASSED,
4,convbert,ConvBertTokenizerFast,YituTech/conv-bert-base,PASSED,
5,deberta,DebertaTokenizerFast,hf-internal-testing/tiny-deberta,PASSED,
6,bert,BertTokenizerFast,sshleifer/tiny-distilbert-base-cased,PASSED,
7,electra,ElectraTokenizerFast,hf-internal-testing/tiny-electra,PASSED,
8,funnel,FunnelTokenizerFast,huggingface/funnel-small-base,PASSED,
9,gpt2,GPT2TokenizerFast,sshleifer/tiny-gpt2,PASSED,


## Summary

This module includes all the low-, mid-, and high-level API bits needed to prepare data for token classification tasks.

In [None]:
# hide
# Final step of the notebook: run nbdev's notebook-to-script export.
# Called with no arguments; the recorded output lists every notebook in the
# project as "Converted", not only this one.
from nbdev.export import notebook2script

notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
