In [None]:
%load_ext autoreload
%autoreload 2

# Data

> The `data.token_classification` module contains the core bits required to use fast.ai's low-level and/or mid-level APIs to define `Datasets` and build `DataLoaders` for training transformers on token classification tasks (e.g., named entity recognition (NER) and part-of-speech (POS) tagging).

In [None]:
# |default_exp data.token_classification
# |default_cls_lvl 3

In [None]:
# |export
from __future__ import annotations

import gc, importlib, sys, traceback

from accelerate.logging import get_logger
from dataclasses import dataclass
from dotenv import load_dotenv
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.data.block import TransformBlock, Category, CategoryMap
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel, AutoModelForTokenClassification
from transformers import logging as hf_logging
from transformers.data.data_collator import DataCollatorWithPadding


from blurr.data.core import first_blurr_tfm, TextInput, TextCollatorWithPadding, BatchTokenizeTransform
from blurr.utils import get_hf_objects

In [None]:
# |hide
import pdb, nbdev

from datasets import concatenate_datasets, load_dataset, Dataset, Value
from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock, FuncSplitter, MultiCategoryBlock
from fastai.data.transforms import DataLoader, DataLoaders, Datasets, ItemTransform
from fastai.losses import BaseLoss, BCEWithLogitsLossFlat
from fastai.text.data import SortedDL
from fastcore.test import *
from transformers import AutoConfig, AutoTokenizer

from blurr.data.core import *
from blurr.utils import *

In [None]:
# |export
# silence all the HF warnings and load environment variables
warnings.simplefilter("ignore")
hf_logging.set_verbosity_error()
logger = get_logger(__name__)

load_dotenv()

False

In [None]:
# |hide
# |notest
torch.cuda.set_device(0)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")

Using GPU #0: NVIDIA GeForce RTX 3090


In [None]:
# | echo: false
os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")

What we're running with at the time this documentation was generated:
torch: 1.13.1
fastai: 2.7.11
transformers: 4.26.1


## Setup

We'll use a subset of `conll2003` to demonstrate how to configure your blurr code for token classification

In [None]:
conll2003_dsd = load_dataset("conll2003")
conll2003_dsd

Found cached dataset conll2003 (/home/wgilliam/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

We need to get a list of the distinct entities we want to predict. If each example stores its labels as a list of raw/readable strings in another attribute/column of our dataset, we could build a sorted list of the distinct values with something like this:

`label_names = sorted(list(set([lbls for sublist in germ_eval_df.labels.tolist() for lbls in sublist])))`

Fortunately, the `conll2003` dataset allows us to get at this list directly using the code below.

In [None]:
print(conll2003_dsd["train"].features["chunk_tags"].feature.names[:20])
print(conll2003_dsd["train"].features["ner_tags"].feature.names[:20])
print(conll2003_dsd["train"].features["pos_tags"].feature.names[:20])

['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP']
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS']


In [None]:
print(conll2003_dsd["train"][0]["tokens"])
print(conll2003_dsd["train"][0]["ner_tags"])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]


In [None]:
label_names = conll2003_dsd["train"].features["ner_tags"].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

Let's prepare both Hugging Face `Dataset`s and a pandas `DataFrame` to illustrate how each can be used in BLURR.

In [None]:
# build HF `Dataset` objects
train_ds = conll2003_dsd["train"].add_column("is_valid", [False] * len(conll2003_dsd["train"])).shuffle().select(range(1000))
valid_ds = conll2003_dsd["validation"].add_column("is_valid", [True] * len(conll2003_dsd["validation"])).shuffle().select(range(200))
conll2003_ds = concatenate_datasets([train_ds, valid_ds])

# build a `DataFrame` representation as well
conll2003_df = pd.DataFrame(conll2003_ds)

print(len(train_ds), len(valid_ds))
print(len(conll2003_df[conll2003_df["is_valid"] == False]), len(conll2003_df[conll2003_df["is_valid"] == True]))
conll2003_df.head()

1000 200
1000 200


Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags,is_valid
0,7744,"[LONDON, 1996-08-26]","[22, 11]","[11, 12]","[5, 0]",False
1,8385,[15-6],[11],[11],[0],False
2,545,"[Nyva, Vinnytsya, (, Ukraine, ), 1, Tallinna, Sadam, (, Estonia, ), 0, (, 0-0, )]","[22, 22, 4, 22, 5, 11, 22, 22, 4, 22, 5, 11, 4, 11, 5]","[11, 12, 0, 11, 0, 11, 12, 12, 0, 11, 0, 11, 0, 11, 0]","[3, 4, 0, 5, 0, 0, 3, 4, 0, 5, 0, 0, 0, 0, 0]",False
3,12204,"[Radio, stations, said, around, 15,000, farmers, ,, angered, by, a, fall, in, beef, prices, following, the, mad, cow, disease, crisis, ,, staged, protests, in, many, areas, and, blockaded, several, main, roads, and, motorways, .]","[21, 24, 38, 15, 11, 24, 6, 38, 15, 12, 21, 15, 21, 24, 39, 12, 16, 21, 21, 21, 6, 38, 24, 15, 16, 24, 10, 38, 16, 16, 24, 10, 24, 7]","[11, 12, 21, 11, 12, 12, 0, 21, 13, 11, 12, 13, 11, 12, 13, 11, 12, 12, 12, 12, 0, 21, 11, 13, 11, 12, 0, 21, 11, 12, 12, 0, 11, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",False
4,11064,"[to, disqualify, the, national, team, of, Burundi, from, the, 21st, African, Cup, of, Nations, ...]","[35, 37, 12, 16, 21, 15, 22, 15, 12, 16, 16, 22, 15, 23, 8]","[21, 22, 11, 12, 12, 13, 11, 13, 11, 12, 12, 12, 13, 11, 0]","[0, 0, 0, 0, 0, 0, 5, 0, 0, 7, 8, 8, 8, 8, 0]",False


## Base API

### Labeling Strategies

In [None]:
# |export
class BaseLabelingStrategy:
    """
    Abstract base for strategies that turn word-level labels into one label per token. Subclasses implement
    `align_labels_with_tokens`; this class only stores the shared configuration.
    """

    def __init__(
        self,
        hf_tokenizer: PreTrainedTokenizerBase,
        label_names: Optional[List[str]],
        non_entity_label: str = "O",
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
    ) -> None:
        # the label vocabulary and the name of the label that means "not part of any entity"
        self.label_names, self.non_entity_label = label_names, non_entity_label
        # the tokenizer, and the label id given to tokens that should not contribute to the loss
        self.hf_tokenizer, self.ignore_token_id = hf_tokenizer, ignore_token_id

    def align_labels_with_tokens(self, word_ids, word_labels):
        """
        Return one label id per entry of `word_ids`, given the labels of the words (`word_labels`).
        Must be overridden by subclasses.
        """
        raise NotImplementedError

Here we include a `BaseLabelingStrategy` abstract class and several different strategies for assigning labels to your tokenized inputs. The "only first token" and "B/I" labeling strategies are discussed in the ["Token Classification"](https://huggingface.co/course/chapter7/2?fw=pt) section in chapter 7 of the Hugging Face course.

In [None]:
# |export
class OnlyFirstTokenLabelingStrategy(BaseLabelingStrategy):
    """
    Only the first token of a word is associated with the word's label; all other subtokens, and all special tokens,
    get the `ignore_token_id`. Works where labels are ids (Python or NumPy integers) or strings (in the latter case
    we'll use the `label_names` to look up the id).
    """

    def align_labels_with_tokens(self, word_ids, word_labels):
        """
        Return a list with one label id per entry in `word_ids`.

        `word_ids` holds, for each token, the index of the word it came from (`None` for special tokens);
        `word_labels` holds one label per word.
        """
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id is None:
                # special token: ignored, and it ends the word we were tracking
                current_word = None
                new_labels.append(self.ignore_token_id)
            elif word_id != current_word:
                # start of a new word
                current_word = word_id
                label = word_labels[word_id]
                # NumPy integers are not `int` instances, so accept them explicitly rather than
                # sending them to the string lookup (which would raise `ValueError`)
                new_labels.append(int(label) if isinstance(label, (int, np.integer)) else self.label_names.index(label))
            else:
                # another subtoken of the current word
                new_labels.append(self.ignore_token_id)

        return new_labels


class SameLabelLabelingStrategy(BaseLabelingStrategy):
    """
    Every token associated with a given word is associated with the word's label; special tokens get the
    `ignore_token_id`. Works where labels are ids (Python or NumPy integers) or strings (in the latter case
    we'll use the `label_names` to look up the id).
    """

    def align_labels_with_tokens(self, word_ids, word_labels):
        """
        Return a list with one label id per entry in `word_ids`.

        `word_ids` holds, for each token, the index of the word it came from (`None` for special tokens);
        `word_labels` holds one label per word.
        """
        new_labels = []
        for word_id in word_ids:
            if word_id is None:
                # special token
                new_labels.append(self.ignore_token_id)
            else:
                label = word_labels[word_id]
                # NumPy integers are not `int` instances, so accept them explicitly rather than
                # sending them to the string lookup (which would raise `ValueError`)
                new_labels.append(int(label) if isinstance(label, (int, np.integer)) else self.label_names.index(label))

        return new_labels


class BILabelingStrategy(BaseLabelingStrategy):
    """
    If using B/I labels, the first token associated with a given word gets the word's own label while all other
    tokens of that same word get the matching "I-" label. If no matching "I-" label exists in `label_names`
    (e.g., the word is a non-entity), those tokens get the `non_entity_label`. Special tokens get the
    `ignore_token_id`. Works where labels are ids (Python or NumPy integers) or strings (in the latter case
    we'll use the `label_names` to look up the id).
    """

    def align_labels_with_tokens(self, word_ids, word_labels):
        """
        Return a list with one label id per entry in `word_ids`.

        `word_ids` holds, for each token, the index of the word it came from (`None` for special tokens);
        `word_labels` holds one label per word.
        """
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id is None:
                # special token: ignored, and it ends the word we were tracking
                current_word = None
                new_labels.append(self.ignore_token_id)
                continue

            label = word_labels[word_id]
            # NumPy integers are not `int` instances, so accept them explicitly; otherwise they would be
            # treated as label names (failing both the `index` lookup and the `[2:]` slice below)
            label_is_id = isinstance(label, (int, np.integer))

            if word_id != current_word:
                # start of a new word: keep the word's own label
                current_word = word_id
                new_labels.append(int(label) if label_is_id else self.label_names.index(label))
            else:
                # we're in the same word
                label_name = self.label_names[label] if label_is_id else label

                # append the I-{ENTITY} label if it exists in `label_names`, else fall back to the non-entity label
                # (dropping the first two characters strips the "B-"/"I-" prefix)
                i_label = f"I-{label_name[2:]}"
                new_labels.append(
                    self.label_names.index(i_label) if i_label in self.label_names else self.label_names.index(self.non_entity_label)
                )

        return new_labels

### Task specific functions

The below functions provide a basic way to fetch your Hugging Face objects and pretokenize your inputs.

In [None]:
# |export
def get_task_hf_objects(
    # The pretrained checkpoint to load (e.g., "microsoft/deberta-v3-small"); passed straight to `get_hf_objects`
    pretrained_model_name: str,
    # The label names to predict; only their count is used here (to set the config's `num_labels`)
    label_names: list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"],
    # If True, print a short summary of the config and tokenizer
    verbose: bool = False,
):
    """
    Fetch the Hugging Face objects for a token classification task via `get_hf_objects`, using
    `AutoModelForTokenClassification` as the model class and `len(label_names)` as the number of labels.
    Returns the tuple `(hf_arch, hf_config, hf_tokenizer, hf_model)`.
    """
    # `label_names` is only read (never mutated), so sharing the list default between calls is harmless
    hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
        pretrained_model_name,
        model_cls=AutoModelForTokenClassification,
        config_kwargs={"num_labels": len(label_names)},
    )

    if verbose:
        print("=== config ===")
        print(f"# of labels:\t{hf_config.num_labels}")
        print("")
        print("=== tokenizer ===")
        print(f"Vocab size:\t\t{hf_tokenizer.vocab_size}")
        print(f"Max # of tokens:\t{hf_tokenizer.model_max_length}")
        print(f"Attributes expected by model in forward pass:\t{hf_tokenizer.model_input_names}")

    return hf_arch, hf_config, hf_tokenizer, hf_model

In [None]:
nbdev.show_doc(get_task_hf_objects)

---

[source](https://github.com/ohmeow/blurr/blob/dev-3.0.0 #master/blurr/data/token_classification.py#L130){target="_blank" style="float:right; font-size:smaller"}

### get_task_hf_objects

>      get_task_hf_objects (pretrained_model_name:str, label_names:list=['O',
>                           'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC',
>                           'I-LOC', 'B-MISC', 'I-MISC'], verbose:bool=False)

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = get_task_hf_objects("microsoft/deberta-v3-small", label_names=label_names, verbose=False)

test_eq(hf_arch, "deberta_v2")
test_eq(hf_config.num_labels, 9)

In [None]:
# |export
# tokenize the dataset
def tokenclass_tokenize_func(
    examples,
    hf_tokenizer: PreTrainedTokenizerBase,
    labeling_strategy: BaseLabelingStrategy,
    words_attr: str = "words",
    word_labels_attr: str = "labels",
    max_length: int = None,
    padding: bool | str = True,
    truncation: bool | str = True,
    tok_kwargs: dict = {},
):
    """
    Tokenize a batch of pre-split examples and attach token-level labels.

    `examples[words_attr]` is tokenized with `is_split_into_words=True` (plus `max_length`, `padding`, `truncation`
    and any `tok_kwargs`). For each example, `labeling_strategy.align_labels_with_tokens` then converts the word
    labels in `examples[word_labels_attr]` into one label per token, and the result is stored under the "label"
    key of the tokenizer's output, which is returned.
    """
    encoded = hf_tokenizer(
        examples[words_attr],
        max_length=max_length,
        padding=padding,
        truncation=truncation,
        is_split_into_words=True,
        **tok_kwargs,
    )

    # `word_ids(i)` gives the token -> word mapping for the i-th example in the batch
    encoded["label"] = [
        labeling_strategy.align_labels_with_tokens(encoded.word_ids(example_idx), example_labels)
        for example_idx, example_labels in enumerate(examples[word_labels_attr])
    ]
    return encoded

In [None]:
nbdev.show_doc(tokenclass_tokenize_func)

---

[source](https://github.com/ohmeow/blurr/blob/dev-3.0.0 #master/blurr/data/token_classification.py#L157){target="_blank" style="float:right; font-size:smaller"}

### tokenclass_tokenize_func

>      tokenclass_tokenize_func (examples, hf_tokenizer:transformers.tokenizatio
>                                n_utils_base.PreTrainedTokenizerBase, labeling_
>                                strategy:__main__.BaseLabelingStrategy,
>                                words_attr:str='words',
>                                word_labels_attr:str='labels',
>                                max_length:int=None, padding:bool|str=True,
>                                truncation:bool|str=True, tok_kwargs:dict={})

In [None]:
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
my_dict = {
    "id": [0, 1],
    "words": [
        ["Wayde", "runs", "ohmeow.com", "from", "California"],
        # NOTE: "time" and "!" must be separate words so that there is one word per label below
        ["Bayern", "Munich", "is", "the", "greatest", "footabll", "team", "of", "all", "time", "!"],
    ],
    "labels": [[1, 0, 3, 0, 5], [3, 4, 0, 0, 0, 7, 8, 0, 0, 0, 0]],
}
test_ds = Dataset.from_dict(my_dict)

# define our subword tokenized labeling strategy
labeling_strat = BILabelingStrategy(hf_tokenizer=hf_tokenizer, label_names=label_names)
tokenize_func = partial(tokenclass_tokenize_func, hf_tokenizer=hf_tokenizer, labeling_strategy=labeling_strat)
proc_test_ds = test_ds.map(tokenize_func, batched=True)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

### Reconstructing inputs/labels

The utility methods below allow blurr users to reconstruct the original word/label associations from the input_ids/label associations. For example, these are used in our token classification `show_batch` method below.

In [None]:
# |export
def get_token_labels_from_input_ids(
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # List of input_ids for the tokens in a single piece of processed text
    input_ids: List[int],
    # List of label indexes for each token
    token_label_ids: List[int],
    # List of label names in which the `label` indices can be used to find the name of the label
    vocab: List[str],
    # The token ID that should be ignored when calculating the loss
    ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
    # The token used to identify ignored tokens (default: [xIGNx])
    ignore_token: str = "[xIGNx]",
) -> List[Tuple[str, str]]:
    """
    Given a list of input IDs, the label ID associated to each, and the labels vocab, this method will return a list of tuples whereby
    each tuple defines the "token" and its label name. Tokens whose ID is one of the tokenizer's special IDs are left out, and tokens
    labeled with `ignore_token_id` are reported with `ignore_token` as their label name. For example:
    [('ĠWay', B-PER), ('de', B-PER), ('ĠGill', I-PER), ('iam', I-PER), ('Ġloves', O), ('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG)]
    """
    # convert ids to tokens
    toks = hf_tokenizer.convert_ids_to_tokens(input_ids)

    # the special ids don't change while we iterate, so read them once instead of once per token. The value is
    # used as returned (not turned into a `set`) so that membership is still decided with `==`; that matters if
    # the ids are tensor elements, which hash by identity rather than by value
    special_ids = hf_tokenizer.all_special_ids

    # align "tokens" with labels
    tok_labels = [
        (tok, ignore_token if label_id == ignore_token_id else vocab[label_id])
        for tok_id, tok, label_id in zip(input_ids, toks, token_label_ids)
        if tok_id not in special_ids
    ]
    return tok_labels

In [None]:
nbdev.show_doc(get_token_labels_from_input_ids)

---

[source](https://github.com/ohmeow/blurr/blob/dev-3.0.0 #master/blurr/data/token_classification.py#L180){target="_blank" style="float:right; font-size:smaller"}

### get_token_labels_from_input_ids

>      get_token_labels_from_input_ids (hf_tokenizer:transformers.tokenization_u
>                                       tils_base.PreTrainedTokenizerBase,
>                                       input_ids:List[int],
>                                       token_label_ids:List[int],
>                                       vocab:List[str],
>                                       ignore_token_id:int=-100,
>                                       ignore_token:str='[xIGNx]')

Given a list of input IDs, the label ID associated to each, and the labels vocab, this method will return a list of tuples whereby
each tuple defines the "token" and its label name. For example:
[('ĠWay', B-PER), ('de', B-PER), ('ĠGill', I-PER), ('iam', I-PER), ('Ġloves'), ('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG)]

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| hf_tokenizer | PreTrainedTokenizerBase |  | A Hugging Face tokenizer |
| input_ids | List[int] |  | List of input_ids for the tokens in a single piece of processed text |
| token_label_ids | List[int] |  | List of label indexs for each token |
| vocab | List[str] |  | List of label names from witch the `label` indicies can be used to find the name of the label |
| ignore_token_id | int | -100 | The token ID that should be ignored when calculating the loss |
| ignore_token | str | [xIGNx] | The token used to identifiy ignored tokens (default: [xIGNx]) |
| **Returns** | **List[Tuple[str, str]]** |  |  |

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = get_task_hf_objects("microsoft/deberta-v3-small", label_names, verbose=True)

=== config ===
# of labels:	9

=== tokenizer ===
Vocab size:		128000
Max # of tokens:	1000000000000000019884624838656
Attributes expected by model in forward pass:	['input_ids', 'token_type_ids', 'attention_mask']


In [None]:
# TESTS for align_labels_with_tokens()
my_dict = {
    "id": [0, 1],
    "words": [
        ["Wayde", "runs", "ohmeow.com", "from", "California"],
        # NOTE: "time" and "!" must be separate words so that there is one word per label below
        ["Bayern", "Munich", "is", "the", "greatest", "footabll", "team", "of", "all", "time", "!"],
    ],
    "labels": [[1, 0, 3, 0, 5], [3, 4, 0, 0, 0, 7, 8, 0, 0, 0, 0]],
}
test_ds = Dataset.from_dict(my_dict)
test_df = pd.DataFrame(test_ds)

for idx in range(len(test_df)):
    raw_word_list = test_df.iloc[idx]["words"]
    raw_label_list = test_df.iloc[idx]["labels"]

    be = hf_tokenizer(raw_word_list, is_split_into_words=True)
    input_ids = be["input_ids"]
    # every token gets its word's label; special tokens (no word id) get the ignore id
    targ_ids = [-100 if word_id is None else raw_label_list[word_id] for word_id in be.word_ids()]

    tok_labels = get_token_labels_from_input_ids(hf_tokenizer, input_ids, targ_ids, label_names)

    # special tokens are dropped from `tok_labels`, so compare against the non-ignored targets only
    for tok_label, targ_id in zip(tok_labels, [label_id for label_id in targ_ids if label_id != -100]):
        test_eq(tok_label[1], label_names[targ_id])

In [None]:
# |export
def get_word_labels_from_token_labels(
    # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, canine, etc..)
    hf_arch: str,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A list of tuples, where each represents a token and its label (e.g., [('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG), ...])
    tok_labels,
) -> List[Tuple[str, str]]:
    """
    Collapse (token, label) pairs into (word, label) pairs, where each word takes the label of its first token.
    The words are rebuilt by joining the tokens back into a string with the tokenizer and splitting on whitespace,
    so this assumes the model inputs were a list of words. Together with `align_labels_with_tokens`, this lets the
    user reconstruct the original raw inputs and labels.
    """
    # recreate the raw text from the tokens (we assume for token classification that the input is a list of words)
    text = hf_tokenizer.convert_tokens_to_string([pair[0] for pair in tok_labels])

    # for the "canine" architecture each word is given a trailing space
    word_suffix = " " if hf_arch == "canine" else ""

    # walk the words, using each word's token count to find where the next word starts in `tok_labels`
    word_labels = []
    tok_pos = 0
    for raw_word in text.split():
        word = f"{raw_word}{word_suffix}"
        word_labels.append((word, tok_labels[tok_pos][1]))
        tok_pos += len(hf_tokenizer.tokenize(word))

    return word_labels

In [None]:
nbdev.show_doc(get_word_labels_from_token_labels)

---

[source](https://github.com/ohmeow/blurr/blob/dev-3.0.0 #master/blurr/data/token_classification.py#L210){target="_blank" style="float:right; font-size:smaller"}

### get_word_labels_from_token_labels

>      get_word_labels_from_token_labels (hf_arch:str, hf_tokenizer:transformers
>                                         .tokenization_utils_base.PreTrainedTok
>                                         enizerBase, tok_labels)

Given a list of tuples where each tuple defines a token and its label, return a list of tuples whereby each tuple defines the
"word" and its label. Method assumes that model inputs are a list of words, and in conjunction with the `align_labels_with_tokens` method,
allows the user to reconstruct the orginal raw inputs and labels.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| hf_arch | str |  |
| hf_tokenizer | PreTrainedTokenizerBase | A Hugging Face tokenizer |
| tok_labels |  | A list of tuples, where each represents a token and its label (e.g., [('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG), ...]) |
| **Returns** | **List[Tuple[str, str]]** |  |

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = get_task_hf_objects("microsoft/deberta-v3-small", label_names, verbose=True)

=== config ===
# of labels:	9

=== tokenizer ===
Vocab size:		128000
Max # of tokens:	1000000000000000019884624838656
Attributes expected by model in forward pass:	['input_ids', 'token_type_ids', 'attention_mask']


In [None]:
# TESTS for get_word_labels_from_token_labels()
my_dict = {
    "id": [0, 1],
    "words": [
        ["Wayde", "runs", "ohmeow.com", "from", "California"],
        # NOTE: "time" and "!" must be separate words so that there is one word per label below
        ["Bayern", "Munich", "is", "the", "greatest", "footabll", "team", "of", "all", "time", "!"],
    ],
    "labels": [[1, 0, 3, 0, 5], [3, 4, 0, 0, 0, 7, 8, 0, 0, 0, 0]],
}
test_ds = Dataset.from_dict(my_dict)
test_df = pd.DataFrame(test_ds)

for idx in range(len(test_df)):
    raw_word_list = test_df.iloc[idx]["words"]
    raw_label_list = test_df.iloc[idx]["labels"]

    be = hf_tokenizer(raw_word_list, is_split_into_words=True)
    input_ids = be["input_ids"]
    # every token gets its word's label; special tokens (no word id) get the ignore id
    targ_ids = [-100 if word_id is None else raw_label_list[word_id] for word_id in be.word_ids()]

    tok_labels = get_token_labels_from_input_ids(hf_tokenizer, input_ids, targ_ids, label_names)
    word_labels = get_word_labels_from_token_labels(hf_arch, hf_tokenizer, tok_labels)

    # the reconstructed words and labels should match the raw inputs one-for-one
    for word_label, raw_word, raw_label_id in zip(word_labels, raw_word_list, raw_label_list):
        test_eq(word_label[0], raw_word)
        test_eq(word_label[1], label_names[raw_label_id])

### `TokenClassTextCollatorWithPadding` -

In [None]:
# |export
@dataclass
class TokenClassTextCollatorWithPadding(TextCollatorWithPadding):
    """
    A `TextCollatorWithPadding` for token classification. It pads each example's token-level targets with
    `ignore_token_id` up to the length of the batch's `input_ids`, on the same side the tokenizer pads on.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str = None,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig = None,
        # A Hugging Face model
        hf_model: PreTrainedModel = None,
        # The number of inputs expected by your model
        n_inp: int = 1,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator_cls: type = DataCollatorWithPadding,
        # kwargs specific for the instantiation of the `data_collator`
        data_collator_kwargs: dict = {},
    ):
        self.ignore_token_id = ignore_token_id

        super().__init__(
            hf_tokenizer=hf_tokenizer,
            hf_arch=hf_arch,
            hf_config=hf_config,
            hf_model=hf_model,
            n_inp=n_inp,
            data_collator_cls=data_collator_cls,
            data_collator_kwargs=data_collator_kwargs,
        )

    # used to give the labels/targets the right shape
    def _proc_targets(self, inputs_d, targs):
        """
        Pad every target sequence in `targs` with `ignore_token_id` to the longest `input_ids` length in
        `inputs_d`, and return the batch of targets stacked into a single tensor.
        """
        # the code below comes pretty much straight from the `DataCollatorForTokenClassification` class
        max_seq_length = max(len(input_ids) for input_ids in inputs_d["input_ids"])
        pad_on_right = self.hf_tokenizer.padding_side == "right"

        padded_targs = []
        for trg in targs:
            # normalize each target to a plain list before padding: `tolist()` works for tensors on any device,
            # and `list(...)` keeps tuple/array targets from breaking (or silently changing) the `+` below
            labels = trg.tolist() if torch.is_tensor(trg) else list(trg)
            padding = [self.ignore_token_id] * (max_seq_length - len(labels))
            padded_targs.append(labels + padding if pad_on_right else padding + labels)

        # every padded target is a list at this point, so a single stacking path is all that's needed
        return torch.stack([tensor(lbls) for lbls in padded_targs])

## Base API: Examples

This section demonstrates how you can use standard `Dataset` objects (PyTorch and Hugging Face) to build PyTorch `DataLoader`s

**Note** that most fast.ai specific features such as `DataLoaders.one_batch` and `DataLoader.show_batch` are not available when using PyTorch.

### PyTorch

#### Step 1: HF objects

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = get_task_hf_objects("microsoft/deberta-v3-small", label_names, verbose=True)

=== config ===
# of labels:	9

=== tokenizer ===
Vocab size:		128000
Max # of tokens:	1000000000000000019884624838656
Attributes expected by model in forward pass:	['input_ids', 'token_type_ids', 'attention_mask']


#### Step 2: `Dataset`s (PyTorch)

In [None]:
# define our subword tokenized labeling strategy
labeling_strat = BILabelingStrategy(hf_tokenizer=hf_tokenizer, label_names=label_names)

tokenize_func = partial(
    tokenclass_tokenize_func, hf_tokenizer=hf_tokenizer, labeling_strategy=labeling_strat, words_attr="tokens", word_labels_attr="ner_tags"
)
proc_train_ds = train_ds.map(tokenize_func, batched=True, remove_columns=train_ds.column_names)
proc_valid_ds = valid_ds.map(tokenize_func, batched=True, remove_columns=valid_ds.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# define our PyTorch Dataset class
class HFTokenClassificationDataset(torch.utils.data.Dataset):
    """A thin PyTorch `Dataset` wrapper that serves items straight from an already processed Hugging Face dataset."""

    def __init__(self, hf_dataset, hf_tokenizer):
        # keep the tokenizer next to the data it was used to process
        self.hf_tokenizer = hf_tokenizer
        self.hf_dataset = hf_dataset

    def __len__(self):
        # one item per row of the wrapped dataset
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        # items are returned exactly as the wrapped dataset provides them
        return self.hf_dataset[idx]


# build our PyTorch training and validation Datasets
pt_proc_train_ds = HFTokenClassificationDataset(proc_train_ds, hf_tokenizer=hf_tokenizer)
pt_proc_valid_ds = HFTokenClassificationDataset(proc_valid_ds, hf_tokenizer=hf_tokenizer)

#### Step 3: `DataLoader`s  (PyTorch)

In [None]:
# build your fastai `DataLoaders` from Pytorch `DataLoader` objects
batch_size = 4
data_collator = TokenClassTextCollatorWithPadding(hf_tokenizer)
train_dl = torch.utils.data.DataLoader(pt_proc_train_ds, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
valid_dl = torch.utils.data.DataLoader(pt_proc_valid_ds, batch_size=batch_size * 2, shuffle=False, collate_fn=data_collator)

dls = DataLoaders(train_dl, valid_dl)

In [None]:
print("# of batches in train|validation dataloaders:", len(train_dl), len(valid_dl))

b = next(iter(train_dl))
print("# of items in each batch:", len(b))
print("")
print(f"Decoded input_ids: {hf_tokenizer.decode(b[0]['input_ids'][0][:10])} ... ")
print("Targets:", b[1])

# b

# of batches in train|validation dataloaders: 250 25
# of items in each batch: 2

Decoded input_ids: [CLS] The company said it was not aware of any ... 
Targets: tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100],
        [-100,    0,    0,    0,    5,    0,    0,    0, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
      

In [None]:
# NOPE: Won't work with PyTorch DataLoaders
# AttributeError: 'DataLoader' object has no attribute 'show_batch'
# dls.show_batch(dataloaders=dls, max_n=2, trunc_at=800)

In [None]:
# |echo:false
# release the objects from this example before moving on. Each name is removed independently, so one that
# was never created (e.g., an earlier cell was skipped) neither raises nor keeps the other one alive
for _name in ("dls", "hf_model"):
    globals().pop(_name, None)

clean_memory()

### Hugging Face

#### Step 1: HF objects

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = get_task_hf_objects("microsoft/deberta-v3-small", label_names, verbose=True)

=== config ===
# of labels:	9

=== tokenizer ===
Vocab size:		128000
Max # of tokens:	1000000000000000019884624838656
Attributes expected by model in forward pass:	['input_ids', 'token_type_ids', 'attention_mask']


#### Step 2: `Datasets` (huggingface)

In [None]:
# define our subword tokenized labeling strategy
labeling_strat = BILabelingStrategy(hf_tokenizer=hf_tokenizer, label_names=label_names)

tokenize_func = partial(
    tokenclass_tokenize_func, hf_tokenizer=hf_tokenizer, labeling_strategy=labeling_strat, words_attr="tokens", word_labels_attr="ner_tags"
)
proc_train_ds = train_ds.map(tokenize_func, batched=True, remove_columns=train_ds.column_names)
proc_valid_ds = valid_ds.map(tokenize_func, batched=True, remove_columns=valid_ds.column_names)

print(proc_train_ds)
print(proc_valid_ds)

Loading cached processed dataset at /home/wgilliam/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-96658205b11492a5.arrow


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 1000
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 200
})


#### Step 3: `DataLoader`s (PyTorch)

In [None]:
# build your fastai `DataLoaders` from Pytorch `DataLoader` objects
batch_size = 4
data_collator = TokenClassTextCollatorWithPadding(hf_tokenizer)
train_dl = torch.utils.data.DataLoader(proc_train_ds, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
valid_dl = torch.utils.data.DataLoader(proc_valid_ds, batch_size=batch_size * 2, shuffle=False, collate_fn=data_collator)

dls = DataLoaders(train_dl, valid_dl)

In [None]:
print("# of batches in train|validation dataloaders:", len(train_dl), len(valid_dl))

b = next(iter(train_dl))
print("# of items in each batch:", len(b))
print("")
print(f"Decoded input_ids: {hf_tokenizer.decode(b[0]['input_ids'][0][:10])} ... ")
print("Targets:", b[1])

# b

# of batches in train|validation dataloaders: 250 25
# of items in each batch: 2

Decoded input_ids: [CLS] Quigley, a former medallist in the points event ... 
Targets: tensor([[-100,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100],
        [-100,    0,    0,    0,    0,    0,    1,    0,    0,    0,    0,    0,
            7,    8,    8,    0,    0,    0,    0,    0, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,

In [None]:
# |echo:false
# release the objects from this example before moving on. Each name is removed independently, so one that
# was never created (e.g., an earlier cell was skipped) neither raises nor keeps the other one alive
for _name in ("dls", "hf_model"):
    globals().pop(_name, None)

clean_memory()

## Low-Level API

This section demonstrates how you can migrate from using PyTorch/Hugging Face to fast.ai `Datasets` and `DataLoaders` to recapture much of the fast.ai specific features unavailable when using basic PyTorch. This includes:

- `DataLoaders.one_batch()`
- `DataLoaders.show_batch()`
- `Learner.export()`

### `TokenClassTextInput` -

In [None]:
# |export
class TokenClassTextInput(TextInput):
    """Marker subclass of `TextInput` that lets `@typedispatch`ed functions (e.g., `show_batch`) target token classification inputs"""

    pass

Again, we define a custom class, `TokenClassTextInput`, for the `@typedispatch`ed methods to use so that we can override how token classification inputs/targets are assembled, as well as how the data is shown via methods like `show_batch` and `show_results`.

### `show_batch` -

In [None]:
# |export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `TokenClassTextInput` typed inputs
    x: TokenClassTextInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Display up to `max_n` examples of a batch as a table of (word, target label) pairs and return `ctxs`"""
    # grab our tokenizer
    tfm = first_blurr_tfm(dataloaders)
    hf_arch, hf_tokenizer = tfm.hf_arch, tfm.hf_tokenizer

    # if we've included our labels list, we'll use it to look up the value of our target(s); otherwise
    # fall back to the `DataLoaders` vocab (when it has one)
    trg_labels = tfm.kwargs.get("label_names")
    if trg_labels is None:
        trg_labels = getattr(dataloaders, "vocab", None)

    res = L()

    # never index past the end of the batch: the batch being shown may hold fewer than `bs` items
    n_samples = min(max_n, dataloaders.bs, len(x))
    for idx in range(n_samples):
        input_ids = x[idx]
        trgs = y[idx]

        # align "tokens" with labels
        tok_labels = get_token_labels_from_input_ids(hf_tokenizer, input_ids, trgs, trg_labels)
        # align "words" with labels
        word_labels = get_word_labels_from_token_labels(hf_arch, hf_tokenizer, tok_labels)
        # keep only the first `trunc_at` (word, label) pairs (all of them when `trunc_at` is None)
        shown_pairs = [word_targ for word_idx, word_targ in enumerate(word_labels) if (trunc_at is None or word_idx < trunc_at)]
        # stringify list of (word,label) for example
        res.append([str(shown_pairs)])

    display_df(pd.DataFrame(res, columns=["word / target label"])[:max_n])
    return ctxs

## Low-Level API: Examples

### Using fast.ai `Datasets` and `DataLoaders`

#### Step 1: HF objects

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = get_task_hf_objects("microsoft/deberta-v3-small", label_names, verbose=True)

=== config ===
# of labels:	9

=== tokenizer ===
Vocab size:		128000
Max # of tokens:	1000000000000000019884624838656
Attributes expected by model in forward pass:	['input_ids', 'token_type_ids', 'attention_mask']


#### Step 2: `Datasets` (fast.ai)

In [None]:
# define our subword tokenized labeling strategy
labeling_strat = BILabelingStrategy(hf_tokenizer=hf_tokenizer, label_names=label_names)
tokenize_func = partial(
    tokenclass_tokenize_func, hf_tokenizer=hf_tokenizer, labeling_strategy=labeling_strat, words_attr="tokens", word_labels_attr="ner_tags"
)
proc_conll2003_ds = conll2003_ds.map(tokenize_func, batched=True)

# turn Arrow into DataFrame (`ColSplitter` only works with `DataFrame`s)
train_df = pd.DataFrame(proc_conll2003_ds)

# define dataset splitter
splitter = ColSplitter("is_valid")
splits = splitter(train_df)


# define how we want to build our inputs and targets
def _build_inputs(example):
    """Keep only the entries of `example` that the tokenizer lists in `model_input_names`"""
    return {fwd_arg_name: example[fwd_arg_name] for fwd_arg_name in hf_tokenizer.model_input_names if fwd_arg_name in example}


def _build_targets(example):
    """Return the "label" entry of `example` as the target"""
    return example["label"]


# create our fastai `Datasets` object
dsets = Datasets(items=train_df, splits=splits, tfms=[[_build_inputs], _build_targets], n_inp=1)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [None]:
print("Items in train|validation datasets: ", len(dsets.train), len(dsets.valid))

# each item is an (inputs, targets) pair produced by `_build_inputs` / `_build_targets`
example = dsets.valid[0]
# example

print(f"Items in each example: {len(example)}")
print(f"Example inputs: {list(example[0].keys())}")
print(f"Example target(s): {example[1]}")

Items in train|validation datasets:  1000 200
Items in each example: 2
Example inputs: ['input_ids', 'token_type_ids', 'attention_mask']
Example target(s): [-100, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


#### Step 3: `DataLoaders` (fast.ai)

In [None]:
# collates individual examples into a batch (passed as `create_batch` below)
data_collator = TokenClassTextCollatorWithPadding(hf_tokenizer)
# sort key handed to `SortedDL` below
sort_func = partial(sorted_dl_func, hf_tokenizer=hf_tokenizer)
# carries the Hugging Face objects and `label_names` with the `DataLoaders`; `show_batch` looks them up
# through `first_blurr_tfm` (presumably this is the transform it finds -- verify against `blurr.data.core`)
batch_decode_tfm = BatchDecodeTransform(
    hf_tokenizer, hf_arch, hf_config, hf_model, label_names=label_names, input_return_type=TokenClassTextInput
)

dls = dsets.dataloaders(
    batch_size=4,
    create_batch=data_collator,
    after_batch=batch_decode_tfm,
    dl_type=partial(SortedDL, sort_func=sort_func),
)

In [None]:
# Inspect the fast.ai `DataLoaders` built in the previous cell (not the PyTorch loaders from the
# earlier section): batch counts first, then the contents of one batch.
print("# of batches in train|validation dataloaders:", len(dls.train), len(dls.valid))

# a batch is indexed below as (inputs dict, targets)
b = dls.one_batch()
print("# of items in each batch:", len(b))
print("")
# decode only the first 10 token ids of the first example to keep the output short
print(f"Decoded input_ids: {hf_tokenizer.decode(b[0]['input_ids'][0][:10])} ... ")
print("Targets:", b[1])

# b

# of batches in train|validation dataloaders: 250 25
# of items in each batch: 2

Decoded input_ids: [CLS] Boston 7 CALIFORNIA 4[SEP][PAD][PAD][PAD][PAD] ... 
Targets: tensor([[-100,    3,    0,    3,    0, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,


In [None]:
# `dataloaders=dls` is handed to the typedispatched `show_batch`, which needs it to reach the tokenizer and label names
dls.show_batch(dataloaders=dls, max_n=4)

Unnamed: 0,word / target label
0,"[('LONDON', 'B-LOC'), ('1996-08-26', 'O')]"
1,"[('soccer', 'O'), ('matches', 'O'), ('played', 'O'), ('on', 'O'), ('Sunday', 'O'), (':', 'O')]"
2,"[('Bolton', 'B-ORG'), ('3', 'O'), ('2', 'O'), ('1', 'O'), ('0', 'O'), ('5', 'O'), ('2', 'O'), ('7', 'O')]"
3,"[('Larry', 'B-PER'), ('Fine', 'I-PER')]"


In [None]:
# |echo:false
# Release the objects created in this section before moving on. Each name is removed on its
# own so that a missing `dls` cannot prevent `hf_model` from being dropped, and no unrelated
# exception is silently swallowed along the way.
globals().pop("dls", None)
globals().pop("hf_model", None)

clean_memory()

## Mid-Level API

BLURR's mid-level API provides a way to build your `DataLoaders` using fast.ai's mid-level `DataBlock` API.  

For token classification tasks, BLURR supports **two** ways of doing this in the mid-level API: 

1. Using pre-tokenized data (the traditional approach)

2. Batch-time tokenization (the default approach in previous versions of BLURR)

### Targets -

#### `TokenTensorCategory` -

In [None]:
# |export
class TokenTensorCategory(TensorBase):
    """Tensor type for per-token category ids; `TokenCategorize.encodes` returns instances of it"""

    pass

#### `TokenCategorize` -

In [None]:
# |export
class TokenCategorize(Transform):
    """Reversible transform of a list of category string to `vocab` id"""

    def __init__(
        self,
        # The unique list of entities (e.g., B-LOC) (default: CategoryMap(vocab))
        vocab: Optional[List[str]] = None,
        # The token used to identify ignored tokens (default: xIGNx)
        ignore_token: str = "[xIGNx]",
        # The token ID that should be ignored when calculating the loss (default: CrossEntropyLossFlat().ignore_index)
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
    ):
        # `sort=False` keeps the labels in the order given so that ids match the caller's label list
        self.vocab = None if vocab is None else CategoryMap(vocab, sort=False)
        self.ignore_token, self.ignore_token_id = ignore_token, ignore_token_id

        self.loss_func, self.order = CrossEntropyLossFlat(ignore_index=self.ignore_token_id), 1

    def setups(self, dsets):
        """Build the vocab from `dsets` when none was supplied, then record the number of classes in `self.c`"""
        if self.vocab is None and dsets is not None:
            self.vocab = CategoryMap(dsets)
        self.c = len(self.vocab)

    def encodes(self, labels):
        """Turn a list of labels (names and/or integer ids) into a `TokenTensorCategory` of ids"""
        # if `val` is the label name (e.g., B-PER, I-PER, etc...), lookup the corresponding index in the vocab using
        # `self.vocab.o2i`; anything that is already an integer id is passed through untouched. NumPy integers
        # (e.g., `np.int64` coming from array-backed columns) are not `int` instances, so they are checked explicitly.
        ids = [val if isinstance(val, (int, np.integer)) else self.vocab.o2i[val] for val in labels]
        return TokenTensorCategory(ids)

    def decodes(self, encoded_labels):
        """Map ids back to their label names, leaving out every position equal to `ignore_token_id`"""
        return Category([(self.vocab[lbl_id]) for lbl_id in encoded_labels if lbl_id != self.ignore_token_id])

`TokenCategorize` modifies the fastai `Categorize` transform in a couple of ways.

First, it allows your targets to consist of a `Category` *per* token, and second, it uses the idea of an `ignore_token_id` to mask subtokens that don't need a prediction. For example, the targets of special tokens (e.g., pad, cls, sep) are set to `ignore_token_id`, as are those of the subsequent sub-tokens of a given token when it is made up of more than one sub-token.

#### `TokenCategoryBlock` -

In [None]:
# |export
def TokenCategoryBlock(
    # The unique list of entities (e.g., B-LOC) (default: CategoryMap(vocab))
    vocab: Optional[List[str]] = None,
    # The token used to identify ignored tokens (default: xIGNx)
    ignore_token: str = "[xIGNx]",
    # The token ID that should be ignored when calculating the loss (default: CrossEntropyLossFlat().ignore_index)
    ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
):
    """`TransformBlock` for per-token categorical targets"""
    # the block's only type transform is a `TokenCategorize` configured with the arguments given here
    categorize_tfm = TokenCategorize(
        vocab=vocab,
        ignore_token=ignore_token,
        ignore_token_id=ignore_token_id,
    )
    return TransformBlock(type_tfms=categorize_tfm)

In [None]:
nbdev.show_doc(TokenCategoryBlock, title_level=3)

---

[source](https://github.com/ohmeow/blurr/blob/dev-3.0.0 #master/blurr/data/token_classification.py#L386){target="_blank" style="float:right; font-size:smaller"}

### TokenCategoryBlock

>      TokenCategoryBlock (vocab:Optional[List[str]]=None,
>                          ignore_token:str='[xIGNx]', ignore_token_id:int=-100)

`TransformBlock` for per-token categorical targets

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| vocab | Optional[List[str]] | None | The unique list of entities (e.g., B-LOC) (default: CategoryMap(vocab)) |
| ignore_token | str | [xIGNx] | The token used to identifiy ignored tokens (default: xIGNx) |
| ignore_token_id | int | -100 | The token ID that should be ignored when calculating the loss (default: CrossEntropyLossFlat().ignore_index) |

### Inputs -

#### `TokenClassBatchTokenizeTransform` - 

In [None]:
# |export
class TokenClassBatchTokenizeTransform(BatchTokenizeTransform):
    """Batch-time tokenization for token classification that aligns each example's word labels with its tokens"""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The labeling strategy you want to apply when associating labels with word tokens
        labeling_strategy_cls: BaseLabelingStrategy = BILabelingStrategy,
        # the target label names
        target_label_names: Optional[List[str]] = None,
        # the label for non-entity
        non_entity_label: str = "O",
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: Optional[int] = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = True,
        # If using a slow tokenizer, users will need to provide a `slow_word_ids_func` that accepts a
        # tokenizer, example index, and a batch encoding as arguments and in turn returns the
        # equivalent of a fast tokenizer's `word_ids`
        slow_word_ids_func: Optional[Callable] = None,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `TokenClassBatchTokenizeTransform`
        **kwargs,
    ):

        super().__init__(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            include_labels=include_labels,
            ignore_token_id=ignore_token_id,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=is_split_into_words,
            # hand over a private copy so the shared default dict (or the caller's dict) can never be
            # modified through this instance
            tok_kwargs=dict(tok_kwargs or {}),
            **kwargs,
        )

        self.target_label_names = target_label_names
        self.non_entity_label = non_entity_label
        self.slow_word_ids_func = slow_word_ids_func

        self.labeling_strategy = labeling_strategy_cls(
            hf_tokenizer, label_names=self.target_label_names, non_entity_label=self.non_entity_label, ignore_token_id=ignore_token_id
        )

    def encodes(self, samples, return_batch_encoding=False):
        """Tokenize `samples` via the parent transform, then replace each target with its token-aligned labels.

        Returns the updated samples, or `(updated samples, batch encoding)` when `return_batch_encoding` is True.
        """
        encoded_samples, inputs = super().encodes(samples, return_batch_encoding=True)

        # nothing to align for an empty batch or if there are no targets (e.g., when used for inference);
        # `return_batch_encoding` is honored here as well so callers always get the shape they asked for
        if len(encoded_samples) == 0 or len(encoded_samples[0]) == 1:
            if return_batch_encoding:
                return encoded_samples, inputs
            return encoded_samples

        # slow tokenizers cannot supply `word_ids`, so fail with an actionable message instead of
        # "'NoneType' object is not callable"
        if not self.hf_tokenizer.is_fast and self.slow_word_ids_func is None:
            raise ValueError("A `slow_word_ids_func` is required when `hf_tokenizer` is not a fast tokenizer")

        # get the type of our targets (by default will be TokenTensorCategory)
        target_cls = type(encoded_samples[0][1])

        updated_samples = []
        for idx, s in enumerate(encoded_samples):
            # with batch-time tokenization, we have to align each token with the correct label using the `word_ids` in the
            # batch encoding object we get from calling our *fast* tokenizer
            word_ids = inputs.word_ids(idx) if self.hf_tokenizer.is_fast else self.slow_word_ids_func(self.hf_tokenizer, idx, inputs)
            targ_ids = target_cls(self.labeling_strategy.align_labels_with_tokens(word_ids, s[-1].tolist()))

            # also place the aligned labels in the inputs dict when labels are to be included
            if self.include_labels and len(targ_ids) > 0:
                s[0]["label"] = targ_ids

            updated_samples.append((s[0], targ_ids))

        if return_batch_encoding:
            return updated_samples, inputs

        return updated_samples

`TokenClassBatchTokenizeTransform` is used to exclude any of the target's tokens we don't want to include in the loss calculation (e.g. padding, cls, sep, etc...).

Note also that we default `is_split_into_words = True` since token classification tasks expect a list of words and labels for each word.

## Mid-Level API: Examples

### Pretokenized

#### Step 1: HF objects

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = get_task_hf_objects("microsoft/deberta-v3-small", label_names, verbose=True)

=== config ===
# of labels:	9

=== tokenizer ===
Vocab size:		128000
Max # of tokens:	1000000000000000019884624838656
Attributes expected by model in forward pass:	['input_ids', 'token_type_ids', 'attention_mask']


####  Step 2: `DataBlock`

In [None]:
# define DataBlock splitter
def _split_func(example):
    """Return whether `example` is flagged as belonging to the validation set"""
    return example["is_valid"] == True


splitter = FuncSplitter(_split_func)


# define how we want to build our targets
# note: we don't need to define how to build our inputs because we're using an HF `Dataset` in this example
def get_y(example):
    """Return the "label" entry of `example` as the target"""
    return example["label"]


# define the DataBlock
data_collator = TokenClassTextCollatorWithPadding(hf_tokenizer)

txt_block = TextBlock(
    hf_arch=hf_arch,
    hf_config=hf_config,
    hf_tokenizer=hf_tokenizer,
    hf_model=hf_model,
    input_return_type=TokenClassTextInput,
    data_collator=data_collator,
    batch_decode_kwargs={"label_names": label_names},
)

# the targets are used exactly as `get_y` returns them, hence `noop` as the target block
blocks = (txt_block, noop)
dblock = DataBlock(blocks=blocks, get_y=get_y, splitter=splitter)

#### Step 3: `DataLoaders`

In [None]:
# define our subword tokenized labeling strategy
labeling_strat = BILabelingStrategy(hf_tokenizer=hf_tokenizer, label_names=label_names)
tokenize_func = partial(
    tokenclass_tokenize_func, hf_tokenizer=hf_tokenizer, labeling_strategy=labeling_strat, words_attr="tokens", word_labels_attr="ner_tags"
)
# tokenize the whole dataset up front (the "pretokenized" approach); with `batched=True` the
# examples are handed to `tokenize_func` in batches
proc_conll2003_ds = conll2003_ds.map(tokenize_func, batched=True)

Loading cached processed dataset at /home/wgilliam/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-24f08770ab2d21c9.arrow


In [None]:
dls = dblock.dataloaders(proc_conll2003_ds, bs=4)

In [None]:
# a batch is indexed below as (inputs dict, targets)
b = dls.one_batch()
print("# of items in each batch:", len(b))
print("# of inputs in each batch:", len(b[0]["input_ids"]))
print("# of targets in each batch:", len(b[1]))
print("Shape of input_ids (bsz, seq):", b[0]["input_ids"].shape)

# of items in each batch: 2
# of inputs in each batch: 4
# of targets in each batch: 4
Shape of input_ids (bsz, seq): torch.Size([4, 74])


Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)

{tuple: [dict, torch.Tensor]}

In [None]:
# `dataloaders=dls` is handed to the typedispatched `show_batch`, which needs it to reach the tokenizer and label names
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,word / target label
0,"[('LONDON', 'B-LOC'), ('1996-08-26', 'O')]"
1,"[('It', 'O'), ('said', 'O'), ('another', 'O'), (',', 'O'), ('the', 'O'), ('Cometra', 'B-MISC'), ('et', 'I-MISC'), ('al', 'I-MISC'), ('Morinville', 'I-MISC'), ('11-13', 'I-MISC'), (',', 'O'), ('has', 'O'), ('logged', 'O'), ('128', 'O'), ('feet', 'O'), ('of', 'O'), ('productive', 'O'), ('Leduc', 'B-LOC'), ('Reef', 'I-LOC'), ('at', 'O'), ('5,400', 'O'), ('feet', 'O'), ('and', 'O'), ('is', 'O'), ('flowing', 'O'), ('water', 'O'), ('free', 'O'), ('at', 'O'), ('the', 'O'), ('rate', 'O'), ('of', 'O'), ('590', 'O'), ('barrels', 'O'), ('of', 'O'), ('oil', 'O'), ('per', 'O'), ('day', 'O'), ('on', 'O'), ('a', 'O'), ('15', 'O'), ('/', 'O'), ('64ths-inch', 'O'), ('choke', 'O'), ('.', 'O')]"


In [None]:
# |echo:false
# Release the objects created in this section before moving on. Each name is removed on its
# own so that a missing `dls` cannot prevent `hf_model` from being dropped, and no unrelated
# exception is silently swallowed along the way.
globals().pop("dls", None)
globals().pop("hf_model", None)

clean_memory()

### Batch-Time Tokenization

#### Step 1: HF objects.

In [None]:
# | output: false
hf_arch, hf_config, hf_tokenizer, hf_model = get_task_hf_objects("microsoft/deberta-v3-small", label_names, verbose=True)

=== config ===
# of labels:	9

=== tokenizer ===
Vocab size:		128000
Max # of tokens:	1000000000000000019884624838656
Attributes expected by model in forward pass:	['input_ids', 'token_type_ids', 'attention_mask']


####  Step 2: `DataBlock`

In [None]:
# batch-time tokenization: this transform tokenizes the raw words of each batch and aligns the
# word-level labels with the resulting tokens
tokenize_tfm = TokenClassBatchTokenizeTransform(
    hf_arch, hf_config, hf_tokenizer, hf_model, labeling_strategy_cls=BILabelingStrategy, target_label_names=label_names
)

# inputs are the raw "tokens" (words) column and targets the per-word "ner_tags" column
blocks = (TextBlock(tokenize_tfm=tokenize_tfm, input_return_type=TokenClassTextInput), TokenCategoryBlock(vocab=label_names))
dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader("tokens"),
    get_y=ColReader("ner_tags"),
    # NOTE(review): no column name is given here (earlier cells pass "is_valid" explicitly) -- presumably
    # `ColSplitter` defaults to that column; confirm against the fastai docs
    splitter=ColSplitter(),
)

In [None]:
# |hide
# dblock.summary(conll2003_df)

#### Step 3: `DataLoaders`

In [None]:
dls = dblock.dataloaders(conll2003_df, bs=4)

In [None]:
b = dls.one_batch()
# (items per batch, number of examples, shape of `input_ids`, shape of the targets)
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, b[1].shape

(2, 4, torch.Size([4, 69]), torch.Size([4, 69]))

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)

{tuple: [dict, __main__.TokenTensorCategory]}

In [None]:
# `trunc_at` caps the number of (word, label) pairs shown per example
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)

Unnamed: 0,word / target label
0,"[('""', 'O'), ('We', 'O'), ('have', 'O'), ('no', 'O'), ('doubt', 'O'), ('that', 'O'), ('this', 'O'), ('is', 'O'), ('one', 'O'), ('of', 'O'), ('the', 'O'), ('rarest', 'O'), ('of', 'O'), ('the', 'O'), ('rare', 'O'), ('cases', 'O'), (',', 'O'), ('not', 'O'), ('merely', 'O'), ('due', 'O'), ('to', 'O'), ('the', 'O'), ('number', 'O'), ('of', 'O'), ('innocent', 'O'), ('human', 'O'), ('beings', 'O'), ('roasted', 'O'), ('alive', 'O'), ('by', 'O'), ('the', 'O'), ('appellants', 'O'), (',', 'O'), ('but', 'O'), ('the', 'O'), ('inhuman', 'O'), ('manner', 'O'), ('in', 'O'), ('which', 'O'), ('they', 'O'), ('plotted', 'O'), ('the', 'O'), ('scheme', 'O'), ('and', 'O'), ('executed', 'O'), ('it', 'O'), (',', 'O'), ('""', 'O'), ('Justice', 'B-PER'), ('K.T.', 'I-PER'), ('Thomas', 'I-PER'), ('said', 'O'), ('in', 'O'), ('the', 'O'), ('verdict', 'O'), ('by', 'O'), ('a', 'O'), ('panel', 'O'), ('of', 'O'), ('three', 'O'), ('judges', 'O'), ('.', 'O')]"
1,"[('The', 'O'), ('credibility', 'O'), ('of', 'O'), ('the', 'O'), ('Buenos', 'B-LOC'), ('Aires', 'I-LOC'), ('provincial', 'O'), ('police', 'O'), (',', 'O'), ('the', 'O'), ('largest', 'O'), ('force', 'O'), ('in', 'O'), ('Argentina', 'B-LOC'), (',', 'O'), ('has', 'O'), ('been', 'O'), ('undermined', 'O'), ('this', 'O'), ('year', 'O'), ('by', 'O'), ('scandals', 'O'), ('that', 'O'), ('included', 'O'), ('the', 'O'), ('indictment', 'O'), ('of', 'O'), ('three', 'O'), ('officers', 'O'), ('for', 'O'), ('links', 'O'), ('to', 'O'), ('the', 'O'), ('1994', 'O'), ('bombing', 'O'), ('of', 'O'), ('a', 'O'), ('Jewish', 'B-MISC'), ('community', 'O'), ('centre', 'O'), ('and', 'O'), ('the', 'O'), ('arrest', 'O'), ('of', 'O'), ('an', 'O'), ('entire', 'O'), ('drugs', 'O'), ('squad', 'O'), ('for', 'O'), ('drug', 'O'), ('trafficking', 'O'), ('.', 'O')]"


In [None]:
# |echo:false
# Release the objects created in this section before moving on. Each name is removed on its
# own so that a missing `dls` cannot prevent `hf_model` from being dropped, and no unrelated
# exception is silently swallowed along the way.
globals().pop("dls", None)
globals().pop("hf_model", None)

clean_memory()

## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained token classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained token classification models you are working with ... and if any of your pretrained token classification models fail, please submit a GitHub issue *(or a PR if you'd like to fix it yourself)*

## Export -

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()