In [None]:
# default_exp data.core


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by Hugging Face transformer implementations.

In [None]:
# export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import DataCollatorWithPadding, logging, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel

from blurr.utils import BLURR

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from datasets import load_dataset, concatenate_datasets
from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock
from fastai.data.external import untar_data, URLs
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

# NOTE(review): presumably set to stop the Hugging Face `tokenizers` library from using parallelism
# in this session — confirm against the tokenizers documentation
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# record the library versions in use when this documentation was generated
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")



What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.15.0


In [None]:
# hide
# cuda
# make the CUDA device at index 1 the current device for this session, then report which device is active
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Mid-level API: Base tokenization, batch transform, and DataBlock methods

In [None]:
# export
class HF_BaseInput(TensorBase):
    """The base representation of your inputs; used by the various fastai `show` methods"""

    def show(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The "context" associated to the current `show_batch/results` call
        ctx=None,
        # Any truncation you want to apply to the decoded tokenized inputs
        trunc_at: int = None,
        # A decoded string of your tokenized inputs (input_ids)
    ) -> str:
        """Decode this tensor of token ids back into text and pass it to `show_title` for display"""
        # move the ids to the CPU and convert to numpy before handing them to the tokenizer
        token_ids = self.cpu().numpy()
        text = hf_tokenizer.decode(token_ids, skip_special_tokens=True)
        # slicing with `trunc_at=None` keeps the whole string (i.e., no truncation)
        shown_text = str(text)[:trunc_at]

        return show_title(shown_text, ctx=ctx, label="text")


A `HF_BaseInput` object is returned from the decodes method of `HF_AfterBatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`. It uses the "input_ids" of a Hugging Face object as the representative tensor for `show` methods

In [None]:
# export
class HF_BeforeBatchTransform(Transform):
    """
    Handles everything you need to assemble a mini-batch of inputs and targets, as well as
    decode the dictionary produced as a byproduct of the tokenization process in the `encodes` method.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `HF_BeforeBatchTransform`
        **kwargs
    ):
        # Keep a private (shallow) copy: without it every instance built with the default would alias the
        # single shared `{}` default object (or the caller's dict), so a later mutation on one instance
        # would silently leak into the others. `None` is tolerated and treated as "no extra arguments".
        tok_kwargs = dict(tok_kwargs) if tok_kwargs else {}

        # `store_attr` is called after the reassignment above so that the copy is what ends up on `self`
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model")
        store_attr(self=self, names="max_length, padding, truncation, is_split_into_words, ignore_token_id, tok_kwargs")
        store_attr(self=self, names="kwargs")

    def encodes(self, samples, return_batch_encoding=False):
        """
        This method performs on-the-fly, batch-time tokenization of your data. In other words, your raw inputs
        are tokenized as needed for each mini-batch of data rather than requiring pre-tokenization of your full
        dataset ahead of time.

        Returns a list with one tuple per tokenized sequence: the dictionary of tokenizer outputs for that
        sequence followed by the remaining (non-input) elements of the originating sample. When
        `return_batch_encoding` is `True`, the tokenizer's batch output is returned as a second value.
        """
        samples = L(samples)

        # grab inputs: a listy first element that is not pre-tokenized is treated as a pair of sequences
        if is_listy(samples[0][0]) and not self.is_split_into_words:
            inps = list(zip(samples.itemgot(0, 0), samples.itemgot(0, 1)))
        else:
            inps = samples.itemgot(0).items

        # tokenize the whole mini-batch in a single call
        tok_d = self.hf_tokenizer(
            inps,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
            **self.tok_kwargs
        )

        # update the samples with tokenized inputs (e.g. input_ids, attention_mask, etc...), ensuring that if
        # the tokenizer returned an "overflow_to_sample_mapping" we include each sample chunk
        d_keys = tok_d.keys()
        updated_samples = []
        if "overflow_to_sample_mapping" in d_keys:
            # each tokenized chunk is paired with the targets of the raw sample it was produced from
            for idx, seq_idx in enumerate(tok_d["overflow_to_sample_mapping"]):
                s = (*[{k: tok_d[k][idx] for k in d_keys}], *samples[seq_idx][1:])
                updated_samples.append(s)
        else:
            updated_samples = [(*[{k: tok_d[k][idx] for k in d_keys}], *sample[1:]) for idx, sample in enumerate(samples)]

        if return_batch_encoding:
            return updated_samples, tok_d

        return updated_samples


`HF_BeforeBatchTransform` was inspired by this [article](https://docs.fast.ai/tutorial.transformers.html).

Inputs can come in as a string or a list of tokens, the latter being for tasks like Named Entity Recognition (NER), where you want to predict the label of each token.

**On-the-fly Batch-Time Tokenization**: 

The previous version of the library performed the tokenization/numericalization as a type transform when the raw data was read, and included a couple batch transforms to prepare the data for collation (e.g., to be made into a mini-batch). With this update, everything is done in a single batch transform.  

Why?  Part of the inspiration had to do with the mechanics of the Hugging Face tokenizer, in particular how by default it returns a collated mini-batch of data given a list of sequences. And where do we get a list of examples with fastai? In the batch transforms!  So I thought, hey, why not do everything dynamically at batch time?  And with a bit of tweaking, I got everything to work pretty well.  The result is *less code*, *faster mini-batch creation*, *less RAM utilization* and time spent tokenizing (really helps with very large datasets), and *more flexibility*.

In [None]:
# export
class HF_AfterBatchTransform(Transform):
    """Casts the tokenized inputs of a batch into a type the fastai `show` methods know how to display"""

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
    ):
        store_attr(self=self, names="hf_tokenizer, input_return_type")

    def decodes(
        self,
        # The encoded samples for your batch. `input_ids` will be pulled out of your dictionary of Hugging Face
        # inputs, cast to `self.input_return_type` and returned for methods such as `show_batch`
        encoded_samples: Type,
    ):
        """Returns the proper object and data for show related fastai methods"""
        # anything that is not a dictionary of tokenizer outputs is handed back untouched
        if not isinstance(encoded_samples, dict):
            return encoded_samples

        input_ids = encoded_samples["input_ids"]
        return self.input_return_type(input_ids, hf_tokenizer=self.hf_tokenizer)


With fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here, `HF_AfterBatchTransform`, that will do the decoding for us.

In [None]:
# export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """Sort key used by `SortedDL`: the length of an example's input measured in tokens"""
    raw_input = example[0]
    # pre-tokenized inputs are measured as they are; everything else is tokenized first
    tokens = raw_input if is_split_into_words else hf_tokenizer.tokenize(raw_input, **tok_kwargs)
    return len(tokens)


In [None]:
# export
@delegates(TfmdDL)
class OverflowDL(SortedDL):
    """
    A `SortedDL` that buffers collated items in `self.batch_items` and serves them back in slices of `self.bs`.

    The size of the buffer is measured by the length of the entry stored under `overflow_map_key` in the first
    element of the batch, which is therefore expected to be a dictionary containing that key.
    """

    def __init__(self, dataset, sort_func=None, res=None, overflow_map_key="overflow_to_sample_mapping", **kwargs):
        super().__init__(dataset, sort_func=sort_func, res=res, **kwargs)
        # key (in the first batch element) whose values map each item back to its originating sample
        self.overflow_map_key = overflow_map_key
        # buffer of collated-but-not-yet-served items; `None` until the first batch is added
        self.batch_items = None

    def create_batches(self, samps):
        """Yield batches of `self.bs` items for as long as the buffer holds at least that many"""
        if self.dataset is not None:
            self.it = iter(self.dataset)
        # drop any items for which `do_item` returned `None`
        res = filter(lambda o: o is not None, map(self.do_item, samps))

        # `do_batch` returns nothing here (`b` is unused); it is called for its side effect of filling the buffer
        for b in map(self.do_batch, self.chunkify(res)):
            # NOTE(review): only full slices are yielded; anything left in the buffer below `self.bs` once the
            # chunks are exhausted is not served by this method — confirm this is intended
            while self._n_batch_items() >= self.bs:
                yield self._get_batch()

    def do_batch(self, b):
        """Collate `b` with the parent implementation and append the result to the buffer (returns `None`)"""
        b = super().do_batch(b)
        self._add_batch(b)

    def _add_batch(self, b):
        """Append the collated batch `b` to `self.batch_items` and grow `self.n` by the number of extra items"""
        if not self.batch_items:
            # nothing buffered yet, so the batch itself becomes the buffer
            self.batch_items = b
        else:
            for i in range(len(b)):
                if isinstance(b[i], dict):
                    # dictionaries are concatenated key by key (iterating over the keys already buffered)
                    for k in self.batch_items[i].keys():
                        self.batch_items[i][k] = torch.cat([self.batch_items[i][k], b[i][k]])
                else:
                    # everything else is concatenated and written back through `.data`
                    self.batch_items[i].data = torch.cat([self.batch_items[i], b[i]])

        # update "n" to reflect the additional samples: every value in the overflow map that occurs more
        # than once contributes (occurrences - 1) additional items
        overflow_map = b[0][self.overflow_map_key].numpy()
        self.n += np.sum([i - 1 for i in Counter(overflow_map).values()])

    def _get_batch(self):
        """Remove the first `self.bs` items from every element of the buffer and return them as a tuple"""
        chunked_batch = []

        for i in range(len(self.batch_items)):
            if isinstance(self.batch_items[i], dict):
                chunked_d = {}
                for k in self.batch_items[i].keys():
                    # serve the head of the buffer and keep the remainder for a later batch
                    chunked_d[k] = self.batch_items[i][k][: self.bs]
                    self.batch_items[i][k] = self.batch_items[i][k][self.bs :]

                chunked_batch.append(chunked_d)
            else:
                chunked_batch.append(self.batch_items[i][: self.bs])
                self.batch_items[i].data = self.batch_items[i][self.bs :]

        return tuplify(chunked_batch)

    def _n_batch_items(self):
        """Number of buffered items (0 when nothing is buffered), measured on the overflow map entry"""
        return len(self.batch_items[0][self.overflow_map_key]) if self.batch_items else 0

    def _one_pass(self):
        """Run the item at index 0 through the batch pipeline to set `self._n_inp` and `self._types`"""
        self.do_batch([self.do_item(0)])
        b = self._get_batch()
        if self.device is not None:
            b = to_device(b, self.device)
        its = self.after_batch(b)
        # a single (or non-sequence) result counts as one input; otherwise the last element is not an input
        self._n_inp = 1 if not isinstance(its, (list, tuple)) or len(its) == 1 else len(its) - 1
        self._types = explode_types(its)


In [None]:
# export
class HF_TextBlock(TransformBlock):
    """The core `TransformBlock` to prepare your data for training in Blurr with fastai's `DataBlock` API"""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch: str = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model: PreTrainedModel = None,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # The before batch transform you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `HF_BeforeBatchTransform` created using the Hugging Face objects defined above)
        before_batch_tfm: HF_BeforeBatchTransform = None,
        # The batch_tfms to apply to the creation of your DataLoaders,
        # (defaults to HF_AfterBatchTransform created using the Hugging Face objects defined above)
        after_batch_tfm: HF_AfterBatchTransform = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: DataLoader = None,
        # Any keyword arguments you want applied to your before batch tfm
        before_batch_kwargs: dict = {},
        # Any keyword arguments you want applied to your after batch tfm (or referred to in fastai as `batch_tfms`)
        after_batch_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied with generating text
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `HF_TextBlock`
        **kwargs
    ):
        # either all four Hugging Face objects or a ready-made before batch transform must be supplied
        if (not all([hf_arch, hf_config, hf_tokenizer, hf_model])) and before_batch_tfm is None:
            raise ValueError(
                """You must supply the Hugging Face architecture, config, tokenizer, and model
                - or - an instances of HF_BeforeBatchTransform"""
            )

        if before_batch_tfm is None:
            # if allowing overflow, we have to ensure mixed batch items are the same shape
            if "return_overflowing_tokens" in tok_kwargs:
                padding = "max_length"

            # the keyword dictionaries are copied so the (shared) defaults are never handed to the transform
            before_batch_tfm = HF_BeforeBatchTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                ignore_token_id=ignore_token_id,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **before_batch_kwargs.copy()
            )

        if after_batch_tfm is None:
            after_batch_tfm = HF_AfterBatchTransform(
                hf_tokenizer=before_batch_tfm.hf_tokenizer, input_return_type=input_return_type, **after_batch_kwargs.copy()
            )

        if dl_type is None:
            # sort on the tokenized length, using the same tokenizer settings as the before batch transform
            dl_sort_func = partial(
                blurr_sort_func,
                hf_tokenizer=before_batch_tfm.hf_tokenizer,
                is_split_into_words=before_batch_tfm.is_split_into_words,
                tok_kwargs=before_batch_tfm.tok_kwargs.copy(),
            )

            # `OverflowDL` is a `DataLoader` that knows how to serve batches of items that are created on the fly as a result
            # of asking the tokenizer to return an input in chunks if the length > max_length
            if "return_overflowing_tokens" in before_batch_tfm.tok_kwargs:
                dl_type = partial(OverflowDL, sort_func=dl_sort_func)
            else:
                # the partial has to be assigned, otherwise `dl_type` stays `None` and the documented
                # `SortedDL` default (with its token-length sort) is never used
                dl_type = partial(SortedDL, sort_func=dl_sort_func)

        # set the TransformBlock's Hugging Face objects
        self.hf_arch = before_batch_tfm.hf_arch
        self.hf_config = before_batch_tfm.hf_config
        self.hf_tokenizer = before_batch_tfm.hf_tokenizer
        self.hf_model = before_batch_tfm.hf_model

        super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": before_batch_tfm}, batch_tfms=after_batch_tfm)



A basic wrapper that links default transforms for the data block API

`HF_TextBlock` is designed with sensible defaults to minimize user effort in defining their transforms pipeline. It handles setting up your `HF_BeforeBatchTransform` and `HF_AfterBatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). 

You must either pass in your own instance of a `HF_BeforeBatchTransform` class or the Hugging Face architecture, configuration, tokenizer, and model via `hf_arch`, `hf_config`, `hf_tokenizer`, and `hf_model` (the other args are optional).

## Low-level API: For working with PyTorch and/or fast.ai Datasets & DataLoaders

Below is a low-level API for working with basic PyTorch/Hugging Face Datasets and DataLoaders that allows you to get back fast.ai specific features such as `show_batch`, `show_results`, etc...

In [None]:
# export
@dataclass
class BlurrBatchCreator:
    """
    A callable that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to collate a list of examples into a batch that can be used in the Blurr library
    """

    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        self.hf_tokenizer = hf_tokenizer
        if data_collator:
            self.data_collator = data_collator
        else:
            self.data_collator = DataCollatorWithPadding(tokenizer=hf_tokenizer)

    def __call__(self, features):  # A mini-batch (list of examples to run through your model)
        """Collate `features` with `self.data_collator`.

        When the examples are dictionaries a two element tuple is returned: the collated batch as a plain
        `dict`, followed by the collated `labels` if the examples define them, or otherwise by a second
        `dict` copy of the collated batch. For any other kind of example the collated batch is returned as is.
        """
        collated = self.data_collator(features)
        first_example = features[0]

        if not isinstance(first_example, dict):
            return collated

        inputs = dict(collated)
        # NOTE(review): when the examples carry no `labels` the second element is another copy of the inputs
        # rather than being left out — confirm this is the intended behavior
        if "labels" in first_example:
            targets = collated["labels"]
        else:
            targets = dict(collated)

        return inputs, targets


In [None]:
# export
class BlurrBatchTransform(HF_AfterBatchTransform):
    """
    The `HF_AfterBatchTransform` used by Blurr's low-level API; in addition to casting your inputs into something
    understandable in fastai `show` methods, it carries the Hugging Face objects and tokenization settings
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str = None,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model
        hf_model: PreTrainedModel = None,
        # The token ID to ignore when calculating loss/metrics
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Any text generation keyword arguments
        text_gen_kwargs: dict = {},
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # Any other keyword arguments you want stored on the transform (available afterwards as `self.kwargs`)
        **kwargs
    ):
        super().__init__(hf_tokenizer=hf_tokenizer, input_return_type=input_return_type)

        # Keep private (shallow) copies: without them every instance built with the defaults would alias the
        # single shared `{}` default objects (or the caller's dicts), so a later mutation on one instance
        # would silently leak into the others. `None` is tolerated and treated as "no extra arguments".
        tok_kwargs = dict(tok_kwargs) if tok_kwargs else {}
        text_gen_kwargs = dict(text_gen_kwargs) if text_gen_kwargs else {}

        # `store_attr` is called after the reassignments above so that the copies are what end up on `self`
        store_attr(self=self, names="hf_arch, hf_config, hf_model, ignore_token_id")
        store_attr(self=self, names="tok_kwargs, text_gen_kwargs, is_split_into_words, kwargs")



In [None]:
# export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A `TfmdDL` that wires in Blurr's batch creation and batch transform so it works with Blurr"""

    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent
        batch_creator: BlurrBatchCreator = None,
        # The batch_tfm used to decode Blurr batches (defaults to a `BlurrBatchTransform`)
        batch_tfm: BlurrBatchTransform = None,
        # (optional) A preprocessing function that will be applied to your dataset
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # Keyword arguments to be applied to your `batch_tfm`
        batch_tfm_kwargs: dict = {},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs
    ):
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch` and `after_batch` are always supplied from the arguments below, so any values
        # that arrive through `kwargs` are discarded
        kwargs.pop("create_batch", None)
        kwargs.pop("after_batch", None)

        batch_creator = batch_creator or BlurrBatchCreator(hf_tokenizer=hf_tokenizer)
        batch_tfm = batch_tfm or BlurrBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, **batch_tfm_kwargs.copy())

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_tfm, **kwargs)
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model")

    def new(
        self,
        # A standard PyTorch and fastai dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be "self" if None)
        cls: Type = None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs
    ):
        """Create a new loader of type `cls` configured like this one.

        The method is overridden so that the Hugging Face objects, which `__init__` requires, are passed to
        the new instance (this factory is called, for example, in places like `show_results`).
        """
        dataset = self.dataset if dataset is None else dataset
        cls = type(self) if cls is None else cls

        # start from the settings of the current loader ...
        cur_kwargs = dict(
            dataset=dataset,
            num_workers=self.fake_l.num_workers,
            pin_memory=self.pin_memory,
            timeout=self.timeout,
            bs=self.bs,
            shuffle=self.shuffle,
            drop_last=self.drop_last,
            indexed=self.indexed,
            device=self.device,
        )

        # ... and carry over every entry of `_methods` whose current value is not a bound method
        for name in self._methods:
            attr = getattr(self, name)
            if isinstance(attr, MethodType):
                continue
            cur_kwargs[name] = attr

        # we need to add these arguments back in (these, after_batch, and create_batch will go in as kwargs)
        kwargs.update(hf_arch=self.hf_arch, hf_config=self.hf_config, hf_tokenizer=self.hf_tokenizer, hf_model=self.hf_model)

        return cls(**merge(cur_kwargs, kwargs))


## Utility methods for getting blurr transforms

In [None]:
# export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform to find
    tfm_class: Transform = HF_BeforeBatchTransform,
):
    """
    Returns the first transform in `tfms_list` whose type is `tfm_class` (or a subclass of it), or
    `None` when there is no such transform
    """
    matches = (tfm for tfm in tfms_list if issubclass(type(tfm), tfm_class))
    return next(matches, None)


In [None]:
show_doc(get_blurr_tfm)


<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
# export
def first_blurr_tfm(
    dls: DataLoaders,  # Your fast.ai `DataLoaders`
    before_batch_tfm_class: Transform = HF_BeforeBatchTransform,  # The before_batch transform to look for
    blurr_batch_tfm_class: Transform = BlurrBatchTransform,  # The after_batch (or batch_tfm) to look for
):
    """
    Finds the Blurr transform needed by methods such as `show_batch` and `show_results`, which read the
    Hugging Face tokenizer (and any stored keyword arguments) from it to decode and 'show' your inputs/targets
    """
    # the before_batch tfms are tried first (this will be used if you're using the mid-level DataBlock API);
    # the after_batch tfms are the fallback (this will be used if you're using the low-level Blurr data API)
    before_tfm = get_blurr_tfm(dls.before_batch, tfm_class=before_batch_tfm_class)
    return before_tfm or get_blurr_tfm(dls.after_batch, tfm_class=blurr_batch_tfm_class)


In [None]:
show_doc(first_blurr_tfm)


<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`before_batch_tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*, **`blurr_batch_tfm_class`**:`Transform`=*`BlurrBatchTransform`*)

This convenience method will find the first Blurr transform required for methods such as 
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`before_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The before_batch transform to look for</p>


 - **`blurr_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The after_batch (or batch_tfm) to look for</p>



## Base `show_batch` method

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `HF_BaseInput` typed inputs
    x: HF_BaseInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Displays up to `max_n` decoded inputs next to their target(s) as a DataFrame and returns `ctxs`"""
    # grab our tokenizer
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer

    # if we've included our labels list, we'll use it to look up the value of our target(s)
    trg_labels = tfm.kwargs["labels"] if ("labels" in tfm.kwargs) else None

    res = L()
    n_inp = dataloaders.n_inp

    for idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if idx >= max_n:
            break

        rets = [hf_tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]]
        for item in sample[n_inp:]:
            if not torch.is_tensor(item):
                trg = trg_labels[int(item)] if trg_labels else item
            elif is_listy(item.tolist()):
                # a vector target: with a label vocab, show the names of the positions set to 1
                if trg_labels:
                    trg = [trg_labels[lbl_idx] for lbl_idx, val in enumerate(label.numpy().tolist()) if (val == 1)]
                else:
                    # `.item()` only works for one element tensors, so fall back to the raw list of
                    # values rather than raising when the target has several elements
                    trg = label.item() if label.numel() == 1 else label.numpy().tolist()
            else:
                trg = trg_labels[label.item()] if (trg_labels) else label.item()

            rets.append(trg)
        res.append(tuplify(rets))

    # nothing to show (e.g., `max_n=0` or an empty batch); indexing `res[0]` below would raise
    if len(res) == 0:
        return ctxs

    # every row holds exactly one decoded text followed by its targets, so the number of target columns
    # is the row width minus one (regardless of `n_inp`)
    cols = ["text"] + ["target" if (i == 0) else f"target_{i}" for i in range(len(res[0]) - 1)]
    display_df(pd.DataFrame(res, columns=cols)[:max_n])
    return ctxs


## Sequence classification

The following examples demonstrate several approaches to constructing your `DataBlock` for sequence classification tasks using the mid-level API, as well as an example of how to accomplish the same using the low-level API and standard PyTorch/Hugging Face/fast.ai Datasets and DataLoaders.

### Using the mid-level API

In [None]:
# Load the IMDB train/test splits and flag each row with `is_valid` (False for train, True for test)
raw_datasets = load_dataset("imdb", split=['train', 'test'])
raw_datasets[0] = raw_datasets[0].add_column('is_valid', [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column('is_valid', [True] * len(raw_datasets[1]))

# Keep a small shuffled sample (1,000 train rows + 200 test rows) and combine them into a single DataFrame
# NOTE(review): `shuffle()` is called without a seed, so the sampled rows presumably differ between runs -- confirm
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)
imdb_df.head()

Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


Unnamed: 0,label,text,is_valid
0,1,"While the original 1932 version, with Preston Foster, was good, there's no remake more worthy than this 1959 one, or more impossible to find anywhere, just as I strongly suspect Mickey Rooney to have had something to do with that. Never could a mere performance have ever been so masterfully brilliant, or a script more thought-provoking, as well as an improvement upon the original. Many years after the last of my several viewings of this film, in 1970, I read an article in which Mickey Rooney was recounting a visit he'd made to death row, and which had apparently very drastically eliminated...",False
1,1,"Ironically the most talked-about American film in the 2008 New York Film Festival is 98% in Spanish. The extra-long film's controversy began at the Cannes Festival. There were love-hate notices, and considerable doubts about commercial prospects. As consolation the star, Benicio Del Toro, got the Best Actor award there. I'm talking about Steven Soderbergh's 'Che,' of course. That's the name it's going by in this version, shown in New York as at Cannes in two 2-hour-plus segments without opening title or end credits. 'Che' is certainly appropriate since Ernesto ""Che"" Guevara is in almost ev...",False
2,0,"Bingo is the game, bullshit is the name. Rarely has the screen been smeared with such a blown-up hodgepodge of half-baked conspiracy theories, puritan prudery, and new-age gibberish. The bulk of the story is set at Viciente, a Cristian resort in the Peruvian jungle. Think Tolkien's Rivendell meets Star Trek's Planet Baku, inhabited by dimwitted followers of a not-so-mysterious, but surprisingly narrow-minded cult of love and peace. Thanks to gruesome acting and tacky production design (the rainbow-colored visualization of the mysterious all-healing ""energy"" is particularly hideous), ""The C...",False
3,1,"There are so many reasons as to why I rate the sopranos so highly, one of its biggest triumphs being the cast and character building. Each character unfolds more and more each series. Also each series has an array of different 'small time characters' as well as the main. A good example of a character (who was only in three episodes) who you can feel for is David the compulsive gambler played brilliantly by Robert Patrick. Every little detail builds the perfect TV series. The show revolves round mob boss Tony Soprano (James Gandolfini) who attempts to balance his life of crime with his role...",False
4,0,"I thought watching employment videos on corporate compliance was tedious. This movie went nowhere fast. What could have been a somewhat cheesy half hour twilight zone episode turned into a seemingly endless waste of film on people parking their cars, a picture of some dude's swimming pool (he really needs to answer his phone by the way) a dot matrix printer doing its job, and Heuy and Louey sitting in a yellow lighted control room repeating ""T minus 10 and counting"" as if something exciting is going to happen. It doesn't so don't get your hopes up. The best thing about this movie is to see...",False


In [None]:
# The class names of the `label` feature; these are handed to Blurr below via `before_batch_kwargs`
labels = raw_datasets[0].features['label'].names
labels

['neg', 'pos']

#### Basic use

Step 1: Get your Hugging Face objects.

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

# The Hugging Face model class to build for this task
model_cls = AutoModelForSequenceClassification

# `BLURR.get_hf_objects` gives us the architecture name, config, tokenizer, and model for the checkpoint
pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


Step 2: Create your `DataBlock`

In [None]:
# Passing the label names via `before_batch_kwargs` lets `show_batch` display targets by name (e.g., "pos"/"neg")
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, before_batch_kwargs={"labels": labels}), CategoryBlock)
# Inputs are read from the "text" column and targets from the "label" column
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
# Grab one batch and inspect it: number of elements in the batch tuple, number of inputs,
# shape of the `input_ids` tensor, and number of targets
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 327]), 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Went to see this as Me and my Lady had little else to do on a sunday afternoon I like films that deal with sleazy,loser characters and this is full of em. After a slow start we get some good turns from the cast but it is the actual 'Bellini' that both makes and lets the film down. The 'Bellini' is one of the funniest scenes I have seen in a film for a long while but is too short and could have made this a masterpiece overall 71/2 out of 10",pos
1,"Much like the early horror film The Boogens, the devious unseen killer is quite a letdown when it finally becomes seen. Although Animal House's Stephen Furst obviously had fun in the role as a product of incest, his performance is more comedy than horror.<br /><br />The plot, an extremely tired one, has three sexy women(Bach, Lamm and Lois Young) unable to find a hotel for the evening, so they willingly accept to stay with a seemingly kind museum curator, exceptionally played by the deceased Sy",neg


#### Working with long documents

There are two options when dealing with texts longer than your model can handle.

First, as illustrated above, you can simply provide a `truncation` strategy to ensure they are <= the maximum length your model can handle.

Second, in the case we want to process the entirety of each document regardless of length, we can split text greater than the max length allowed by our model and then treat each of these chunks as separate examples. This approach is accomplished by passing `"return_overflowing_tokens": True` to our tokenizer via `tok_kwargs`.

While the second approach is traditionally performed as part of the data preprocessing, blurr can do this on-the-fly when using its `OverflowDL` DataLoader (which is automatically used by blurr when you pass `"return_overflowing_tokens": True` in the `tok_kwargs` argument of `HF_TextBlock`). Below is an example of how this works.

Step 1: Get your Hugging Face objects.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


Step 2: Create your `DataBlock`

In [None]:
blocks = (
    HF_TextBlock(
        hf_arch,
        hf_config,
        hf_tokenizer,
        hf_model,
        # texts longer than 128 tokens are split into several chunks rather than being cut off
        max_length=128,
        # NOTE(review): `stride` is presumably the number of tokens shared by consecutive chunks -- confirm
        tok_kwargs={"return_overflowing_tokens": True, "stride": 2},
        before_batch_kwargs={"labels": labels},
    ),
    CategoryBlock,
)

dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())

# Build the `DataLoaders` with 4 items per batch
dls = dblock.dataloaders(imdb_df, bs=4)


Step 3: Build your `DataLoaders`

In [None]:
dls.show_batch(dataloaders=dls, max_n=4)


Unnamed: 0,text,target
0,"'d become the professing Christian he now makes it a special point, whenever possible, to emphasize that he is; but, as anybody should be well-aware, this particular category of people tends to be the most vehemently out for blood, when it comes to extracting an eye for an eye. However, I have no particular bone of contention concerning that, per se, just as there's no doubt, scripturally speaking, that not all, and perhaps not even most, shall be spared the same ultimate fate, at the hands of the Lord Himself, as a result of His sacrifice on the cross. However, there is a",pos
1,"is a problem, for me, about the spirit or attitude with which most professing Christians emphasize their enthusiasm for capital punishment; for, contrary to the Lord Himself, who would love to see everybody saved (Ezekiel 18:32) (II Peter 3:9), they seem to go vindictively out of their way to find reasons to condemn!... What most people, on either side of this superlatively ever-burning issue, cannot appear to sufficiently appreciate, is that the Lord is as dynamically and elusively soft in nature as He is hard. The two sides of His nature appear to be",pos
2,"to be so inherently incompatible as to render Him mentally deranged, at least by any strictly human reckoning. Yet, regardless of how harrowingly ungraspable this miraculously dynamic blending of the water and oil in His nature surely is, there can be no doubt that anything short of it, or anything fanatically and characteristically on either one side or the other of this equation, falls inadequately and unacceptably short of the entire judicial truth. Indeed, I've seen the most blood-curdling thirst for the same come out, self-contradictorily enough, on far-too-many",pos
3,"-many occasions, whenever the categorically anti-death penalty advocates are confronted, even in the most rationally well-balanced ways, with the fact that, although the Lord died for everybody, not all are thereby going to be saved. After-all, in order to receive absolution, one must, to repeat the same term, reach out and receive it, that is, repent (Luke 13:3-5). Could anything make more sense?... But, then, what about the Lord's command to forgive, even in the case of one's enemies, of those who despise and persecute you",pos


### Using the low-level API

Step 1: Build your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [None]:
# NOTE(review): only the last expression of a cell is displayed, so the `features` line produces no output here
raw_datasets["train"].features
raw_datasets["train"]


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [None]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` pair(s) in `example` using the notebook-level `hf_tokenizer` with truncation enabled."""
    return hf_tokenizer(example["sentence1"], example["sentence2"], truncation=True)


# Apply the tokenizer to the raw datasets in batches
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Step 2: Define any pre-processing that needs to be done to your datasets (optional)

In [None]:
# export
def preproc_hf_dataset(
    # A standard PyTorch Dataset or fast.ai Datasets
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model
    hf_model: PreTrainedModel,
):
    """Get a Hugging Face dataset ready for use in Blurr and other training libraries.

    A `label` column (if present) is renamed to `labels`, every column that is not an argument of
    `hf_model.forward` is removed, and the dataset's format is set to "torch". `hf_tokenizer` is
    accepted as an argument but is not used here.
    """
    # rename the target column to `labels` when the dataset calls it `label`
    if "label" in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")

    # drop every column that the model's `forward` method does not accept
    fwd_params = inspect.signature(hf_model.forward).parameters
    unused_cols = {col for col in dataset.column_names if col not in fwd_params}
    dataset = dataset.remove_columns(unused_cols)

    dataset.set_format("torch")
    return dataset


Step 3: Build your `DataLoaders`.

Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets. Passing `{'labels': label_names}` as your `batch_tfm_kwargs` will ensure that your label/target names are displayed in methods like `show_batch` and `show_results` (just as it works with the mid-level API).

In [None]:
# The class names of the `label` feature, passed to each dataloader so targets can be shown by name
label_names = raw_datasets["train"].features["label"].names

# Training dataloader: shuffled, 8 items per batch, with `preproc_hf_dataset` supplied as the preprocessing function
trn_dl = BlurrDataLoader(
    tokenized_datasets["train"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_tfm_kwargs={'labels': label_names},
    shuffle=True,
    batch_size=8,
)

# Validation dataloader: 16 items per batch (no `shuffle` argument is passed)
val_dl = BlurrDataLoader(
    tokenized_datasets["validation"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_tfm_kwargs={'labels': label_names},
    batch_size=16,
)

# Wrap both dataloaders in a fast.ai `DataLoaders`
dls = DataLoaders(trn_dl, val_dl)


In [None]:
# Grab one batch and check the shape of its `input_ids` tensor
b = dls.one_batch()
b[0]["input_ids"].shape


torch.Size([8, 62])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=800)


Unnamed: 0,text,target
0,"IBM said it believes that the investigation arose from a separate SEC investigation of a customer of IBM's Retail Store Solutions unit, which markets and sells point-of-sale products. The company added : "" IBM believes that the investigation arises from a separate investigation by the SEC of a customer of IBM's Retail Store Solutions unit.",equivalent
1,"We have already found two trailers, both of which we believe were used for the manufacture of biological weapons. We have already found two trailers that we and the Americans believe were used for chemical and biological weapons. """,equivalent


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face. These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronB

In [None]:
# hide
# Checkpoints used by the tests below to exercise the sequence classification architectures
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-random-bart",
    "hf-internal-testing/tiny-bert",
    "hf-internal-testing/tiny-random-big_bird",
    "hf-internal-testing/tiny-random-bigbird_pegasus",
    "hf-internal-testing/tiny-random-ctrl",
    "camembert-base",
    "hf-internal-testing/tiny-random-canine",
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "hf-internal-testing/tiny-random-deberta-v2",
    "hf-internal-testing/tiny-random-distilbert",
    "hf-internal-testing/tiny-electra",
    "google/fnet-base",
    "hf-internal-testing/tiny-random-flaubert",
    "hf-internal-testing/tiny-random-funnel",
    "hf-internal-testing/tiny-random-gpt2",
    "hf-internal-testing/tiny-random-gptj",
    "hf-internal-testing/tiny-random-gpt_neo",
    "hf-internal-testing/tiny-random-hubert",
    "kssteven/ibert-roberta-base",
    "hf-internal-testing/tiny-random-led",
    "hf-internal-testing/tiny-layoutlm",
    "microsoft/layoutlmv2-base-uncased",
    "hf-internal-testing/tiny-random-longformer",
    "hf-internal-testing/tiny-random-mbart",
    "hf-internal-testing/tiny-random-mpnet",
    "anton-l/megatron-11b",
    "hf-internal-testing/tiny-random-mobilebert",
    "openai-gpt",
    "hf-internal-testing/tiny-random-reformer",
    "google/rembert",
    "hf-internal-testing/tiny-random-roformer",
    "google/reformer-enwik8",
    "hf-internal-testing/tiny-random-roberta",
    "hf-internal-testing/tiny-random-squeezebert",
    "hf-internal-testing/tiny-random-transfo-xl",
    "hf-internal-testing/tiny-random-xlm",
    "hf-internal-testing/tiny-xlm-roberta",
    "hf-internal-testing/tiny-random-xlnet",
]


In [None]:
# hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
# hide
# Rebuild the small IMDB sample used by the tests: flag train rows with `is_valid=False` and test rows with `is_valid=True`
raw_datasets = load_dataset("imdb", split=['train', 'test'])
raw_datasets[0] = raw_datasets[0].add_column('is_valid', [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column('is_valid', [True] * len(raw_datasets[1]))

# 1,000 shuffled train rows + 200 shuffled test rows, combined into a single DataFrame
# NOTE(review): `shuffle()` is called without a seed, so the sampled rows presumably differ between runs -- confirm
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


In [None]:
# hide
from transformers import RobertaTokenizer

model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

# One (architecture, tokenizer class name, checkpoint, "PASSED"/"FAILED", error) tuple is recorded per checkpoint
test_results = []
for model_name in pretrained_model_names:
    # reset the per-model objects so that a failure while loading this checkpoint is not
    # reported with the architecture/tokenizer left over from the previous iteration
    hf_arch, hf_tokenizer = None, None

    print(f"=== {model_name} ===\n")

    # checkpoints with "/ibert" in their name are loaded with `RobertaTokenizer`; all others use the default
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    try:
        hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)

        print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

        # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
        if hf_tokenizer.pad_token is None:
            hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
            hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
            hf_model.resize_token_embeddings(len(hf_tokenizer))

        # pad every input to `seq_sz` so the shape of the batch can be checked exactly
        blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

        # only record a pass once everything above (including `show_batch`) has succeeded, so that a
        # checkpoint never ends up with both a "PASSED" and a "FAILED" entry
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))

    except Exception as err:
        # the tokenizer is still `None` if the failure happened while loading the Hugging Face objects
        tok_name = type(hf_tokenizer).__name__ if (hf_tokenizer is not None) else None
        test_results.append((hf_arch, tok_name, model_name, "FAILED", err))



=== hf-internal-testing/tiny-albert ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"""the last big thing"" is a wonderful satirical film that sardonically whips pop culture to the point of humorous self-desctruction. the characters are so interesting and fun to laugh at/sympathize with. which brings me to an introduction to the characters i liked best...br /br /simon geist is a man in his late 30s/early 40s who creates a pop-culture driven editorial magazine called ""the next big",1
1,"any film with a title as ridiculous as ""the bagman"" should automatically attract the attention of any bad movie lover, but the plot is far different than what one may expect after viewing the dvd cover. the bagman is by no means a good movie. it falls into the category of films that seem to have been (and probably were) filmed on a home video camera. the acting is awful. i haven't heard and seen such wooden acting since troll 2. there are plenty of scenes with nudity and sex",0


=== hf-internal-testing/tiny-random-bart ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"This very good movie crackles with tension. The stakes are, of course, high--will the police and Widmark, the public health doctor, apprehend the criminals who don't know that they are carrying plague before a full-scale epidemic breaks out? But smaller plot elements introduce tension in every scene--between",1
1,"There is no story! The plot is hopeless! A filmed based on a car with a stuck accelerator, no brakes, and a stuck automatic transmission gear lever cannot be good! I would have stopped that car within one minute whether I was in it or in the police car constantly following it. I feel sorry for the actors that had to put up with such a poor scri",0


=== hf-internal-testing/tiny-bert ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i don't like using the word "" awful "" to describe any work of the cinema for which a great deal of time, effort, talent and money is spent in its creation but zefferelli's attempt to adapt charlotte bronte's novel'jane eyre'is a total waste of time. < br / > < br / > the script is lacking in finesse and power, everything explained to the viewer in no uncertain terms, leaving little to the imagination. the lead actors are woefully miscast, clearly hired for their star names, and the musical score drippy and dull. charlotte gainsbourg",0
1,"i'm not a writer or an critic... i'm just a student that has seen this movie few minutes ago.... and i want to thank people that worked on creating this movie! it is not the best or the most.... but it touched my heart... why??? i would like to understand it myself... it is easy and accessible.. it is a movie that makes you feel good after a bad day without any regret about the time wasted on watching it! it is about love and caring, about the life that we have but we miss it sometimes",1


=== hf-internal-testing/tiny-random-big_bird ===

architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"For months preceding the release of this movie you saw it advertised in all sorts of print media, so I patiently waited for its video release to see what all the hype was about. After it was over I had to apo",0
1,"I really did like this show, once upon a time. That is, until I realized all the faults in it. It's so unrealistic. I know it's fiction, but it isn't even the slightest bit believable. Here's why. **Spoil",0


=== hf-internal-testing/tiny-random-bigbird_pegasus ===

architecture:	bigbird_pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"The plot of Corpse Grinders 2 is very much similar to the original Corpse Grinders, what is left that is different from the other film consists of",0
1,"Well I gave this movie a 7. It was better than ""Thirdspace"" but not as good as ""In the Beginning"" as far as the B5 movies go. I really t",1


=== hf-internal-testing/tiny-random-ctrl ===



  angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"This 1919 to 1933 Germany looks hardly like a post WWII Czech capitol. Oh sorry, it is the Czech capitol and it is 2003, how funny.<br /><br />This is one of the most awful history movies in the nearest past. Röhm is a head higher than Adolf and looks so damned good, Göring looks like 40 when he just is 23 and the ""Führer"" always seems to look like 56. And the buildings, folks, even buildings have been young, sometimes. Especially 1919 were a lot of houses in Germany nearly new (the WWI does not reach German cities!). No crumbling plaster! Then the Reichstagsbuilding. There have never been urban canyons around",0
1,"Schlocky '70s horror films...ya gotta love 'em. In contrast to today's boring slasher flicks, these K-tel specials actually do something scary and do not resort to a tired formula.<br /><br />This is a B movie about the making of a B movie...that went horribly wrong. Faith Domergue (This Island Earth) stars as an over-the-hill, B movie queen making a movie about a series of grisly murders that befell a family in their home. Her boyfriend/director, who looks and acts like Gordon Jump with an attitude, is filming on location and on a tight schedule. The Ken doll co-star discovers a book of Tibetian chants",0


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"It is always satisfying when a detective wraps up a case and the criminal is brought to book. In this case the climax gives me even greater pleasure. To see the smug grin wiped off the face of Abigail Mitchell when she realises her victim has left ""deathbed testimony"" which leaves no doubt about her guilt is very satisfying.<br /><br />Please understand: while",1
1,"_Waterdance_ explores a wide variety of aspects of the life of the spinally injured artfully. From the petty torments of faulty fluorescent lights flashing overhead to sexuality, masculinity and depression, the experience of disability is laid open.<br /><br />The diversity of the central characters themselves underscores the complexity of the material examined - Joel, the writer, Raymond, the black man with a murky past, and",1


=== hf-internal-testing/tiny-random-canine ===



Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


architecture:	canine
tokenizer:	CanineTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"This is amazing-looking movie with the whole thing done in just six or seven colors. When it came out over 15 years ago, it st",1
1,"Sorry my fellow Nevada City neighbors, but this one is bad.<br /><br />Brian must have had too much botox because he had very",0


=== YituTech/conv-bert-base ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"' stanley and iris'show the triumph of the human spirit. for stanley, it's the struggle to become literate and realize his potential. for iris, it's to find the courage to love again after becoming a widow. the beauty of the movie is the dance that robert deniro and jane fonda do together, starting and stopping, before each has the skills and courage to completely trust each other and move on. in that sense it very nicely gives us a good view of how life often is, thus being credible. unlike some other reviewers i found the characters each rendered to be consistent for the whole picture.",1
1,"dear friends and family, < br / > < br / > i guess if one teen wants to become biblical with another teen, then that's their eternal damnation - just remember kids, "" birth control "" doesn't mean "" oral sex "", i don't care what the honor student says. on the other hand, even if the senator's aid quotes himself as a "" bit of a romantic guy "", he's still only hitting on a high school girl. if she was my sister, i'd eat this guys kneecaps. < br / > < br / > other than that",0


=== hf-internal-testing/tiny-deberta ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"I can tell you just how bad this movie is. I was in the movie and I haven't seen it yet, but I cringe at the thought of anyone actually paying to see me drunk. Especially considering what we did that year. The thing is that they probably over edited it. Especially the scene where my roommate was snorting coke of the tits of a Mexican prostitute (they probably should have follow",1
1,The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war? Is AI a bad thing?,1


=== hf-internal-testing/tiny-random-deberta-v2 ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some of the background details of this story are based, very, very loosely, on real events of the era in which this was placed. The story combines some of the details of the famous Leopold and Loeb case along with a bit of Aimee Semple McPherson.<br /><br />The story begins with two mothers (Shelley Winters and Debbie Reynolds) being hounded as they leave a courtroom. The crowd seems most intent on doing them bodily harm as their sons were just convicted of a heinous thrill crime. One person in the crowd apparently slashes Winters' hand as they make their way to a waiting car.<br /><br />Soon",1
1,"Friz Freleng's 'Rumours' is an excellent Private Snafu cartoon that warns against spreading panic-inducing rumours during wartime. Produced, as were all the Snafu shorts, to be shown to military audiences as entertaining instructional films, 'Rumours' is extremely imaginative and crams tons of ideas into its very brief lifespan. When Snafu starts a rumour about a bombing, it escalates into an eventual rumour that America has lost the war. This is illustrated brilliantly by way of a long, rubbery piece of baloney and several strange, fictional creatures who come back to haunt Snafu with ever more terrible news about his",1


=== hf-internal-testing/tiny-random-distilbert ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"following the disasterous revolution, this film was pretty much the final nail in the coffin of goldcrest and thus the british film industry. the film",0
1,they changed the title of this atrocity to an unexpected love. the only thing worse is the film itself. the script contains dialogue that would be laugh,0


=== hf-internal-testing/tiny-electra ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"sandwiched in between san francisco and captains courageous two of spencer tracy's greatest parts is this very curious film about war and the effects it has on some people. they gave him a gun stars spencer tracy and franchot tone in the only film they ever made together and gladys george as the woman who loves them both. < br / > < br / > tracy and tone are a couple of world war i draftees, tone is",1
1,"in the trivia section for pet sematary, it mentions that george romero ( director of two stephen king stories, creepshow and the dark half ) was set to direct and then pulled out. one wonders what he would've brought to the film, as the director mary lambert, while not really a bad director, doesn't really bring that much imagination to this adaptation of king's novel, of which he wrote the screenplay. there are of course some very effective, g",1


=== google/fnet-base ===

architecture:	fnet
tokenizer:	FNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"As the jacket proclaims, this film is ""Gorgeously shot and masterfully edited,"" and, yes, it is mesmerizingly beautiful. The timelessness that we perceive in stoic rock and in the unceasing ebb and flow of water frames the ephemeral works from Goldsworthy's hands so that in their very ephemeralness they point to eternity.<br /><br />And so the beauty of his compositions haunt us with just a touch of melancholy woven in--or in the words of Matthew Ar",1
1,"The line is funnier in England, where, away from Vixen!'s native America, the word ""fanny"" has a whole new meaning. Sadly, it's the only laugh you'll get in this terrible sex comedy that is neither sexy nor funny.<br /><br />Oddly unalluring with painted-on eyebrows, Erica Gavin (Acting ability: zero) is a nymphomaniac who lusts after her own brother, but rejects his black friend while making derogatory remarks about watermel",0


=== hf-internal-testing/tiny-random-flaubert ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"I am a HUGE Adam Sandler fan, and one day I was looking at the Cast & Crew selection on one of his DVD' s and saw'Going Overboard'and decided to go out and rent it. So I went out with a few buddies of mine and rented it. We put it on and we were shocked to see an Adam Sandler that didn' t hit puberty yet, he looks as if he was 12 when this movie came out. I couldn' t even watch 30 minutes of this crap, I did",0
1,"The word'classic'is thrown around too loosely nowadays, but this movie well deserves the appelation. The combination of Neil Simon, Walter Matthau ( possibly the world' s best living comic actor ), and the late lamented George Burns make for a comic masterpiece. It is interesting to contemplate what the movie would have been like had not death prevented Jack Benny from playing George Burns'part, as had been planned. As it is, the reunion scene",1


=== hf-internal-testing/tiny-random-funnel ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"what a great gem this is. it's an unusual story that is fun to watch. yes, it has singing, but it is very nicely crafted into the story and is very melodic to",1
1,what can i say about this movie? i have seen it quite a few times since the first time when i was around 6. i have seen the english version and it is done very w,1


=== hf-internal-testing/tiny-random-gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"What's that there in the skies? Is it a plane? Is it Superman?? Errr, no It's a TURTLE!?! See, that's what becomes of the Cold War! Nothing but bad news and other issues! The Americans shoot down a Russian combat plane somewhere over Artic territory and the subsequent explosion defrosts & lit",0
1,"The minute the forward started, I knew we were in for trouble! The premise is laughable at best. The story line was even worse, if that is possible.<br /><br />The acting was stiff and the actors gave off a sense of inexperience. You expect more from the likes of Slater, Reid and Dorff. Lines were delivered as if from a ro",0


=== hf-internal-testing/tiny-random-gptj ===

=== hf-internal-testing/tiny-random-gpt_neo ===



Using pad_token, but it is not set yet.


architecture:	gpt_neo
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Yep.. this is definatly up there with some of the worst of the MSTifyed movies, but I have definately seen worse. Think Gremlins rated R. Well anyway, I met Rick Sloane at some sci-fi convention, that amazingly, he was lecturing at! It was one of those really low budget conventions, where everything goes, an everyone b",0
1,"While this movie won't go down in the annals of great cinema, it is a fun way to spend an hour and a half with the family. The film is finally being released in video where it should have debuted in the first place.<br /><br />The film is about an eclectic group of friends who gather for dinners which they have named, ""The Hungry Bac",1


=== hf-internal-testing/tiny-random-hubert ===

=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"A true anomaly in the French cinema,this despairing work has no equivalent in the contemporary production.One would rather have to look on the side of Louis Malle's ""le feu follet"" (1963)(the fire within) to find something not completely unlike Harel's effort.Wry and cynical,having lost all his illusions,the hero,a computer scientist,has got no more reason to live.Absolutely none.Estranged from the human race,he seems to live his life as some kind of entomologist,studying his colleagues.One of them catches his attention:Tisserand",1
1,"""The Last Big Thing"" is a wonderful satirical film that sardonically whips pop culture to the point of humorous self-desctruction. The characters are so interesting and fun to laugh at/sympathize with. Which brings me to an introduction to the characters I liked best...<br /><br />Simon Geist is a man in his late 30s/early 40s who creates a pop-culture driven editorial magazine called ""The Next Big Thing"". Thing is, this magazine doesnt really exist, and it is only an excuse for Simon to get close to actors by interviewing them, only to bitch",1


=== hf-internal-testing/tiny-random-led ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,It seems that several of the people who have reviewed this movie only watched it in the first place because it was filmed near where they live. How's that for a ringing endorsement? If this movie was filmed near where I lived I wouldn't be mentioning it in my review. It is horrid! Several reviews state that this film is a spoof or tongue-in-cheek horr,0
1,"OK, I have watched the original French version. But I can't imagine this being better with subtitles.<br /><br />All I have to say that this is the most boring movie I have seen in a long time. There are almost no redeeming qualities to this film. That's why I can't understand all the positive reviews. It might be realistic in a s",0


=== hf-internal-testing/tiny-layoutlm ===

architecture:	layoutlm
tokenizer:	LayoutLMTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"what do you get if you cross the matrix with the truman show? < br / > < br / > i'm sure you've all seen the matrix by now. the creators of the matrix say that it is'anime inspired '. just from watching the trailer to this classic, you can see where they took the plot from. < br / > < br / > the film is sort of set in 1980s japan, and it really shows. the costumes, music and words (",1
1,ming the merciless does a little bardwork and a movie most foul!,0


=== microsoft/layoutlmv2-base-uncased ===

architecture:	layoutlmv2
tokenizer:	LayoutLMv2TokenizerFast

Could not do one pass in your dataloader, there is something wrong in it
=== hf-internal-testing/tiny-random-longformer ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Yes, as the other reviewers have already stated, this may not be vintage L&H but it's far from being their worst work as at 20th Century Stupid...I mean Fox. This film certainly has all of the basic ingredients for things to go wrong for the boys. But it's their serious approach and determination that makes them funny. They don't play",1
1,"Oh yes, Sakura Killers is a goofy, horrible ninja movie, make no mistake. But it's also an incredibly enjoyable one. This is largely thanks to the awesome presence of one Chuck Connors, who is billed as starring in the movie but really only shines in a few scenes. I suppose he's supposed to be sort of an Obi W",0


=== hf-internal-testing/tiny-random-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Where to begin, there's so much wrong and horrible about this movie I am not sure where to start. Okay, the two stooges who wrote this crap",0
1,"""Die Sieger"" was highly recommended to be one of the few good action movies made in Germany. I watched it last night and I must admit, t",0


=== hf-internal-testing/tiny-random-mpnet ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"on the surface, this movie would appear to deal with the psychological process called individuation, that is how to become a true self by embracing the",0
1,r balki tries to tell you a story that had been earlier told by ram gopal verma in nishabd in a sensuous way. this time it is mixed with mature humors. < br / >,0


=== anton-l/megatron-11b ===

=== hf-internal-testing/tiny-random-mobilebert ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,""" black angel "" is minor whodunit, with june vincent as a woman trying to save her husband from the electric chair after he is found guilty of killing an ol",0
1,an interesting change from the first one. there was more mystery to this movie then the first. even when it ends your asking yourself what happened who w,0


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the power to dream is a wonderful thing. there's a saying, "" not all dreamers achieve, but all achievers dream. "" by exploring our imagination we shape our own futures. or build empires. perhaps overcome our fears, limitations and obstacles. gain wisdom and benefit mankind. or ( put simply ) just find our way to true love and happiness. freud might express such things in symbols. the language of fantasy. < br / > < br / > tristan ventures out of a rather twee english village called wall. he goes through a break in the wall. a portal. in search of something that will prove his love",1
1,"on an overnight flight from los angeles to miami, lisa reisert ( rachel mcadams ) meets a charming man who turns out to be a hired killer who demands her help killing a businessman or else her own father will die. < br / > < br / > red eye is a terrific thriller that keeps the audience on the edge of their seats. the premise is similar to cellular and phone booth but red eye is better than both of those films. almost everything about red eye is above average including the suspense, the acting and the direction. most of the film does take place on a plane but that doesn't slow down",1


=== hf-internal-testing/tiny-random-reformer ===



Using pad_token, but it is not set yet.


architecture:	reformer
tokenizer:	ReformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"""If I wanted to dribble, I'd call a nurse."" <br /><br />""Haven't you had enough?""...""More than enough.""<br /><br />""You got me a c",1
1,"Being the first feature film with Robert De Niro (although not released for years later), this is worth the watch. De Niro's role isn't h",1


=== google/rembert ===

architecture:	rembert
tokenizer:	RemBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"I actually like the original, and this film has its ups and downs. Here's just a few:<br /><br />Ups: Most of the original voice cast returned.<br /><br />Downs: I didn't like the voice of Timon's Ma. I know she did a voice in The Simpsons, but that show is just plain stupid. <br /><br />Ups: We get to see Simba as a ""teenager.""<br /><br />Downs: They wasted it with a slug-slurping contest between Timon and Simba. <",1
1,"Oliver Gruner is totally unknown to me. My friend showed me this film because he had seen Gruner in, what he called a pretty good sci-fi film, Nemesis. So as we watched this, we found ourselves fastforwarding through the BS drama parts just to get to the unbelievable action sequences. Gruner loves to kick and kick and kick. And kick! haha<br /><br />Gruner character is a graduate student who is forced to stay in a ghetto close to the one that he grew up in. He finds himself watching after the boy who lives with him because he really wants to",0


=== hf-internal-testing/tiny-random-roformer ===

architecture:	roformer
tokenizer:	RoFormerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"poorly - made "" blaxploitation "" crime - drama aimed squarely at the black urban market of the early 1970s. pam grier stars in the title role, that of a nurse who becomes a one - woman vigilante after drug - dealing thugs make coffy's little sister a junkie. violent nonsense plods along doggedly, with canned energy and excitement ; only grier's flaring temp",0
1,"here we go with other slasher movie, good looking people and acting from everyone was really good! < br / > < br / > few kids playing pranks phones calls and there parents are killed by the killer in front of the kids! 20 years after they are still friends, they go to huge house, have fun, drugs and sex ( no nudity ) for least half a hour of the movie! again they start making pranks phone call all over again! and",0


=== google/reformer-enwik8 ===

=== hf-internal-testing/tiny-random-roberta ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"The Twilight Zone has achieved a certain mythology about it--much like Star Trek. That's because there are many devoted lovers of the show that no matter what think every episode was a winner. They are the ones who score each individual show a 10 and cannot objectively evaluate the show. Because of this, a while back I reviewed all the original St",0
1,"I love this movie. It's wacky, funny, violent, surreal, played out in a madman's head, and definitely not your usual comedy. <br /><br />If you don't find the film amusing then I guess it's just not for your tastes, so this is a tough one to write a review for.<br /><br />",1


=== hf-internal-testing/tiny-random-squeezebert ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,storyline : max von sydow's voice - over narration hypnotizes the protagonist ( and audience ) back to 1945 where our protagonist the young american id,1
1,"after seeing jeremy brett as sherlock holmes, no actor should ever display such conceit as to imagine that he could ever come close to mr. brett's portr",1


=== hf-internal-testing/tiny-random-transfo-xl ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"As you probably already know, Jess Franco is one prolific guy. made hundreds upon hundreds of films, many of which are crap. However, he managed to sneak in an occasionally quality work amongst all the assembly line exploitation. """" isn't his best work (thats either ""The Diabolical Dr. Z"" or ""Lesbos""), but it has many of his trademarks that make it a must for anyone interested in diving into his large catalog. He combines the erotic (alternating between showing full-frontal nudity and leaving somethings left to the imagination) and the surreal seamlessly. This is a very dreamlike film, full of great atmosphere.",1
1,"I didn't have many expectation going into the film, but I thought it was fantastic. Pierce Brosnan is outstanding in a very different role. He has dumped the slick suits for a ridiculous look and pays off showing that he is an excellent actor. Pierce and Greg Kinnear play off each other great, and make for one of the better buddy pairings in a long time. The humor is dark, the performances by Brosnan, Kinnear, and Hope Davis are great, mix that with a touching element to the story about friendship, and you have a great film. This is probably one of the better buddy comedies in a long time. This is a",1


=== hf-internal-testing/tiny-random-xlm ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"this movie is very important because suggested me this consideration : sometimes you can wish to be sick... sometimes you can wish to have a syndrome... sometimes, for example, you can wish have goldfield syndrome... that way you 'd not remember this boring movie... and above all you 'd not remember adam "" superfluos "" sandler... sometimes, simply, you can wish... have rented another movie... < br / > < br / > my vote? 3 out of 10. my suggestion? if you are neither a fan of boring romantic comedies or adam sandler (... it's a joke don 't exist adam sandler's",0
1,"this is one of the better classic edgar wallace movies from the german series - it features all basics for a highly enjoyable wallace crime flic movie way back from the 60ies : although his majesty, mr. kinski, is missing you still have young joachim'blacky'fuchsberger, starring once again as the typical clever american'womanizer ', you have young eddi arendt in his best ( and just as well typical ) role ever - the cool, sophisticated british butler - and you have ( not so young anymore ) lowitz as the melancholic yet very'dry'ironic ( and thus",1


=== hf-internal-testing/tiny-xlm-roberta ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"This is yet another bad movie that you should probably avoid watching. The plot could be a lot ""thicker"" than it actually is and would be better made as a blockbuster type movie.br />br />The acting leaves something to be desired, though you can not quite place your finger on what it is.br />br />This is",0
1,"A fine western, following the fate of those who possess the prize winning gun, a Winchester '73. It has a great cast who give superb cliche characterisations with help from the usual effective story telling direction from Mann.",1


=== hf-internal-testing/tiny-random-xlnet ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"The first 30min of the flick was choppy and hard to know just what was going on (unless you read the book - which I had not).<br /><br />If you can stick with the first half, the second half is sweet - predictable, y",0
1,"""The Invisible Mouse"" is a delightful and different Tom & Jerry's cartoon. It features the usual cat/mouse chases and battles, but in a different way this time. Jerry accidentally falls in a bottle of invisible in",1


In [None]:
# hide_input
# Collect the per-model test outcomes into one table and render it.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,hf-internal-testing/tiny-albert,PASSED,
1,bart,BartTokenizerFast,hf-internal-testing/tiny-random-bart,PASSED,
2,bert,BertTokenizerFast,hf-internal-testing/tiny-bert,PASSED,
3,big_bird,BigBirdTokenizerFast,hf-internal-testing/tiny-random-big_bird,PASSED,
4,bigbird_pegasus,PegasusTokenizerFast,hf-internal-testing/tiny-random-bigbird_pegasus,PASSED,
5,ctrl,CTRLTokenizer,hf-internal-testing/tiny-random-ctrl,PASSED,
6,camembert,CamembertTokenizerFast,camembert-base,PASSED,
7,canine,CanineTokenizer,hf-internal-testing/tiny-random-canine,PASSED,
8,convbert,ConvBertTokenizerFast,YituTech/conv-bert-base,PASSED,
9,deberta,DebertaTokenizerFast,hf-internal-testing/tiny-deberta,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental building blocks for all data preprocessing tasks.

In [None]:
# hide
# Final nbdev step: run the notebook-to-script export. The import is kept in
# this cell so that it stays out of the exported module code.
from nbdev.export import notebook2script

# Each processed notebook is reported as a "Converted <name>.ipynb." line.
notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
