In [None]:
# default_exp data.core


In [None]:
# all_slow



In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by Hugging Face transformer implementations.

In [None]:
# export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import DataCollatorWithPadding, logging, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel

from blurr.utils import BLURR

# Only surface error-level messages from the `transformers` logger (imported above as `logging`)
logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from datasets import load_dataset
from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock
from fastai.data.external import untar_data, URLs
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

# Switch off parallelism in the Hugging Face `tokenizers` library through its environment variable
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Record the library versions this documentation was generated with
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")



What we're running with at the time this documentation was generated:
torch: 1.7.1
fastai: 2.5.3
transformers: 4.13.0


In [None]:
# hide
# cuda
# Make CUDA device index 1 the current device for everything that follows.
# NOTE(review): the index is hard-coded; presumably this fails on a host with fewer than two GPUs -- confirm.
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")



Using GPU #1: GeForce GTX 1080 Ti


## Mid-level API: Base tokenization, batch transform, and DataBlock methods

In [None]:
# export
class HF_BaseInput(TensorBase):
    """Tensor type holding the token ids of an input so that fastai's `show` methods can render it as text"""

    def show(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The "context" associated to the current `show_batch/results` call
        ctx=None,
        # Any truncation you want to apply to the decoded tokenized inputs
        trunc_at: int = None,
        # A decoded string of your tokenized inputs (input_ids)
    ) -> str:
        """Decode the token ids (skipping special tokens), keep the first `trunc_at` characters, and pass
        the text to `show_title` under the label "text"
        """
        token_ids = self.cpu().numpy()
        decoded_text = str(hf_tokenizer.decode(token_ids, skip_special_tokens=True))

        return show_title(decoded_text[:trunc_at], ctx=ctx, label="text")



A `HF_BaseInput` object is returned from the `decodes` method of `HF_AfterBatchTransform` as a means to customize `@typedispatch`ed functions like `DataLoaders.show_batch` and `Learner.show_results`. It uses the "input_ids" of a Hugging Face object as the representative tensor for `show` methods.

In [None]:
# export
class HF_BeforeBatchTransform(Transform):
    """Before-batch transform that tokenizes the raw inputs of a list of samples with a Hugging Face tokenizer
    and pairs each resulting dictionary of tensors with the remaining elements (e.g., targets) of its sample.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `HF_BeforeBatchTransform`
        **kwargs
    ):
        # `store_attr` copies the named arguments of this call onto `self` as attributes of the same name
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model")
        store_attr(self=self, names="max_length, padding, truncation, is_split_into_words, tok_kwargs")
        store_attr(self=self, names="kwargs")

    def encodes(self, samples, return_batch_encoding = False):  # A subset of data to put into a mini-batch
        """Tokenize the inputs of `samples` at batch time, so the full dataset never has to be tokenized up front.

        Returns a list with one tuple per tokenized item: a dictionary built from the tokenizer output
        (one entry per key it returned) followed by everything after the first element of the originating
        sample. When `return_batch_encoding` is truthy the raw tokenizer output is returned as well.
        """
        samples = L(samples)

        # a listy first element (when inputs are not pre-split into words) is passed to the tokenizer
        # as a tuple made of its first two entries
        pair_inputs = is_listy(samples[0][0]) and not self.is_split_into_words
        if pair_inputs:
            inps = [(sample[0][0], sample[0][1]) for sample in samples]
        else:
            inps = [sample[0] for sample in samples]

        tok_d = self.hf_tokenizer(
            inps,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
            **self.tok_kwargs
        )

        tok_keys = tok_d.keys()

        def _tokenized_item(item_idx):
            # the slice of every tokenizer output that belongs to a single item
            return {key: tok_d[key][item_idx] for key in tok_keys}

        if "overflow_to_sample_mapping" in tok_keys:
            # one entry per chunk; the mapping gives the index of the sample each chunk was cut from,
            # which is where the chunk's remaining elements (e.g., targets) are taken from
            updated_samples = [
                (_tokenized_item(chunk_idx), *samples[sample_idx][1:])
                for chunk_idx, sample_idx in enumerate(tok_d["overflow_to_sample_mapping"])
            ]
        else:
            updated_samples = [(_tokenized_item(idx), *sample[1:]) for idx, sample in enumerate(samples)]

        return (updated_samples, tok_d) if return_batch_encoding else updated_samples


`HF_BeforeBatchTransform` was inspired by this [article](https://docs.fast.ai/tutorial.transformers.html).

Inputs can come in as a string or a list of tokens, the latter being for tasks like Named Entity Recognition (NER), where you want to predict the label of each token.

**Notes re: on-the-fly batch-time tokenization**: The previous version of the library performed the tokenization/numericalization as a type transform when the raw data was read, and included a couple batch transforms to prepare the data for collation (e.g., to be made into a mini-batch). With this update, everything is done in a single batch transform.  Why?  Part of the inspiration had to do with the mechanics of the Hugging Face tokenizer, in particular how by default it returns a collated mini-batch of data given a list of sequences. And where do we get a list of examples with fastai? In the batch transforms!  So I thought, hey, why not do everything dynamically at batch time?  And with a bit of tweaking, I got everything to work pretty well.  The result is less code, faster mini-batch creation, less RAM utilization and time spent tokenizing (really helps with very large datasets), and more flexibility.

In [None]:
# export
class HF_AfterBatchTransform(Transform):
    """Batch transform whose `decodes` turns a dictionary of Hugging Face inputs into the tensor type
    used by fastai `show` methods
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The return type your decoded inputs should be cast too (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
    ):
        # keeps both arguments as attributes of the same name
        store_attr(self=self, names="hf_tokenizer, input_return_type")

    def decodes(
        self,
        # The encoded samples for your batch. `input_ids` will be pulled out of your dictionary of Hugging Face
        # inputs, cast to `self.input_return_type` and returned for methods such as `show_batch`
        encoded_samples: Type,
    ):
        """Wrap the `input_ids` of a dictionary in `self.input_return_type`; anything else is returned untouched"""
        if not isinstance(encoded_samples, dict):
            return encoded_samples

        input_ids = encoded_samples["input_ids"]
        return self.input_return_type(input_ids, hf_tokenizer=self.hf_tokenizer)


With fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here, `HF_AfterBatchTransform`, that will do the decoding for us.

In [None]:
# export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """Sort key for `SortedDL`: the length of the first element of `example`, measured in tokens.

    Inputs that are already split into words are measured directly; otherwise the first element is
    run through `hf_tokenizer.tokenize` (with `tok_kwargs`) and the number of tokens is returned.
    """
    raw_input = example[0]
    if not is_split_into_words:
        return len(hf_tokenizer.tokenize(raw_input, **tok_kwargs))
    return len(raw_input)


In [None]:
#export
@delegates(TfmdDL)
class OverflowDL(SortedDL):
    """A `SortedDL` that appends every collated batch to a buffer (`self.batch_items`) and serves the
    buffer back in slices of `self.bs` items. The number of buffered items is read from the tensor
    stored under `overflow_map_key` in the first element of the batch.
    """

    def __init__(self, dataset, sort_func=None, res=None, overflow_map_key="overflow_to_sample_mapping", **kwargs):
        super().__init__(dataset, sort_func=sort_func, res=res, **kwargs)
        # key, looked up in the first batch element, of the tensor used to count buffered items
        self.overflow_map_key = overflow_map_key
        # buffer of collated items that have not been served yet; `None` until the first batch is added
        self.batch_items = None

    def create_batches(self, samps):
        """Yield batches of `self.bs` items from the buffer while collated batches are being added to it"""
        if self.dataset is not None:
            self.it = iter(self.dataset)
        res = filter(lambda o: o is not None, map(self.do_item, samps))

        # `do_batch` (overridden below) returns nothing and only fills the buffer, so `b` itself is unused
        for b in map(self.do_batch, self.chunkify(res)):
            while self._n_batch_items() >= self.bs:
                yield self._get_batch()
        # NOTE(review): when the loop ends, fewer than `self.bs` items may still sit in the buffer; they are
        # not yielded by this method and stay in `self.batch_items` -- confirm this is intended.

    def do_batch(self, b):
        """Collate `b` with the parent implementation and append the result to the buffer (returns `None`)"""
        b = super().do_batch(b)
        self._add_batch(b)

    def _add_batch(self, b):
        """Append the elements of the collated batch `b` to the buffer and grow `self.n` accordingly"""
        if not self.batch_items:
            # nothing buffered yet: the batch becomes the buffer
            self.batch_items = b
        else:
            for i in range(len(b)):
                if isinstance(b[i], dict):
                    # dictionaries are concatenated key by key (using the keys already in the buffer)
                    for k in self.batch_items[i].keys():
                        self.batch_items[i][k] = torch.cat([self.batch_items[i][k], b[i][k]])
                else:
                    # assigns through `.data` so the buffered tensor object is updated rather than replaced
                    self.batch_items[i].data = torch.cat([self.batch_items[i], b[i]])

        # update "n" to reflect the additional samples: a value occurring k times in the map adds k - 1
        # NOTE(review): `.numpy()` presumably requires the map to be a CPU tensor at this point -- confirm.
        # NOTE(review): `self.n` is only ever incremented in this class; verify it does not keep growing
        # when the loader is iterated more than once.
        overflow_map = b[0][self.overflow_map_key].numpy()
        self.n += np.sum([i - 1 for i in Counter(overflow_map).values()])

    def _get_batch(self):
        """Remove the first `self.bs` items from every buffered element and return them as a tuple"""
        chunked_batch = []

        for i in range(len(self.batch_items)):
            if isinstance(self.batch_items[i], dict):
                chunked_d = {}
                for k in self.batch_items[i].keys():
                    # take the head of the buffer, keep the tail for the next batch
                    chunked_d[k] = self.batch_items[i][k][: self.bs]
                    self.batch_items[i][k] = self.batch_items[i][k][self.bs :]

                chunked_batch.append(chunked_d)
            else:
                chunked_batch.append(self.batch_items[i][: self.bs])
                self.batch_items[i].data = self.batch_items[i][self.bs :]

        return tuplify(chunked_batch)

    def _n_batch_items(self):
        """Number of buffered items (0 when nothing has been buffered yet)"""
        return len(self.batch_items[0][self.overflow_map_key]) if self.batch_items else 0

    def _one_pass(self):
        """Push the item at index 0 through the batch steps to record `_n_inp` and `_types`"""
        self.do_batch([self.do_item(0)])
        b = self._get_batch()
        if self.device is not None:
            b = to_device(b, self.device)
        its = self.after_batch(b)
        # a single (or non-tuple) result counts as one input; otherwise the last element is taken as the target
        self._n_inp = 1 if not isinstance(its, (list, tuple)) or len(its) == 1 else len(its) - 1
        self._types = explode_types(its)


In [None]:
# export
class HF_TextBlock(TransformBlock):
    """The core `TransformBlock` to prepare your data for training in Blurr with fastai's `DataBlock` API.

    It wires together a before-batch transform (tokenization), an after-batch transform (decoding for
    `show` methods) and a `DataLoader` type, and exposes the Hugging Face objects of the before-batch
    transform as `hf_arch`, `hf_config`, `hf_tokenizer` and `hf_model`.

    Raises `ValueError` when neither all four Hugging Face objects nor a `before_batch_tfm` are supplied.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch: str = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model: PreTrainedModel = None,
        # The before batch transform you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `HF_BeforeBatchTransform` created using the Hugging Face objects defined above)
        before_batch_tfm: HF_BeforeBatchTransform = None,
        # The batch_tfms to apply to the creation of your DataLoaders,
        # (defaults to HF_AfterBatchTransform created using the Hugging Face objects defined above)
        after_batch_tfm: HF_AfterBatchTransform = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast too (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: DataLoader = None,
        # Any keyword arguments you want applied to your before batch tfm
        before_batch_kwargs: dict = {},
        # Any keyword arguments you want applied to your after batch tfm (or referred to in fastai as `batch_tfms`)
        after_batch_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied with generating text
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `HF_TextBlock`
        **kwargs
    ):
        if (not all([hf_arch, hf_config, hf_tokenizer, hf_model])) and before_batch_tfm is None:
            raise ValueError(
                """You must supply the Hugging Face architecture, config, tokenizer, and model
                - or - an instances of HF_BeforeBatchTransform"""
            )

        if before_batch_tfm is None:
            # when overflowing tokens are requested, every chunk is padded to `max_length` so that items
            # coming from different batches share the same shape
            if "return_overflowing_tokens" in tok_kwargs:
                padding = "max_length"

            # the dictionaries are copied so the transform never shares state with the caller's arguments
            before_batch_tfm = HF_BeforeBatchTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **before_batch_kwargs.copy()
            )

        if after_batch_tfm is None:
            after_batch_tfm = HF_AfterBatchTransform(
                hf_tokenizer=before_batch_tfm.hf_tokenizer, input_return_type=input_return_type, **after_batch_kwargs.copy()
            )

        if dl_type is None:
            dl_sort_func = partial(
                blurr_sort_func,
                hf_tokenizer=before_batch_tfm.hf_tokenizer,
                is_split_into_words=before_batch_tfm.is_split_into_words,
                tok_kwargs=before_batch_tfm.tok_kwargs.copy(),
            )

            # `OverflowDL` is a `DataLoader` that knows how to serve batches of items that are created on the fly
            # as a result of asking the tokenizer to return an input in chunks if the length > max_length
            if "return_overflowing_tokens" in before_batch_tfm.tok_kwargs:
                dl_type = partial(OverflowDL, sort_func=dl_sort_func)
            else:
                # the partial has to be assigned: otherwise `dl_type` stays `None` and the documented
                # `SortedDL` default (together with its sort function) is never handed to `TransformBlock`
                dl_type = partial(SortedDL, sort_func=dl_sort_func)

        # set the TransformBlock's Hugging Face objects
        self.hf_arch = before_batch_tfm.hf_arch
        self.hf_config = before_batch_tfm.hf_config
        self.hf_tokenizer = before_batch_tfm.hf_tokenizer
        self.hf_model = before_batch_tfm.hf_model

        # the tokenizing transform runs as `before_batch`, the decoding transform as a regular batch transform
        super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": before_batch_tfm}, batch_tfms=after_batch_tfm)



A basic wrapper that links default transforms for the data block API.

`HF_TextBlock` has been dramatically simplified from its predecessor. It handles setting up your `HF_BeforeBatchTransform` and `HF_AfterBatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). You must either pass in your own instance of a `HF_BeforeBatchTransform` class or the Hugging Face architecture, config, tokenizer, and model via `hf_arch`, `hf_config`, `hf_tokenizer`, and `hf_model` (the other args are optional).

## Low-level API: For working with PyTorch and/or fast.ai Datasets & DataLoaders

Below is a low-level API for working with basic PyTorch Datasets (e.g., a dataset from the Hugging Face datasets library) and DataLoaders. Use the approach detailed below if you already have, or want to use, a plain ol' PyTorch `Dataset` instead of the fast.ai `DataBlock` API.

In [None]:
# export
@dataclass
class BlurrBatchCreator:
    """A callable that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to collate a list of examples into a batch usable by the Blurr library
    """

    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        self.hf_tokenizer = hf_tokenizer
        if not data_collator:
            data_collator = DataCollatorWithPadding(tokenizer=hf_tokenizer)
        self.data_collator = data_collator

    def __call__(self, features):  # A mini-batch (list of examples to run through your model)
        """Collate `features` with `self.data_collator`.

        For dictionary examples a 2-tuple is returned: the collated batch as a plain dict, followed by
        `batch["labels"]` when the first example has a "labels" key, or by a second plain-dict copy of
        the batch when it does not. For any other kind of example the collated batch is returned as is.
        """
        batch = self.data_collator(features)
        first_example = features[0]
        if not isinstance(first_example, dict):
            return batch

        inputs = dict(batch)
        # the conditional only selects the second tuple element; the result is a tuple in both cases
        targets = batch["labels"] if "labels" in first_example else dict(batch)
        return inputs, targets


In [None]:
# export
class BlurrBatchTransform(HF_AfterBatchTransform):
    """An `HF_AfterBatchTransform` that also keeps the Hugging Face objects (architecture, config, model)
    and the tokenization/text generation settings as attributes, so they can be read back from the transform.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str = None,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (forwarded to `HF_AfterBatchTransform`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model
        hf_model: PreTrainedModel = None,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The token ID to ignore when calculating loss/metrics (the default is read from a
        # `CrossEntropyLossFlat` instance once, when this method is defined)
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Any text generation keyword arguments
        text_gen_kwargs: dict = {},
        # The return type your decoded inputs should be cast too (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # Any other keyword arguments; they are stored on the instance as `kwargs`
        **kwargs
    ):
        # the parent stores `hf_tokenizer` and `input_return_type`
        super().__init__(hf_tokenizer=hf_tokenizer, input_return_type=input_return_type)

        # every remaining argument is kept as an attribute of the same name
        store_attr(self=self, names="hf_arch, hf_config, hf_model, tok_kwargs, text_gen_kwargs")
        store_attr(self=self, names="is_split_into_words, ignore_token_id, kwargs")



In [None]:
# export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A `TfmdDL` that makes it easy to create a fast.ai `DataLoader` that works with Blurr.

    Batches are created by a `BlurrBatchCreator` (or equivalent) and decoded by a `BlurrBatchTransform`;
    the Hugging Face objects and `label_names` are kept as attributes of the loader.
    """

    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent
        batch_creator: BlurrBatchCreator = None,
        # The batch_tfm used to decode Blurr batches (default: a `BlurrBatchTransform`)
        batch_tfm: BlurrBatchTransform = None,
        # (optional) A preprocessing function that will be applied to your dataset
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # (optional) list of corresponding labels names for classes; if included then methods like `show_batch` will
        # show the name corresponding to the label index vs. just the integer index.
        label_names: Optional[list] = None,
        # Keyword arguments to be applied to your `batch_tfm`
        batch_tfm_kwargs: dict = {},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs
    ):
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch` and `after_batch` are always built from `batch_creator` and `batch_tfm`;
        # values arriving under those names in `kwargs` are discarded.
        # NOTE(review): `new` below forwards the current `create_batch`/`after_batch` through `kwargs`, so a
        # loader created by `new` gets freshly built defaults here -- confirm that this is intended.
        kwargs.pop("create_batch", None)
        if not batch_creator:
            batch_creator = BlurrBatchCreator(hf_tokenizer=hf_tokenizer)

        kwargs.pop("after_batch", None)
        if not batch_tfm:
            batch_tfm = BlurrBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, **batch_tfm_kwargs.copy())

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_tfm, **kwargs)
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model, label_names")

    def new(
        self,
        # A standard PyTorch and fastai dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be "self" if None)
        cls: Type = None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs
    ):
        """Create a new loader of type `cls` over `dataset` that reuses this loader's settings.

        We have to override the `new` method in order to add back the Hugging Face objects in this factory
        method (called for example in places like `show_results`). `label_names` is carried over as well
        (unless the caller supplies its own) so the new loader decodes targets the same way as this one.
        Apart from those additions to the kwargs dictionary, the code follows the upstream fastai
        implementation of `new`.
        """
        if dataset is None:
            dataset = self.dataset
        if cls is None:
            cls = type(self)

        cur_kwargs = dict(
            dataset=dataset,
            num_workers=self.fake_l.num_workers,
            pin_memory=self.pin_memory,
            timeout=self.timeout,
            bs=self.bs,
            shuffle=self.shuffle,
            drop_last=self.drop_last,
            indexed=self.indexed,
            device=self.device,
        )

        # carry over every customized hook, i.e. whatever is not a plain bound method of this instance
        for n in self._methods:
            o = getattr(self, n)
            if not isinstance(o, MethodType):
                cur_kwargs[n] = o

        # the Hugging Face objects are required arguments of `__init__`, so they always come from `self`
        kwargs["hf_arch"] = self.hf_arch
        kwargs["hf_config"] = self.hf_config
        kwargs["hf_tokenizer"] = self.hf_tokenizer
        kwargs["hf_model"] = self.hf_model
        # without this the new loader would fall back to `label_names=None` and show integer indices
        kwargs.setdefault("label_names", self.label_names)

        return cls(**merge(cur_kwargs, kwargs))


## Utility methods for getting blurr transforms

In [None]:
# export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform to find
    tfm_class: Transform = HF_BeforeBatchTransform,
):
    """Return the first transform in `tfms_list` whose type is `tfm_class` or a subclass of it,
    or `None` when the list holds no such transform
    """
    for candidate in tfms_list:
        if issubclass(type(candidate), tfm_class):
            return candidate
    return None


In [None]:
# Render the API documentation of `get_blurr_tfm` (signature, docstring and parameter comments)
show_doc(get_blurr_tfm)


<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
# export
def first_blurr_tfm(
    dls: DataLoaders,  # Your fast.ai `DataLoaders`
    before_batch_tfm_class: Transform = HF_BeforeBatchTransform,  # The before_batch transform to look for
    blurr_batch_tfm_class: Transform = BlurrBatchTransform,  # The after_batch (or batch_tfm) to look for
):
    """Find the Blurr transform needed by methods such as `show_batch` and `show_results`.

    The `before_batch` transforms are searched first (the mid-level DataBlock API puts its transform
    there); when nothing is found, the `after_batch` transforms are searched (the low-level Blurr data
    API). The result of that second search is returned as is, so it is `None` when neither matches.
    """
    before_batch_match = get_blurr_tfm(dls.before_batch, tfm_class=before_batch_tfm_class)
    return before_batch_match or get_blurr_tfm(dls.after_batch, tfm_class=blurr_batch_tfm_class)


In [None]:
# Render the API documentation of `first_blurr_tfm` (signature, docstring and parameter comments)
show_doc(first_blurr_tfm)


<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`before_batch_tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*, **`blurr_batch_tfm_class`**:`Transform`=*`BlurrBatchTransform`*)

This convenience method will find the first Blurr transform required for methods such as 
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`before_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The before_batch transform to look for</p>


 - **`blurr_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The after_batch (or batch_tfm) to look for</p>



## Base `show_batch` method

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `HF_BaseInput` typed inputs
    x: HF_BaseInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Display up to `max_n` decoded inputs next to their targets as a DataFrame and return `ctxs`.

    Targets are shown by name when the dataloaders define `label_names`, otherwise by index. Nothing
    is displayed when there are no rows to show.
    """
    # grab our tokenizer
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer

    # label names are optional; not every kind of dataloaders object defines the attribute
    trg_labels = getattr(dataloaders, "label_names", None)

    res = L()
    n_inp = dataloaders.n_inp

    for idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if idx >= max_n:
            break

        rets = [hf_tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]]
        for item in sample[n_inp:]:
            if not torch.is_tensor(item):
                trg = item
            elif is_listy(item.tolist()):
                # multi-label target: report the entries that are switched on (== 1)
                if trg_labels:
                    trg = [trg_labels[i] for i, val in enumerate(label.numpy().tolist()) if val == 1]
                elif label.numel() == 1:
                    trg = label.item()
                else:
                    # `.item()` only works on single-element tensors, so without label names the
                    # indices of the active labels are shown instead of raising
                    trg = [i for i, val in enumerate(label.numpy().tolist()) if val == 1]
            else:
                trg = trg_labels[label.item()] if trg_labels else label.item()

            rets.append(trg)
        res.append(tuplify(rets))

    # no rows (e.g. `max_n=0` or an empty batch): there is no first row to derive the columns from
    if not res:
        return ctxs

    cols = ["text"] + ["target" if (i == 0) else f"target_{i}" for i in range(len(res[0]) - n_inp)]
    display_df(pd.DataFrame(res, columns=cols)[:max_n])
    return ctxs


## Sequence classification

The examples below demonstrate how to construct your `DataBlock` for a sequence classification task (e.g., a model that requires a single text input) using the mid-level API, and how to do the same with the low-level API should you wish to work with standard PyTorch or fast.ai Datasets and DataLoaders.

### Using the mid-level API

In [None]:
path = untar_data(URLs.IMDB_SAMPLE)

model_path = Path("models")
imdb_df = pd.read_csv(path / "texts.csv")


In [None]:
imdb_df.head()


Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


#### Basic use

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

model_cls = AutoModelForSequenceClassification

pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


Once you have those elements, you can create your `DataBlock` as simply as shown below.

In [None]:
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"This film is about a family trying to come to terms with the death of the mother/wife by moving to Genova, Italy.<br /><br />The plot of ""Genova"" sounds promising, but unfortunately it is empty and without focus. The film only consists of a collection of scenes depicting the daily life of the family, such as swimming, taking piano lessons or cooking eggs. Most of such scenes are redundant and tiresome, completely failing to engage viewers emotionally. The ending is very disappointing as it is n",negative
1,"Recently, I had opportunity to view a working print in Kansas City (Olathe, KS.) of this title. It is difficult for me, being a lover of the art as I am, to report the following, but, the truth sometimes hurts, and quite frankly after sitting through this tripe (I'm using the slang definition here - worthless statements or writing) for an hour and a half, I feel obligated to share (WARN) any interested parties. Let's begin at the beginning, a good place to start as always. The first 15 minutes",negative


#### Working with long documents

There are two options when dealing with texts longer than your model can handle.

First, as illustrated above, you can simply provide a `truncation` strategy to ensure they are <= the maximum length your model can handle.

Second, in the case we want to process the entirety of each document regardless of length, we can split text greater than the max length allowed by our model and then treat each of these chunks as separate examples. This approach is accomplished by passing `"return_overflowing_tokens": True` to our tokenizer via `tok_kwargs`.

While the second approach is traditionally performed as part of the data preprocessing, blurr can do this on-the-fly when using its `OverflowDL` DataLoader (which is automatically used by blurr when you pass `"return_overflowing_tokens": True` in the `tok_kwargs` argument of `HF_TextBlock`). Below is an example of how this works.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

In [None]:
blocks = (
    HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=128, tok_kwargs={"return_overflowing_tokens": True, "stride": 2}),
    CategoryBlock,
)

dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())

dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
dls.show_batch(dataloaders=dls, max_n=4)

Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"yllic storyline would make the film critic proof. He was right, but it didn't fool me. Raising Victor Vargas is the story about a seventeen-year old boy called, you guessed it, Victor Vargas (Victor Rasuk) who lives his teenage years chasing more skirt than the Rolling Stones could do in all the years they've toured. The movie starts off in `Ugly Fat' Donna's bedroom where Victor is sure to seduce her, but a cry from outside disrupts his plans when his best-friend Harold (Kevin Rivera) comes-a-looking for him. Caught in the",negative
2,"in the attempt by Harold and his sister, Victor Vargas runs off for damage control. Yet even with the embarrassing implication that he's been boffing the homeliest girl in the neighborhood, nothing dissuades young Victor from going off on the hunt for more fresh meat. On a hot, New York City day they make way to the local public swimming pool where Victor's eyes catch a glimpse of the lovely young nymph Judy (Judy Marte), who's not just pretty, but a strong and independent too. The relationship that develops between Victor and Judy becomes the focus of the film. The story also focuses on",negative
3,"focuses on Victor's family that is comprised of his grandmother or abuelita (Altagracia Guzman), his brother Nino (also played by real life brother to Victor, Silvestre Rasuk) and his sister Vicky (Krystal Rodriguez). The action follows Victor between scenes with Judy and scenes with his family. Victor tries to cope with being an oversexed pimp-daddy, his feelings for Judy and his grandmother's conservative Catholic upbringing.<br /><br />The problems that arise from Raising Victor Vargas are a few, but glaring errors. Throughout the film you get to know",negative


### Using the low-level API

Step 1: Grab your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [None]:
raw_datasets["train"].features
raw_datasets["train"]


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [None]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` values of `example` as a text pair using the notebook's
    `hf_tokenizer`, with truncation enabled."""
    first_text, second_text = example["sentence1"], example["sentence2"]
    return hf_tokenizer(first_text, second_text, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Step 2: Define any pre-processing that needs to be done to your datasets (optional)

In [None]:
# export
def preproc_hf_dataset(
    # The dataset to preprocess
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model
    hf_model: PreTrainedModel,
):
    """Preprocess a Hugging Face dataset for use in Blurr and other training libraries: rename a
    "label" column to "labels", drop every column that is not an argument of `hf_model.forward`,
    and set the dataset's format to "torch".

    `dataset` must provide `column_names`, `rename_column`, `remove_columns` and `set_format`.
    `hf_tokenizer` is accepted but not used here.
    """
    # the target column is renamed to "labels" when it is called "label"
    if "label" in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")

    # anything that isn't a parameter of the model's `forward` method gets removed
    fwd_arg_names = set(inspect.signature(hf_model.forward).parameters)
    unused_cols = set(dataset.column_names) - fwd_arg_names
    dataset = dataset.remove_columns(unused_cols)

    # `set_format` is called for its side effect on `dataset`; its return value is not used
    dataset.set_format("torch")
    return dataset


Step 3: Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets.

Setting the `label_names` argument to a list of label names corresponding to each class's index will ensure that methods like `show_batch` and `show_results` print the name of the class rather than just its index.

In [None]:
label_names = raw_datasets["train"].features["label"].names

trn_dl = BlurrDataLoader(
    tokenized_datasets["train"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    label_names=label_names,
    shuffle=True,
    batch_size=8,
)

val_dl = BlurrDataLoader(
    tokenized_datasets["validation"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    label_names=label_names,
    batch_size=16,
)

dls = DataLoaders(trn_dl, val_dl)


In [None]:
b = dls.one_batch()
b[0]["input_ids"].shape



torch.Size([8, 65])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"They include Ask Jeeves Inc., Global Crossing, Aether Systems, Clarent, Copper Mountain Networks and VA Linux, now VA Software. They included Global Crossing, Akamai Technologies, Ask Jeeves, Copper Mountain Networks, Etoys and VA Linux.",equivalent
1,"Fanned by the hot, dry Santa Ana winds and minimal humidity, major fires were raging in at least 10 places, having already burned nearly 80 937 hectares. Those hot, dry Santa Ana winds and minimal humidity created optimal conditions for raging fires in at least 10 places that have already burned nearly 200,000 acres.",equivalent


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronB

In [None]:
# hide
pretrained_model_names = [
    "albert-base-v1",
    "facebook/bart-base",
    "bert-base-uncased",
    "google/bigbird-roberta-base",
    "sshleifer/tiny-ctrl",
    "camembert-base",
    "sarnikowski/convbert-medium-small-da-cased",
    "microsoft/deberta-base",
    "microsoft/deberta-v2-xlarge",
    "distilbert-base-uncased",
    "monologg/electra-small-finetuned-imdb",
    "flaubert/flaubert_small_cased",
    "huggingface/funnel-small-base",
    "gpt2",
    "kssteven/ibert-roberta-base",
    "allenai/led-base-16384",
    "microsoft/layoutlm-base-uncased",
    "allenai/longformer-base-4096",
    "sshleifer/tiny-mbart",
    "microsoft/mpnet-base",
    "google/mobilebert-uncased",
    "openai-gpt",
    #'reformer-enwik8',                  # (see model card; does not work with/require a tokenizer so no bueno here)
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    #'google/tapas-base',                # (requires pip install torch-scatter)
    "transfo-xl-wt103",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
# hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
# hide
path = untar_data(URLs.IMDB_SAMPLE)

model_path = Path("models")
imdb_df = pd.read_csv(path / "texts.csv")



In [None]:
# hide
from transformers import RobertaTokenizer

model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

# one `(architecture, tokenizer class name, model name, "PASSED" | "FAILED", error)` tuple per model
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # the ibert checkpoint is loaded with `RobertaTokenizer` explicitly; no tokenizer class is
    # specified for any other model
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    try:
        # building the `DataLoaders` and pulling a batch is part of what is being tested, so it lives
        # inside the `try`: a failure is recorded for this model instead of aborting the whole run
        blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

        # only record a pass once every check, including `show_batch`, has succeeded; otherwise a
        # `show_batch` failure would leave both a PASSED and a FAILED entry for the same model
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))



=== albert-base-v1 ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i was pleasantly surprised i quite liked this movie. witty writing (some ""inside"" jokes i got, others i didn't - maybe due to actors speaking on top of one another), great acting (notably john cassini), great cameos, interesting and unique directing. i rented it to see jeffrey meek (very disappointed he was in it such a short time, blink and you'll miss him!) but found the movie remarkably entertaining. i'll actually watch it again before i send back to netflix. i think actors and wanna-be actors will thoroughly enjoy this",positive
1,"after mob boss vic moretti (late great anthony franciosa) kills his lady whom has been cheating on him with derek, their new chauffeur/ vietnam vet, and blames it on the poor guy, derek finds himself in jail where he has to contend with a corrupt warden, vic's prisoner brother who runs the jail, and, oh yeah illegal experiments conducted by a shady cia agent (great genre-mainstay and first time director john saxon) to turn various prisoners into super-human invincible zombies. of course things get out of hand and it's up to derek, and the",negative


=== facebook/bart-base ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE DEVIL'S PLAYTHING is my second attempt at a Joseph Sarno production - and although I will say it is far more enjoyable than the painfully dull and unerotic Swedish WILDCATS, it is still a little slow and un-explicit for my taste.<br /><br />This one centers around a group of vampire girls who live in a castle, that want to resurrect their previously murdered ""leader"". In order to do so, the girls have to dance around naked and kiss each other and chant weird stuff - and of course drink some blood, too. When a doctor and her brother",positive
1,"I'm not a big fan of most anime, but Gundam Wing is truly something else. Gundam wing lacks all of that stereotypical melodrama that you might think of when you think of anime, since the number of jokes made over the 17 hours would only be in the double digits, Gundam Wing gets right down to business. <br /><br />Gundam Wing is as much of a political thriller as it is an action series. Large parts focus on the diplomatic dealings of a war, not only the battles. Though battle animation lacks extreme detail in cases where it would just be a pain to animate, individual duels",positive


=== bert-base-uncased ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the views of earth that are claimed in this film to have been faked by nasa have recently been compared with the historical weather data for the time of apollo 11, and show a good match between the cloud patterns in the video sequence and the actual rainfall records on the day. < br / > < br / > this would seem to undermine the entire argument put forward in the film that the "" whole earth "" picture is actually a small part of the planet framed by the spacecraft window. < br / > < br / > i am waiting for bart sibrel to now claim that the historical weather data has been faked by",negative
1,"why, o'why!... did i pick this one up? well... i needed a no - brainer in the summer heat, and the cover looked cool. < br / > < br / > of course i should've known better. this is a really, really bad movie. and it gets embarasing when the makers know it's bad, and try cover it up by adding some sexy / beautiful women, and some sex - scenes to it. well, folks... it does'nt cut it, does it! < br / > < br / > if you",negative


=== google/bigbird-roberta-base ===



normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Pixar has had massive success over the years with the full-length CGI animated movies they have made. ""A Bug's Life"" was the second of a whole bunch of features they have made so far, preceded by the company's feature-length debut, the groundbreaking ""Toy Story"", which was the first ever feature-length CGI movie. I remember when this follow-up was heavily advertised around the time of its release in the late 1990's, but I never actually saw it until November 2006. I watched it twice that month, and over three years later, I have seen it a third time. It has never impressed me",positive
1,"Written by the writer who penned the excellent Murder Rooms series which chronicled ACD's adventures with Doctor Joseph Bell, I was looking forward to this and I wasn't disappointed. It was quite slow moving, with a lot of emphasis on Doyle's frustration at Sherlock Holmes which was very accurate and excellently portrayed. It was an interesting character study and very well shot ( on digital video, unusual for a period piece ). The acting was excellent all round, particularly Tim McInnery and Brian Cox although the actor who portrayed ACD, whose name I cannot remember impressed me no end. An excellent character study which has about",positive


=== sshleifer/tiny-ctrl ===



Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"This film, along with WESTFRONT 1918, are my favorite Pabst-directed films and I enjoyed them more than his much more famous films which starred Louise Brooks (such as PANDORA'S BOX). It's probably because both are very similar to the Neo-Realist films that the Italians perfected in the 1940s and 50s. This style film called for using non-actors (just typical folks) in everyday settings in order to create intensely involving and realistic films.<br /><br />In this case, the film is about French and German coal miners, so appropriately, the people in the roles seem like miners--not actors. The central conflict as the film begins is that there is",positive
1,Ask yourself where she got the gun? Remember what she was taught about the mark's mindset when the con is over? The gun had blanks and it was provided to her from the very beginning.<br /><br />When the patient comes back at the end she was SUPPOSED to see him drive away in the red convertible and lead her to the gang splitting up her 80 thousand.<br /><br />The patient was in on the con from the beginning.<br /><br />Mantegna does not die in the end - the gun had blanks.<br /><br,positive


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Ah, a Kelly/Sinatra sailor-suit musical. So familiar, right? Yes, but this isn't the one you usually hear about. On The Town's that-a-way. But if you stick around, you might learn something. Okay, probably not. Anyway, Anchors Aweigh tells the story of two sailors on a three- or four-day leave. Joe is the ""Sea Wolf"" and Clarence,",positive
1,"actually... that ""video camera"" effect, is just that, it's an effect, a rather good one.. (u don't know much about directing a film do you?) this film is in fact BETTER than the original, it's great fun to watch, made for TV, doesn't need to follow any rules. I find it hard to watch number 1 because of how he kills the first girl, its disturbing. and all the time we are",positive


=== sarnikowski/convbert-medium-small-da-cased ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"... and how they bore you right out of your mind! The Crater Lake Monster is one of the classic BAD films from the 70's made with no actors of any note, an embarrassing script, woeful direction, and a tireless desire to fuse "" horror "" with light comedy. This movie introduces a paleontologist who finds drawings of an aquatic dinosaur underneath Crater Lake... a meteor falls from the sky, and",negative
1,Ko to tamo peva is the best comedy of all times. Believe me i saw a lot of movies and comedies but tell me which one make you smile every time you watching it. But truth is that the humour in this comedy is special. It is caratherisic for serbia. And all former republic of yugoslavia know it very well!!! So i think the rest of audience ( for example : In Europe ) can't enjoy it so much. Be,positive


=== microsoft/deberta-base ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"A wonder. One of the best musicals ever. The three Busby Berkely numbers that end the movie are spectacular, but what makes this film so wonderful is the incredible non-stop patter and the natural acting of Cagney and Blondell. (Keeler is also lovely, even though she may not have been a great actress). There's a freshness in the movie that you don't see in flicks today, much less in the usually stilted 30s films, even though the plot, involving the setting up of movies prologues, is quite dated.",positive
1,"This is a really interesting movie. It is an action movie with comedy mixed in. Foxx teams up with comedian Epps in this movie to give it a comedic spin. It will keep you wondering whats going to happen to Foxx next. It was a well shot movie, the director used the right colors in this movie(dark blue colors) to give it the right kind of feel. Kimberly Elise also starred in this movie and it is always a pleasure to see her on the big screen. She plays her role well. Even Jamie Kennedy is in this movie. It's worth seeing it you haven't seen it. It",positive


=== microsoft/deberta-v2-xlarge ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Carnosaur 3 is bad... awfully bad. Bad to the point where it is funny. How matter how much I try to convince myself, I just can't believe anyone in this world could find this entertaining for serious reasons. I mean, come on, even the cover is bad! OK, the special effects are absolutely ridiculous. Those ""Carnosaurs"" are really ridiculous. A scientist tells the soldiers that they move incredibly fast, yet when you see them run, they run at the speed of... an actor in a rubber suit trying to run as much as he can. And the explosions are funny(there is",negative
1,"This was the first Ewan McGregor movie I ever saw outside of Star Wars. Since then I have become a very big Ewan McGregor fan but I still can't bring myself to forgive this movie's existence.<br /><br />My sister has always been a huge Jane Austen fan and because of that, I have been subjected to various of the classics, Emma being one of them. I've always considered them irritating, stupid and boring. However, after watching this terrible rendition, I was forced to admit that the original Emma was delightful and charming. Ewan McGregor scarcely serves a purpose in this film after they hacked and mutilated",negative


=== distilbert-base-uncased ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i remember watching this on prime time when i was about 7 years old. i was a huge comic book reader at the time, and anything relating to superheroes was anticipated heavily. the end result, however, was underwhelming. < br / > < br / > i was aware of the "" emma peel "" diana prince stories, as they had only recently come to an end and diana was returned to her amazonian form. however, there was so little action that i was bored throughout most of the movie. the final costume was an interesting idea, but looked more like a cheerleader than a superhero. < br",negative
1,"having read the other comments on this film, i would like to share my own view that this is one tough movie to see unless you are a total brooksophile. i am not. < br / > < br / > when looked at by a purely objective observer, the film is an unbalanced narrative that presents us with more undistilled neuroses than are capable of being absorbed in one sitting. it is quite difficult to watch. the brooks character ( robert cole ) is so unsympathetic and unpleasant that it is hard to relate to him - - - let alone root for him as",negative


=== monologg/electra-small-finetuned-imdb ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the case of the scorpion's tail is a highly stylish giallo directed by sergio martino, who appears to be a giallo master second only to dario argento. < br / > < br / > ernesto gastaldi wrote this fabulous who - dunnit, quite complex but ultimately very satisfying and entertaining murder mystery. it also makes sense in the end, a big plus,'cause that's not always the case for these giallo's, as they tend to stretch credibility with their endless red - herrings and ultimate solutions. here, the less you know about the plot, the better.",positive
1,"the interplay between the characters is a moral disaster. you end up disliking most of the characters and you don't particularly like any of them. < br / > < br / > even the two main characters played by david and gwen are so badly written that you really don't care one bit about them. the movie has no plot, no direction and no purpose. the single redeeming quality of the movie was to treat it as a glimpse into the messed up lives of a few losers - and that's hardly stimulating even as an afternoon waste.",negative


=== flaubert/flaubert_small_cased ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas : A Review < br / > < br / > You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It' s warm and gooey, but you' re not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn' t quite feel right. Victor Vargas suffers",negative
1,""" Look, I know this may suck right now, but pain is temporary, film is forever. Whatever you do right now is burned into celluloid for all time and for thousands of years to come "". Robert De Niro < br / > < br / > This was initially a film for Steven Spielberg, the director hiring several screenwriters to adjust the screenplay so that it more suited his themes. And so we have a dysfunctional family that is threatened by a",negative


=== huggingface/funnel-small-base ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"users who have rated this movie so highly simply can't have seen enough good films to compare it with. have they all been brainwashed?? i have rarely felt so disappointed by a film and some of that must be attributable to the ridiculous hype surrounding this movie. < br / > < br / > from the first, bu is just a chase film. we pick it up at the end of one chase and go straight into another. and another. and another. and another. do you see a pattern emerging? there is virtually no time'wasted'on plot, character development, or boring old",negative
1,"really good horror flick featuring to of the greatest, boris karloff and bela lugosi. dr. janos rukh ( karloff ) is on an expedition in africa trying to find an ancient meteorite. after finding it, rukh is poisoned by the its radiation. all he touches dies and the dark side of rukh makes him become an egotistic murderer. his friend, dr. felix benet ( lugosi ) finds a limited remedy to the problem and at the same time realizes the radiation could be used for the good of mankind by curing diseases. the two fiends will battle over the",positive


=== gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"The real story (took place in Kansas in 1959) of a murder (Perry and Dick, two ex-convicts who broke into a remote house on a rainy night to steal and kill everyone they met). Richard Brooks directed the chilling and disturbing Capote's book about the reasons that drove these kids to the crime (Are they Natural Born Killers?). The crime scenes are very brutal and haunting because of the lack of senses and reasons for what we witnessed. Stunning black & white cinematography from Conrand Hall, excellent country - road music score from Quincy Jones, amazing performances in two principal roles from Robert Blake and Scott",positive
1,Dig! I would say to anyone even if you don't like Metallica to see'some kind of monster' it is a spinal tap type documentary about one of the biggest bands in the world acting like mental kids during a breakdown of sorts. It's fun and fascinating. Along the same lines comes dig! A film about 'the Dandy Warhol's' and 'the Brian Jonestown massacre' two Portland bands who start off a kind of music scene in there home town only for one of the bands to become huge and one to fall by the wayside into the musical history books. Right from the start the two bands pull,positive


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"I should explain why i gave this...""piece of art"" 1 star rating out of possible 10. Simply because it's hard or next to impossible to rate it unbiased. probably it would have been the same if i had given it 10/10 - explanations anyway would have followed.<br /><br />I am not fond of these pointless gore movies like HOSTEL or so - i think that's disgusting and pretty terrible (in all the possible contextual meanings), but as i found out after watching this movie - there is a genre called ""historical drama"" - and probably it would have been the case of 10/10",negative
1,"This horrendously bad piece of trash manages to be racist, sexist and homophobic all at once, while pretending to be terribly chic and sophisticated. Atrocious performances, a cliche ridden screenplay, and boring direction make this movie one to steer clear of. Two scenes were especially offensive - the one in which Schaech scrubs his tongue after being kissed by another man (could it really have been that gross), and the scene where Eastwood is kissed by Schaech's best friend, who is pretending to be Russian. After he leaves the room she exclaims ""f**king foreigners""! So much for her being a cultured artist",negative


=== allenai/led-base-16384 ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"In Hazzard County, Georgia, cousins Bo and Luke Duke (Scott, Knoxville) and their cousin Daisy Duke (Jessica Simpson) run moonshine made by their Uncle Jesse (Willie Nelson) while avoiding the local authority, Boss Hog (Burt Reynolds). Their problems with the Boss are only beginning as they learn he's been plotting to strip mine the town for valuable ores found below it.<br /><br />I have never seen the TV show and after watching the movie, I'm not going to start any time soon. I like stupid comedies but this one didn't offer many laughs. It",negative
1,A very realistic portrait of a broken family and the effect it has on the kid caught in between. As a child of divorced parents I was totally relating to events in the film. Also - a really cool zombie twist which I thought was VERY ORIGINAL. I'm tired of the same old stuff in movies. A very realistic portrait of a broken family and the effect it has on the kid caught in between. As a child of divorced parents I was totally relating to events in the film. Also - a really cool zombie twist which I thought was VERY ORIGINAL. I'm tired of the same old stuff in movies. A very,positive


=== microsoft/layoutlm-base-uncased ===

architecture:	layoutlm
tokenizer:	LayoutLMTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"in theory, films should be a form of entertainment. while this excludes documentaries and other experimental forms of film - making ; most movies, specially genre films, must not only tell it's story or message, they must entertain their target audience in some way. all this just to say that in my opinion a bad movie is not a movie with low production values or low - budget, a bad movie is one that is boring. < br / > < br / > "" hellborn "" or "" asylum of the damned "" as is known in the u. s., is a bad movie simply because it is just not",negative
1,"the interplay between the characters is a moral disaster. you end up disliking most of the characters and you don't particularly like any of them. < br / > < br / > even the two main characters played by david and gwen are so badly written that you really don't care one bit about them. the movie has no plot, no direction and no purpose. the single redeeming quality of the movie was to treat it as a glimpse into the messed up lives of a few losers - and that's hardly stimulating even as an afternoon waste.",negative


=== allenai/longformer-base-4096 ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Poor Will would be rolling over in his grave if he could this this horiible German-TV adaptaion of his classic play. It's obvious that very little money was spent on it. A stage riser, a catwalk and some randomly placed columns pass off as a set. The movie was ineptly dubbed into English, with the English voice actors occasionally mumbling their lines. The whole production had an incredibly dark and dreary feel to it. And just where was Fonterbras in this movie anyway? MST3K gave this sorry production the treatment it justly deserved.<br /><",negative
1,"Peeew this stinks! As everyone knows it's based upon some Geico insurance commercials; what no one knows is WHY?! Those commercials were amusing on first viewing at best; hardly fodder for a series. (The talking Geico gecko -- that's another story. Now that would make for an intriguing series!) And why on earth did ABC -- as reported in the press -- actually agree to buy the cavemen character rights from Geico for this? After all, the idea of cavemen struggling in the modern world is hardly unique to TV; Phil Hartman had a recurring Saturday Night Live role as The Unf",negative


=== sshleifer/tiny-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Jungle Fever is too highly stylized, stereotyped, and comes across as essentially dishonest. Wesley Snipes was wrong for the lead and there was no chemistry between him and Annabella Sciorra. Even though there's plenty of talent in this movie, it's mostly wasted because the parts are reduced to little more than decorative cameos. Also, instead of simply showing racism for the ugly and stupid thing it is, Spike Lee chooses to wave it around like a flag in a most whining and irritating manner. I made",negative
1,"I should have known I was in trouble with Casper Van Diem as the lead character. Words cannot describe, nor do they do justice to just how terrible this movie was. But please allow me to try to describe it: Horrible acting, terrible dialog, corny situations and through it all you get the feeling that you are being force-fed the beliefs and propeganda from the Trinity Broadcasting Network. Its a weak attempt at trying to show Hollywood that a movie can be entertaining and have a deep, religious message attached to it. They failed miserably. It was clearly the",negative


=== microsoft/mpnet-base ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"as a fan of science - fiction movies, i have been aware of the matrix since its release in 1999. from the little bit i would allow people to tell me about it, i assumed it was highly original and sophisticated. i am also a devotee of alice in wonderland. i could never quite figure out how i missed the matrix when it was released. with the imminent release of the matrix reloaded, it was time to buy the dvd and watch it. < br / > < br / > the disappointment was too great. the premise of the matrix ( the controlling device as opposed to the movie ) was clever. the",negative
1,"road to perdition can be summed up by thomas newman's score. it's haunting and beautiful but you're aware that this music is similar to newman's other work and while listening to the soundtrack you're reminded of scent of a woman, meeting joe black and the shawshank redemption you're reminded of other films as the story unfolds on screen. as the sullivans drive round america trying to escape from a psychotic hit man you think of the getaway, irish gangsters is miller's crossing whilst the subtext of guilt and redemption can be summed up by coppola",positive


=== google/mobilebert-uncased ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i found myself very caught up in this movie, at least at the beginning, and any credit i give to this movie, is lacey chabert, she was fantastic!! but thats where it ends. i seem to be very good at figuring out who the killer is, and i like it when a movie is able to completely baffel me, but i felt out and out lied to, they whole time they lead you in one direction and then suddenly they decided to go in a completely different direction at the end, they gave no hit to it at all, thats not misleading that very bad writing and planning,",negative
1,i thoroughly enjoyed this film for its humor and pathos. i especially like the way the characters welcomed gina's various suitors. with friends ( and family ) like these anyone would feel nurtured and loved. i found the writing witty and natural and the actors made the material come alive.,positive


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,i have seen this movie and the other one. trinity is my name and i find that this one is worse then the first one. i have no idea why they even made another movie it was stupid and pointless sorry to say that i have all of them. i have sat through them number of times and it still drives me to turn it off 5 minutes into the movie. i like terence hill movies and i like bud spencer but this movie just drove me up the wall. if it had a different story line or at least more of a plot and more comedy it might have been funner and worth the 5 dollars i spent buying all,negative
1,"please, be warned : this movie, though a pretty bad storyline, was one of the most gruesome movies i have seen... ever. just remember that before you settle on your sofa to enjoy the movie. < br / > < br / > so, it officially begins with a party. just your average party but there's some guy there. he's pretty into kate... if you know what i'm saying. memorise his face ; it'll help later. < br / > < br / > so anyway kate goes of to find george clooney ( didn't i say the plot",negative


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Another turgid action/adventure flick from the Quinn Martin Productions factory. Roy Thinnes plays undercover agent Diamond Head (Mr. Head, to you), working for his G-Man handler ""Aunt Mary"", looking for ""Tree"", who's on a mission to...well, just watch the movie. <br /><br />This one deserved and got the full MST3K sendup. As the boys and various reviewers have pointed out, the movie ""Fargo"" had more Hawaiian locations than this film. Apparently shot on a puny budget, this movie highlights Hawaii's broken-down dive shops,",negative
1,"<br /><br />I have seen this movie many times. At least a Dozen. But unfortunatly not recently. However, Etched in my memory never to leave me is a scene in which Mickey Rooney, -""Killer Mears"" knows that he is to be executed and it's getting close to the moment of truth, He dances, and cries, and laughs, he vacillates from hesteria to euphoria and runs the gambit of ever emotion. Never have I seen such a brilliant performance by any actor living or dead, past or present. It was then I know for",positive


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"have you ever sat watching a movie when 20 or 30 minutes have gone by and suddenly you realize that you have actually seen the movie before? that happened to me with "" the young graduates "". the cover of the video box, if you can find the video, is extremely deceiving. i'd swear that the two women on the cover aren't even in the film. < br / > < br / > anyway, i was either born a decade too late to appreciate the finer points of this film or... it is simply pointless junk. i'm heavily leaning toward the latter but i guess some",negative
1,"a retired diplomat, played nicely by michael york, goes to russia to get revenge on the russian gangster that murdered the diplomat's policeman son. there the diplomat meets an exceptionally strong and decent russian cop who helps him bring the russian gangster to justice. < br / > < br / > i remembered the old action flicks of the 1980s that always portray the russians as evil bad guys out to undermine the righteous u. s. government. it's interesting to see this time the russian guy as a hero. < br / > < br / > not a great flick, it's really typically a "" b "" action",negative


=== transfo-xl-wt103 ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"In this movie, supporters (either and not-) just lie about a dramatic situation in our country. < br / > < br / > They did not say that the conflict started because of announcement firing a lot of PDVSA best workers just for political issues. < br / > < br / > They did not say anything about more than 96 TV interruptions transmitted by during only 3 days in """" (a kind of confiscation o private TV signals). Each one with about 20 minutes of duration. < br / > < br / > They did not tell us anything about The announcement made by General",negative
1,"I am terribly sorry, I know that still is called one of the greatest directors in post-war Germany and that most of his films are considered ""master-pieces,"" but when I see ""Lili"" today, in 2004, I wonder what everyone is up and away about this movie! The acting is simply terrible - Hanna is all the smiling like an idiot! -, the between Nazi-glamour and battlefields are ridiculous, the whole film looks as if it was made within two days in an attic. Probably it was exactly that way and many people seem to take this for ""real art,"" but for me",negative


=== xlm-mlm-en-2048 ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,so your bairns are away on a sleep-over? the wife is visiting the mother in law? you though are at home. it's a dark and stormy night and there is no football on the telly and the dishwasher needs stacking? so now what are you going to do? < br / > < br / > i will tell you! < br / > < br / > go make an old fashioned cocoa ( frys is best! ) get hold of some ginger nuts and sit down in front of the dvd. now go select and play arthur askeys world war two thriller / horror the,positive
1,"despite its stereotypes, virtually'no-name'cast and an obviously low budget i thought this film was alright ; much better than i expected it to be. i was skeptical at first - the idea of a computer virus that can also infect people seemed a little ludicrous to me. but in the end, i thought the film handled the concept well ( even if some scenes were a little cliched ). < br / > < br / > the cast was quite good, and the two leads seemed to take their roles very seriously. i couldn 't help thinking, though, that janine turner is a",negative


=== xlm-roberta-base ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"This is another one of those 'humans vs insects/eco-horror' features; a theme that was popular in the late 70's. Only you can't really call it horror. There's zero suspense and no gruesome events. In other words: this movie is pretty lame. It's not that it's really bad or something; it's just very boring. A construction site near a hotel uncovers a big nest of ants. Later on we learn that, probably due to different sorts of pesticides used in the past, their bi",negative
1,"Bill Maher's Religulous is not an attack on organized religion. It's an attack on Christianity and Islam. Apart from ridiculing a bunch of Rabbis inventing warped machines to get around Sabbath regulations, he really doesn't attack Judaism and seems enraged when a Rabbi actually challenges the existence of the State of Israel. If Bill Maher followed his hypothesis to its logical conclusion, he would realize that the very creation of Israel in the Palestinian Territories is based on the so called 'holy books' of organized religion. This is",negative


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Carnosaur 3 is bad... awfully bad. Bad to the point where it is funny. How matter how much I try to convince myself, I just can't believe anyone in this world could find this entertaining for serious reasons. I mean, come on, even the cover is bad! OK, the special effects are absolutely ridiculous. Those ""Carnosaurs"" are really ridiculous. A scientist tells the soldiers that they move incredibly fast, yet when you see them run, they run at the speed of... an actor in a rubber suit trying to run as much as he can. And",negative
1,"The fact that most of the budget for this presumably went on the heavy-duty cast list shouldn't have mattered if it had been staged with flair and imagination and some sympathy for the original's satirical intent. Instead we get risibly bad song and dance sequences featuring picturesque beggars and whores, and the final alienation is accomplished by pulling back to reveal the action has taken place on a music-hall stage, appropriately enough for a production that's more Lionel 'Oliver' Blair than Brecht. The acting talent is shamefully misused: Migenes and Walters are good",negative


In [None]:
# hide_input
# Gather the per-model test outcomes into one table: one row per tested
# checkpoint, with its architecture, tokenizer class, model name, the test
# result, and any captured error text.
# NOTE(review): `test_results` is presumably filled in by the model test loop
# earlier in this notebook -- confirm it still yields 5-field records.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,albert-base-v1,PASSED,
1,bart,BartTokenizerFast,facebook/bart-base,PASSED,
2,bert,BertTokenizerFast,bert-base-uncased,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,ctrl,CTRLTokenizer,sshleifer/tiny-ctrl,PASSED,
5,camembert,CamembertTokenizerFast,camembert-base,PASSED,
6,convbert,ConvBertTokenizerFast,sarnikowski/convbert-medium-small-da-cased,PASSED,
7,deberta,DebertaTokenizerFast,microsoft/deberta-base,PASSED,
8,deberta_v2,DebertaV2Tokenizer,microsoft/deberta-v2-xlarge,PASSED,
9,distilbert,DistilBertTokenizerFast,distilbert-base-uncased,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental bits for all data preprocessing tasks.

In [None]:
# hide
# Run nbdev's notebook-to-script export. Called with no arguments it walks the
# project's notebooks, as the "Converted <name>.ipynb." lines in this cell's
# output show. Kept as the last cell so the export runs after everything above.
from nbdev.export import notebook2script

notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
