In [None]:
# default_exp data.core


In [None]:
# all_slow



In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by Hugging Face transformer implementations.

In [None]:
# export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import DataCollatorWithPadding, logging, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel

from blurr.utils import BLURR

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from datasets import load_dataset
from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock
from fastai.data.external import untar_data, URLs
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")



What we're running with at the time this documentation was generated:
torch: 1.7.1
fastai: 2.5.3
transformers: 4.13.0


In [None]:
# hide
# cuda
# Select the CUDA device used for the rest of the notebook and report which one is active.
# NOTE(review): the device index is hard-coded to 1, so this presumably requires a machine with at
# least two GPUs — confirm/adjust before running elsewhere.
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")



Using GPU #1: GeForce GTX 1080 Ti


## Mid-level API: Base tokenization, batch transform, and DataBlock methods

In [None]:
# export
class HF_BaseInput(TensorBase):
    """The base representation of your inputs; used by the various fastai `show` methods"""

    def show(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The "context" associated to the current `show_batch/results` call
        ctx=None,
        # Any truncation you want to apply to the decoded tokenized inputs
        trunc_at: int = None,
        # A decoded string of your tokenized inputs (input_ids)
    ) -> str:
        """Decode this tensor of token ids with `hf_tokenizer` (special tokens are skipped), keep at most
        `trunc_at` characters of the result, and pass that text to `show_title` under the label "text".
        """
        # the tokenizer is handed a numpy array of ids that lives on the CPU
        token_ids = self.cpu().numpy()
        full_text = str(hf_tokenizer.decode(token_ids, skip_special_tokens=True))
        shown_text = full_text[:trunc_at]

        return show_title(shown_text, ctx=ctx, label="text")



An `HF_BaseInput` object is returned from the `decodes` method of `HF_AfterBatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`. It uses the "input_ids" of a Hugging Face object as the representative tensor for `show` methods.

In [None]:
# export
class HF_BeforeBatchTransform(Transform):
    """Assembles a mini-batch of inputs and targets by tokenizing the raw inputs at batch time; each
    sample's raw input is replaced by the dictionary of tensors the tokenizer produced for it.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `HF_BeforeBatchTransform`
        **kwargs
    ):
        # every constructor argument (including the extra `kwargs` dict) is kept as an attribute of the same name
        store_attr(
            self=self,
            names=(
                "hf_arch, hf_config, hf_tokenizer, hf_model, "
                "max_length, padding, truncation, is_split_into_words, tok_kwargs, "
                "kwargs"
            ),
        )

    def encodes(self, samples, return_batch_encoding = False):  # A subset of data to put into a mini-batch
        """Tokenize the raw inputs of `samples` on the fly, at batch time, so the full dataset never has to be
        pre-tokenized.

        Returns a list with one tuple per tokenized row: the dictionary of tokenizer outputs for that row followed
        by the remaining (non-input) elements of the originating sample. When `return_batch_encoding` is truthy the
        tokenizer's batch output is returned as well, as `(updated_samples, tok_d)`.
        """
        samples = L(samples)

        # a list-like raw input (when not pre-split into words) is unpacked into (first, second) pairs
        has_paired_inputs = is_listy(samples[0][0]) and not self.is_split_into_words
        if has_paired_inputs:
            inps = list(zip(samples.itemgot(0, 0), samples.itemgot(0, 1)))
        else:
            inps = samples.itemgot(0).items

        tok_d = self.hf_tokenizer(
            inps,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
            **self.tok_kwargs
        )
        d_keys = tok_d.keys()

        def _features_at(row):
            # the tokenizer outputs (e.g. input_ids, attention_mask, etc...) belonging to a single row
            return {k: tok_d[k][row] for k in d_keys}

        if "overflow_to_sample_mapping" in d_keys:
            # one entry per chunk: every chunk is paired with the targets of the sample it came from
            updated_samples = [
                (_features_at(row), *samples[seq_idx][1:])
                for row, seq_idx in enumerate(tok_d["overflow_to_sample_mapping"])
            ]
        else:
            updated_samples = [(_features_at(row), *sample[1:]) for row, sample in enumerate(samples)]

        return (updated_samples, tok_d) if return_batch_encoding else updated_samples


`HF_BeforeBatchTransform` was inspired by this [article](https://docs.fast.ai/tutorial.transformers.html).

Inputs can come in as a string or a list of tokens, the latter being for tasks like Named Entity Recognition (NER), where you want to predict the label of each token.

**Notes re: on-the-fly batch-time tokenization**: The previous version of the library performed the tokenization/numericalization as a type transform when the raw data was read, and included a couple batch transforms to prepare the data for collation (e.g., to be made into a mini-batch). With this update, everything is done in a single batch transform.  Why?  Part of the inspiration had to do with the mechanics of the Hugging Face tokenizer, in particular how by default it returns a collated mini-batch of data given a list of sequences. And where do we get a list of examples with fastai? In the batch transforms!  So I thought, hey, why not do everything dynamically at batch time?  And with a bit of tweaking, I got everything to work pretty well.  The result is less code, faster mini-batch creation, less RAM utilization and time spent tokenizing (really helps with very large datasets), and more flexibility.

In [None]:
# export
class HF_AfterBatchTransform(Transform):
    """A batch transform whose `decodes` casts Hugging Face input dictionaries into a type the fastai
    `show` methods know how to display
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
    ):
        store_attr(self=self, names="hf_tokenizer, input_return_type")

    def decodes(
        self,
        # The encoded samples for your batch. `input_ids` will be pulled out of your dictionary of Hugging Face
        # inputs, cast to `self.input_return_type` and returned for methods such as `show_batch`
        encoded_samples: Type,
    ):
        """Wrap the "input_ids" of a dictionary input in `self.input_return_type`; anything that is not a
        dictionary is handed back unchanged.
        """
        if not isinstance(encoded_samples, dict):
            return encoded_samples

        input_ids = encoded_samples["input_ids"]
        return self.input_return_type(input_ids, hf_tokenizer=self.hf_tokenizer)


With fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here, `HF_AfterBatchTransform`, that will do the decoding for us.

In [None]:
# export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """Sort key used by the `SortedDL`: the length of an example's input *after* tokenization.

    Pre-split inputs are measured as they are; anything else is first run through
    `hf_tokenizer.tokenize` (with `tok_kwargs`) and the resulting tokens are counted.
    """
    raw_input = example[0]

    if not is_split_into_words:
        return len(hf_tokenizer.tokenize(raw_input, **tok_kwargs))

    return len(raw_input)


In [None]:
#export
@delegates(TfmdDL)
class OverflowDL(SortedDL):
    """A `SortedDL` that buffers every collated batch and re-slices the buffer into batches of `self.bs` rows.

    The first element of each collated batch is expected to be a dictionary containing `overflow_map_key`;
    the length of that entry is used as the number of buffered rows.
    """

    def __init__(self, dataset, sort_func=None, res=None, overflow_map_key="overflow_to_sample_mapping", **kwargs):
        super().__init__(dataset, sort_func=sort_func, res=res, **kwargs)
        # key looked up in the first batch element to count rows and to find rows sharing a source sample
        self.overflow_map_key = overflow_map_key
        # buffer of collated rows not yet served; filled by `_add_batch`, drained by `_get_batch`
        self.batch_items = None

    def create_batches(self, samps):
        """Yield batches of `self.bs` rows sliced off the buffer as collated chunks are added to it."""
        if self.dataset is not None:
            self.it = iter(self.dataset)
        res = filter(lambda o: o is not None, map(self.do_item, samps))

        # `do_batch` (below) returns None, so `b` is unused here; iterating only drives the buffering side effect
        for b in map(self.do_batch, self.chunkify(res)):
            while self._n_batch_items() >= self.bs:
                yield self._get_batch()
        # NOTE(review): once the chunks are exhausted, any remainder smaller than `self.bs` is not yielded
        # and stays in `self.batch_items` — confirm this is intended.

    def do_batch(self, b):
        """Collate `b` via the parent class and append the result to the buffer (nothing is returned)."""
        b = super().do_batch(b)
        self._add_batch(b)

    def _add_batch(self, b):
        """Append the collated batch `b` to `self.batch_items` and bump `self.n` by the extra rows it holds."""
        if not self.batch_items:
            self.batch_items = b
        else:
            for i in range(len(b)):
                if isinstance(b[i], dict):
                    # dictionary elements are concatenated key by key
                    for k in self.batch_items[i].keys():
                        self.batch_items[i][k] = torch.cat([self.batch_items[i][k], b[i][k]])
                else:
                    # tensor elements are extended in place through `.data`
                    self.batch_items[i].data = torch.cat([self.batch_items[i], b[i]])

        # update "n" to reflect the additional samples: each distinct value in the overflow map that occurs
        # `c` times contributes `c - 1` extra rows
        # NOTE(review): this increment runs on every call, so `n` keeps growing — verify across epochs.
        overflow_map = b[0][self.overflow_map_key].numpy()
        self.n += np.sum([i - 1 for i in Counter(overflow_map).values()])

    def _get_batch(self):
        """Remove the first `self.bs` rows of every buffered element and return them as a tuple."""
        chunked_batch = []

        for i in range(len(self.batch_items)):
            if isinstance(self.batch_items[i], dict):
                chunked_d = {}
                for k in self.batch_items[i].keys():
                    # take the head of the buffer for this batch, keep the tail for later
                    chunked_d[k] = self.batch_items[i][k][: self.bs]
                    self.batch_items[i][k] = self.batch_items[i][k][self.bs :]

                chunked_batch.append(chunked_d)
            else:
                chunked_batch.append(self.batch_items[i][: self.bs])
                self.batch_items[i].data = self.batch_items[i][self.bs :]

        return tuplify(chunked_batch)

    def _n_batch_items(self):
        """Number of buffered rows (length of the overflow-map entry), or 0 when the buffer is empty/unset."""
        return len(self.batch_items[0][self.overflow_map_key]) if self.batch_items else 0

    def _one_pass(self):
        """Push dataset item 0 through batching and `after_batch` to record `_n_inp` and `_types`."""
        # this call also goes through `_add_batch`, so it adds to the buffer and may change `self.n`
        self.do_batch([self.do_item(0)])
        b = self._get_batch()
        if self.device is not None:
            b = to_device(b, self.device)
        its = self.after_batch(b)
        # a single (or non-sequence) output counts as one input; otherwise the last element is not an input
        self._n_inp = 1 if not isinstance(its, (list, tuple)) or len(its) == 1 else len(its) - 1
        self._types = explode_types(its)


In [None]:
# export
class HF_TextBlock(TransformBlock):
    """The core `TransformBlock` to prepare your data for training in Blurr with fastai's `DataBlock` API"""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch: str = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model: PreTrainedModel = None,
        # The before batch transform you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `HF_BeforeBatchTransform` created using the Hugging Face objects defined above)
        before_batch_tfm: HF_BeforeBatchTransform = None,
        # The batch_tfms to apply to the creation of your DataLoaders,
        # (defaults to HF_AfterBatchTransform created using the Hugging Face objects defined above)
        after_batch_tfm: HF_AfterBatchTransform = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: DataLoader = None,
        # Any keyword arguments you want applied to your before batch tfm
        before_batch_kwargs: dict = {},
        # Any keyword arguments you want applied to your after batch tfm (or referred to in fastai as `batch_tfms`)
        after_batch_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied with generating text
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `HF_TextBlock`
        **kwargs
    ):
        """Build the before-batch transform, the after-batch transform and the `DataLoader` type (each only
        when not supplied by the caller) and register them with the parent `TransformBlock`.

        Raises `ValueError` when neither a `before_batch_tfm` nor all four Hugging Face objects are given.
        """
        if (not all([hf_arch, hf_config, hf_tokenizer, hf_model])) and before_batch_tfm is None:
            raise ValueError(
                """You must supply the Hugging Face architecture, config, tokenizer, and model
                - or - an instances of HF_BeforeBatchTransform"""
            )

        if before_batch_tfm is None:
            # if allowing overflow, we have to ensure mixed batch items are the same shape
            if "return_overflowing_tokens" in tok_kwargs:
                padding = "max_length"

            # the caller's dictionaries are copied so the transform never shares them with the defaults
            before_batch_tfm = HF_BeforeBatchTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **before_batch_kwargs.copy()
            )

        if after_batch_tfm is None:
            after_batch_tfm = HF_AfterBatchTransform(
                hf_tokenizer=before_batch_tfm.hf_tokenizer, input_return_type=input_return_type, **after_batch_kwargs.copy()
            )

        if dl_type is None:
            dl_sort_func = partial(
                blurr_sort_func,
                hf_tokenizer=before_batch_tfm.hf_tokenizer,
                is_split_into_words=before_batch_tfm.is_split_into_words,
                tok_kwargs=before_batch_tfm.tok_kwargs.copy(),
            )

            # `OverflowDL` is a `DataLoader` that knows how to serve batches of items that are created on the fly as a
            # result of asking the tokenizer to return an input in chunks if the length > max_length
            if "return_overflowing_tokens" in before_batch_tfm.tok_kwargs:
                dl_type = partial(OverflowDL, sort_func=dl_sort_func)
            else:
                # the partial has to be assigned: previously it was built and discarded, which left `dl_type`
                # as None instead of the documented `SortedDL` default
                dl_type = partial(SortedDL, sort_func=dl_sort_func)

        # set the TransformBlock's Hugging Face objects
        self.hf_arch = before_batch_tfm.hf_arch
        self.hf_config = before_batch_tfm.hf_config
        self.hf_tokenizer = before_batch_tfm.hf_tokenizer
        self.hf_model = before_batch_tfm.hf_model

        return super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": before_batch_tfm}, batch_tfms=after_batch_tfm)



A basic wrapper that links default transforms for the data block API

`HF_TextBlock` has been dramatically simplified from its predecessor. It handles setting up your `HF_BeforeBatchTransform` and `HF_AfterBatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). You must either pass in your own instance of a `HF_BeforeBatchTransform` class or the Hugging Face architecture, config, tokenizer, and model via `hf_arch`, `hf_config`, `hf_tokenizer`, and `hf_model` (the other args are optional).

## Low-level API: For working with PyTorch and/or fast.ai Datasets & DataLoaders

Below is a low-level API for working with basic PyTorch Datasets (e.g., a dataset from the Hugging Face datasets library) and DataLoaders. Use the approach detailed below if you already have, or want to use, a plain ol' PyTorch `Dataset` instead of the fast.ai `DataBlock` API.

In [None]:
# export
@dataclass
class BlurrBatchCreator:
    """A callable that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to create batches that can be used in the Blurr library
    """

    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        self.hf_tokenizer = hf_tokenizer

        if data_collator:
            self.data_collator = data_collator
        else:
            self.data_collator = DataCollatorWithPadding(tokenizer=hf_tokenizer)

    def __call__(self, features):  # A mini-batch (list of examples to run through your model)
        """Collate `features` with `self.data_collator`.

        When the examples are dictionaries a 2-tuple is returned: the collated batch as a plain `dict`, followed
        by `batch["labels"]` if the examples carry a "labels" key. Any other kind of example yields the collated
        batch as is.
        """
        batch = self.data_collator(features)

        first_example = features[0]
        if not isinstance(first_example, dict):
            return batch

        inputs = dict(batch)
        # NOTE(review): without a "labels" key the second element is another copy of the inputs (the
        # conditional only ever applied to the second tuple element) — confirm that is the intended target.
        targets = batch["labels"] if "labels" in first_example else dict(batch)
        return inputs, targets


In [None]:
# export
class BlurrBatchTransform(HF_AfterBatchTransform):
    """An `HF_AfterBatchTransform` that additionally keeps the Hugging Face objects and tokenization/generation
    settings as attributes, so they can be retrieved from a `DataLoader`'s batch transforms
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str = None,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model
        hf_model: PreTrainedModel = None,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The token ID to ignore when calculating loss/metrics
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Any text generation keyword arguments
        text_gen_kwargs: dict = {},
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # Any other keyword arguments; kept on the instance as `kwargs`
        **kwargs
    ):
        # the parent stores the tokenizer and the type used when decoding inputs
        super().__init__(hf_tokenizer=hf_tokenizer, input_return_type=input_return_type)

        # everything else is kept verbatim as an attribute of the same name
        store_attr(
            self=self,
            names=(
                "hf_arch, hf_config, hf_model, tok_kwargs, text_gen_kwargs, "
                "is_split_into_words, ignore_token_id, kwargs"
            ),
        )



In [None]:
# export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A class that makes creating a fast.ai `DataLoader` that works with Blurr"""

    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent
        batch_creator: BlurrBatchCreator = None,
        # The batch_tfm used to decode Blurr batches (default: a `BlurrBatchTransform`)
        batch_tfm: BlurrBatchTransform = None,
        # (optional) A preprocessing function that will be applied to your dataset
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # (optional) list of corresponding labels names for classes; if included then methods like `show_batch` will
        # show the name corresponding to the label index vs. just the integer index.
        label_names: Optional[list] = None,
        # Keyword arguments to be applied to your `batch_tfm`
        batch_tfm_kwargs: dict = {},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs
    ):
        """Optionally preprocess `dataset`, fall back to Blurr's default batch creator/transform when none are
        given, initialise the underlying `TfmdDL`, and keep the Hugging Face objects and `label_names` as
        attributes.
        """
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch` / `after_batch` are always supplied explicitly below, so any copies arriving through
        # `kwargs` are dropped to avoid passing the same keyword twice
        # NOTE(review): this also discards the values `new` forwards for these two — confirm that is intended.
        kwargs.pop("create_batch", None)
        kwargs.pop("after_batch", None)

        if not batch_creator:
            batch_creator = BlurrBatchCreator(hf_tokenizer=hf_tokenizer)
        if not batch_tfm:
            batch_tfm = BlurrBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, **batch_tfm_kwargs.copy())

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_tfm, **kwargs)
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model, label_names")

    def new(
        self,
        # A standard PyTorch and fastai dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be "self" if None)
        cls: Type = None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs
    ):
        """We have to override the new method in order to add back the Hugging Face objects in this factory
        method (called for example in places like `show_results`). With the exception of the additions to the kwargs
        dictionary, the code below is pulled from the `DataLoaders.new` method as is.
        """
        if dataset is None:
            dataset = self.dataset
        if cls is None:
            cls = type(self)

        cur_kwargs = dict(
            dataset=dataset,
            num_workers=self.fake_l.num_workers,
            pin_memory=self.pin_memory,
            timeout=self.timeout,
            bs=self.bs,
            shuffle=self.shuffle,
            drop_last=self.drop_last,
            indexed=self.indexed,
            device=self.device,
        )

        # carry over every customised (non-bound-method) hook listed in `_methods`
        for n in self._methods:
            o = getattr(self, n)
            if not isinstance(o, MethodType):
                cur_kwargs[n] = o

        # the Hugging Face objects are required by `__init__`, so they are always taken from this loader
        kwargs["hf_arch"] = self.hf_arch
        kwargs["hf_config"] = self.hf_config
        kwargs["hf_tokenizer"] = self.hf_tokenizer
        kwargs["hf_model"] = self.hf_model
        # keep the label names too (unless the caller overrides them) so that the new loader shows the
        # same target names as this one instead of silently falling back to integer indices
        kwargs.setdefault("label_names", self.label_names)

        return cls(**merge(cur_kwargs, kwargs))


## Utility methods for getting blurr transforms

In [None]:
# export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform to find
    tfm_class: Transform = HF_BeforeBatchTransform,
):
    """Return the first transform in `tfms_list` whose type is `tfm_class` or a subclass of it,
    or `None` when there is no such transform
    """
    for candidate in tfms_list:
        if issubclass(type(candidate), tfm_class):
            return candidate

    return None


In [None]:
show_doc(get_blurr_tfm)


<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
# export
def first_blurr_tfm(
    dls: DataLoaders,  # Your fast.ai `DataLoaders`
    before_batch_tfm_class: Transform = HF_BeforeBatchTransform,  # The before_batch transform to look for
    blurr_batch_tfm_class: Transform = BlurrBatchTransform,  # The after_batch (or batch_tfm) to look for
):
    """Find the Blurr transform needed by methods such as `show_batch` and `show_results`: the
    `before_batch` transforms are searched first and the `after_batch` transforms are the fallback.
    The result is `None` when neither search finds a match.
    """
    # mid-level DataBlock API: the Blurr transform lives in the before_batch transforms
    before_tfm = get_blurr_tfm(dls.before_batch, tfm_class=before_batch_tfm_class)

    if not before_tfm:
        # low-level Blurr data API: look in the after_batch transforms instead
        return get_blurr_tfm(dls.after_batch, tfm_class=blurr_batch_tfm_class)

    return before_tfm


In [None]:
show_doc(first_blurr_tfm)


<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`before_batch_tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*, **`blurr_batch_tfm_class`**:`Transform`=*`BlurrBatchTransform`*)

This convenience method will find the first Blurr transform required for methods such as 
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`before_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The before_batch transform to look for</p>


 - **`blurr_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The after_batch (or batch_tfm) to look for</p>



## Base `show_batch` method

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `HF_BaseInput` typed inputs
    x: HF_BaseInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Display up to `max_n` decoded inputs next to their targets as a DataFrame and return `ctxs`.

    Targets are shown by name when `dataloaders` has a non-empty `label_names`; otherwise the raw
    label value(s) are shown. Nothing is displayed when there are no rows to show.
    """
    # grab our tokenizer
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer

    trg_labels = getattr(dataloaders, "label_names", None)

    res = L()
    n_inp = dataloaders.n_inp

    for idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if idx >= max_n:
            break

        rets = [hf_tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]]
        for item in sample[n_inp:]:
            if not torch.is_tensor(item):
                trg = item
            elif is_listy(item.tolist()):
                if trg_labels:
                    # multi-label target: list the names of every position flagged with a 1
                    trg = [trg_labels[lbl_idx] for lbl_idx, val in enumerate(label.numpy().tolist()) if val == 1]
                else:
                    # `.item()` only works for one-element tensors; show the full list of values otherwise
                    trg = label.item() if label.numel() == 1 else label.numpy().tolist()
            else:
                trg = trg_labels[label.item()] if trg_labels else label.item()

            rets.append(trg)
        res.append(tuplify(rets))

    # nothing to show (e.g. an empty batch or `max_n=0`); there is no first row to derive columns from
    if len(res) == 0:
        return ctxs

    # every row holds exactly one decoded text followed by its targets
    n_trgs = len(res[0]) - 1
    cols = ["text"] + ["target" if (i == 0) else f"target_{i}" for i in range(n_trgs)]
    display_df(pd.DataFrame(res, columns=cols)[:max_n])
    return ctxs


## Sequence classification

The examples below demonstrate how to construct your `DataBlock` for a sequence classification task (e.g., a model that requires a single text input) using the mid-level API, and how to do the same with the low-level API should you wish to work with standard PyTorch or fast.ai Datasets and DataLoaders.

### Using the mid-level API

In [None]:
path = untar_data(URLs.IMDB_SAMPLE)

model_path = Path("models")
imdb_df = pd.read_csv(path / "texts.csv")


In [None]:
imdb_df.head()


Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


#### Basic use

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

model_cls = AutoModelForSequenceClassification

pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


Once you have those elements, you can create your `DataBlock` as simply as shown below.

In [None]:
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 497]), 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"I'm not going to criticize the movie. There isn't that much to talk about. It has good animal actions scenes which were probably pretty astonishing at the time. Clyde Beatty isn't exactly a matinée idol. He's a little slight and not particularly good looking. But that's OK. He's the man in that lion cage. We know that when he can't take the time away from his lions to tend to his girlfriend, he will end up on an island with her and have to save the day. Someone said earlier that it is a history",negative
1,"Well, what are the odds! At the exact right moment that a few redneck amateur-scientists discover cave paintings indicating that some type of dinosaur monster might have inhabited the area thousands of years ago, a burning meteor crashes into the lake and spontaneously hatches a monster's egg that has been lying there for over a thousand years, I suppose! ""The Crater Lake Monster"" is a movie that literally must be seen to be believed, but you better do so in the company of many friends and a",negative


#### Working with long documents

There are two options when dealing with texts longer than your model can handle.

First, as illustrated above, you can simply provide a `truncation` strategy to ensure they are <= the maximum length your model can handle.

Second, in the case we want to process the entirety of each document regardless of length, we can split text greater than the max length allowed by our model and then treat each of these chunks as separate examples. This approach is accomplished by passing `"return_overflowing_tokens": True` to our tokenizer via `tok_kwargs`.

While the second approach is traditionally performed as part of the data preprocessing, blurr can do this on-the-fly when using its `OverflowDL` DataLoader (which is automatically used by blurr when you pass `"return_overflowing_tokens": True` in the `tok_kwargs` argument of `HF_TextBlock`). Below is an example of how this works.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

In [None]:
blocks = (
    HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=128, tok_kwargs={"return_overflowing_tokens": True, "stride": 2}),
    CategoryBlock,
)

dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())

dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
dls.show_batch(dataloaders=dls, max_n=4)

Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"yllic storyline would make the film critic proof. He was right, but it didn't fool me. Raising Victor Vargas is the story about a seventeen-year old boy called, you guessed it, Victor Vargas (Victor Rasuk) who lives his teenage years chasing more skirt than the Rolling Stones could do in all the years they've toured. The movie starts off in `Ugly Fat' Donna's bedroom where Victor is sure to seduce her, but a cry from outside disrupts his plans when his best-friend Harold (Kevin Rivera) comes-a-looking for him. Caught in the",negative
2,"in the attempt by Harold and his sister, Victor Vargas runs off for damage control. Yet even with the embarrassing implication that he's been boffing the homeliest girl in the neighborhood, nothing dissuades young Victor from going off on the hunt for more fresh meat. On a hot, New York City day they make way to the local public swimming pool where Victor's eyes catch a glimpse of the lovely young nymph Judy (Judy Marte), who's not just pretty, but a strong and independent too. The relationship that develops between Victor and Judy becomes the focus of the film. The story also focuses on",negative
3,"focuses on Victor's family that is comprised of his grandmother or abuelita (Altagracia Guzman), his brother Nino (also played by real life brother to Victor, Silvestre Rasuk) and his sister Vicky (Krystal Rodriguez). The action follows Victor between scenes with Judy and scenes with his family. Victor tries to cope with being an oversexed pimp-daddy, his feelings for Judy and his grandmother's conservative Catholic upbringing.<br /><br />The problems that arise from Raising Victor Vargas are a few, but glaring errors. Throughout the film you get to know",negative


### Using the low-level API

Step 1: Grab your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [None]:
raw_datasets["train"].features
raw_datasets["train"]


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [None]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` fields of `example` together as a pair, with truncation enabled."""
    sentence_pair = (example["sentence1"], example["sentence2"])
    return hf_tokenizer(*sentence_pair, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Step 2: Define any pre-processing that needs to be done to your datasets (optional)

In [None]:
# export
def preproc_hf_dataset(
    # A standard PyTorch Dataset or fast.ai Datasets
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model
    hf_model: PreTrainedModel,
):
    """This method can be used to preprocess most Hugging Face Datasets for use in Blurr and other training
    libraries
    """
    if ("label") in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")

    hf_model_fwd_args = list(inspect.signature(hf_model.forward).parameters.keys())
    bad_cols = set(dataset.column_names).difference(hf_model_fwd_args)
    dataset = dataset.remove_columns(bad_cols)

    dataset.set_format("torch")
    return dataset


Step 3: Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets.

Setting the `label_names` argument to a list of label names corresponding to each class's index will ensure the methods like `show_batch` and `show_results` print the name of the class rather than just its index.

In [None]:
# Class names taken from the raw training split's `label` feature; passed as `label_names` below so
# that `show_batch`/`show_results` can print a class name instead of its index
label_names = raw_datasets["train"].features["label"].names

# Training dataloader: built from the tokenized "train" split, shuffled, batches of 8.
# NOTE(review): `preproccesing_func` (sic) is the keyword as spelled here; presumably it matches
# `BlurrDataLoader`'s signature -- confirm before "correcting" the spelling
trn_dl = BlurrDataLoader(
    tokenized_datasets["train"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    label_names=label_names,
    shuffle=True,
    batch_size=8,
)

# Validation dataloader: built from the tokenized "validation" split, no `shuffle` argument, batches of 16
val_dl = BlurrDataLoader(
    tokenized_datasets["validation"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    label_names=label_names,
    batch_size=16,
)

# Wrap both loaders in a fast.ai `DataLoaders` (training first, validation second)
dls = DataLoaders(trn_dl, val_dl)


In [None]:
b = dls.one_batch()
b[0]["input_ids"].shape



torch.Size([8, 57])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Massachusetts is one of 12 states that does not have the death penalty, having abolished capital punishment in 1984. Massachusetts is one of 12 states without the death penalty, having abolished it in 1984.",equivalent
1,"St. Paul Chairman and Chief Executive Jay S. Fishman, 51, will be CEO of the combined company. Jay Fishman, 51, chairman and chief executive of St Paul, will be chief executive of the combined company.",equivalent


## Tests

The tests below ensure that the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronB

In [None]:
# hide
pretrained_model_names = [
    "albert-base-v1",
    "facebook/bart-base",
    "bert-base-uncased",
    "google/bigbird-roberta-base",
    "sshleifer/tiny-ctrl",
    "camembert-base",
    "sarnikowski/convbert-medium-small-da-cased",
    "microsoft/deberta-base",
    "microsoft/deberta-v2-xlarge",
    "distilbert-base-uncased",
    "monologg/electra-small-finetuned-imdb",
    "flaubert/flaubert_small_cased",
    "huggingface/funnel-small-base",
    "gpt2",
    "kssteven/ibert-roberta-base",
    "allenai/led-base-16384",
    "microsoft/layoutlm-base-uncased",
    "allenai/longformer-base-4096",
    "sshleifer/tiny-mbart",
    "microsoft/mpnet-base",
    "google/mobilebert-uncased",
    "openai-gpt",
    #'reformer-enwik8',                  # (see model card; does not work with/require a tokenizer so no bueno here)
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    #'google/tapas-base',                # (requires pip install torch-scatter)
    "transfo-xl-wt103",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
# hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
# hide
path = untar_data(URLs.IMDB_SAMPLE)

model_path = Path("models")
imdb_df = pd.read_csv(path / "texts.csv")



In [None]:
# hide
from transformers import RobertaTokenizer

model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

# One (architecture, tokenizer class name, model name, "PASSED"/"FAILED", error) tuple per model
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # the I-BERT checkpoint is loaded with an explicit `RobertaTokenizer`; every other model passes `None`
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    try:
        # building the DataLoaders happens inside the `try` so that a model which fails here is
        # recorded as FAILED instead of aborting the sweep over the remaining models
        blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

        # only record success once every check, including `show_batch`, has run; recording it earlier
        # left both a PASSED and a FAILED row for the same model whenever `show_batch` raised
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))



=== albert-base-v1 ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some of the best movies that are categorized as ""comedies"" actually blur between comedy and drama. ""the graduate"" and ""butch cassidy and the sundance kid"", which were made also in the late 1960's are perfect examples. are they comedies with dramatic undertones, or dramas with a lot of humor? in many respects, ""the odd couple"" falls into this same category of being both comedy yet highly dramatic with deep underpinnings about human nature. much of what happens may be funny to the audience but the characters are not laughing.br /br",positive
1,"there's hardly anything at all to recommend this movie. chase masterson is always nice to look at and actually can act, though her role in this clunker is a waste. unfortunately the rest of the cast ranges from bad to mediocre. in a lot of films like this someone will shine through the material and you make a note of them for future reference. no such luck here. creature unknown"" a cliched monster-on-the-loose flick with the kids getting knocked off one after the other. the monster is a man in a rubber suit which hearkens back to the days of paul",negative


=== facebook/bart-base ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,I really enjoyed Girl Fight. It something I could watch over and over again. The acting was Fantastic and i thought Michelle Rodriguez did a good job in the film. Very convincing might I say. The movie is showing how women should stand up for what they want to do in life. She had so much compassion and yet so much hate at the same time. Dealing with a ignorant dad didn't really help her much. Even though he loved her he was really hateful. Her mother died when she was younger and that also put some sadness in the role. The love story was a part that i really enjoyed in the movie also,positive
1,"Kate Beckinsale steals the show! Bravo! Too bad Knightly ins't as good looking as Jeremy Northam. Mark Strong did a fabulous job. Bernard Hepton was perfect as Emmas father. I love the end scene (which is an addition to the novel-but well written) when the harvest is in and Knightly dines with his workers and high society friends. Emma must show that she accepts this now. She is a changed woman. That is too much too quick, but OK. I'll buy into it. Samantha Bond plays Emma's ex-governess and confidant. She is wonderful. just",positive


=== bert-base-uncased ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i was pleasantly surprised i quite liked this movie. witty writing ( some "" inside "" jokes i got, others i didn't - maybe due to actors speaking on top of one another ), great acting ( notably john cassini ), great cameos, interesting and unique directing. i rented it to see jeffrey meek ( very disappointed he was in it such a short time, blink and you'll miss him! ) but found the movie remarkably entertaining. i'll actually watch it again before i send back to netflix. i think actors and wanna - be actors will thoroughly enjoy this movie. the ending is somewhat expected but",positive
1,"sometimes they get lucky and have a hit on their hands ( wayne's world, the first one, not the second ). but most often they have duds ( it's pat comes to mind rather quickly ). this time out it's tim meadows as the ladies man. this movie falls somewhere in between a hit and a dud. it was very funny for the first 20 minutes, but then, as usually happens with snl skits, it starts to slow down, before finally ending, long after it should have. < br / > < br / > tim meadows is leon phelps, a radio dj",positive


=== google/bigbird-roberta-base ===



normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Val Kilmer and Dylan McDermott are terrific. I have seen Kilmer on The Doors, however his interpretation of John Holmes is superb. Nothing compared to Boogie Nights which was kind of slow. Wonderland is a movie which is able to show you a horrible crime story from the perspective from a guy who is just indulged in his drug vice and indolent of what ever happens around. At the same time, the John Holmes character shows a very clever hustler who is able to pass through the nastiest and ugliest situations almost unharmed. The movie deserves being watched more than once. The seventies amb",positive
1,"I watched this movie, and hoped for something to get better the entire time. What is so great about a guy with no emotion? *yawn*<br /><br />You never see Alex show emotion for anyone other than his son. Yeah, I know that this is why his son is the only one to cause him to lose his temper (if you can call it that), I get it.<br /><br />Characters are undeveloped, relationships aren't given enough time to be understood. In one scene Sarah says they won't fall in love, and the next time we see her she's talking about",negative


=== sshleifer/tiny-ctrl ===



Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Dreck about three beautiful women in California who go to cover some festival (or something). All the hotels are booked so they have to spend the night in a creepy old house. What they don't know is that there is a creepy inhabitant there who likes to kill...<br /><br />Yawn. Boring, pointless, utterly stupid ""horror"" film. Bach and her two buddies are certainly beautiful but the movie itself is dull dull DULL! Bach and her friends are no actresses--their faces are blank all the way through. The final ""revelation"" is laughably predictable and there's no blood or gore to keep you interested along the way.",negative
1,"great mystery, but the film goes down hill from there. The beginning is promising with a car wreck and a woman and her daughter being burned alive in front of a police officer, Edward. He is traumatized over this and is seen popping pills. A mysterious letter turns up from an old girlfriend asking for help in finding her missing daughter. So Ed travels to an island commune of mainly woman. They don't like outsiders. A lot of filler is with Ed shown looking around town for the girl. That made the movie too long. It finally gets a little better toward the end when we learn of the crazy rituals the woman perform and finally of the sinister plan in store for Edward.<b@@",negative


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"And that's why historic/biographic movies are so important to all of us, moreover when they are so well done, like this one!<br /><br />Before I saw ""The Young Victoria"", I knew a few things about Queen Victoria, but in the end I got much more knowledge about it. <br /><br />Emily Blunt is simply GREAT as Victoria (Who would guess that!) and She probably will get a nomination at",positive
1,"Only the chosen ones will appreciate the quality of the story and character design of this movie. Superior ancients that dwell in the lands of lore far beyond any average human creature's understanding. This movie pulls the adventure genre into a unique centrifugal magical force of fantasy unto thee mystical crystals of chalice. Stories come and go, but the idea for a good story is to think positive, not negative though",positive


=== sarnikowski/convbert-medium-small-da-cased ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Silverlake Life, The view from here, is an absolutely stunning movie about AIDS as well as about a gay love relationship. Some images are indeed really hard to take, especially when one is gay or fears about AIDS, and probably for any sensitive person watching it. It's not easy to make a movie about such a terrible illness and its consequences about not only one, but two people's",positive
1,"A retired diplomat, played nicely by Michael York, goes to Russia to get revenge on the Russian gangster that murdered the diplomat's policeman son. There the diplomat meets an exceptionally strong and decent Russian cop who helps him bring the Russian gangster to justice. < br / > < br / > I remembered the old action flicks of the 1980s that always portray the Russians as evil bad guys out to undermine the",negative


=== microsoft/deberta-base ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Well, because I'm a musician I thought, maybe I'll check this movie out on TMC, nothing else good on. One of the worst mistakes of my life so far, and it's only half done. I seriously thought it was one of those soft core movies with crappy plot and crappy acting, crappy filming and crappy effects. But nope, I don't even get the pleasure of that. Even the ""musicians"" weren't very good. I was hoping for maybe some laughs, but I wasn't sure if they were attempting to throw in one-liners or not. But now I have to sit here",negative
1,"I really enjoyed this episode, which was a great surprise given the bad reputation it seems to have acquired. From a pure writing perspective, 'The 16mm shrine' is an absolute treat, with fantastic dialogue and character analysis, typical of Sterling. In particular I really enjoyed the philosophical indulgences of the episode, tackling themes of existence and reality, whilst balancing it with more psychological topics such as denial, pride, and desire. 'The sixteen-millimeter shrine' is an episode about how these ideas based around an unwillingness to accept change can seemingly alienate a person from the rest of the ever-changing world. It is",positive


=== microsoft/deberta-v2-xlarge ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"I've always knew Anne DeSalvo was a great character actor, now I know she is a great writer/director also. I have been a fan since I first saw her in the movies ""Perfect"", ""My Favorite Year"", ""DC Cab"" and ""Stardust Memories"".<br /><br />It's so rare to see Lee Grant these days in anything. She has been missing from the screen for far too long. It's also wonderful to see Cloris Leachman in something other than a sit-com. This is her best work since ""the Last Picture Show"". If you grew up in",positive
1,"This is a very dramatic and suspenseful movie. There are many plots and turns. The story or the director opens question marks on the death row or presumed crimes committed by black people. This film is very well directed by Arne Glimcher and the fine sound of James Newton Howard is excellent. Strong performance of Sean Connery and Ed Harris. If you liked this one don ́t miss ""TRUE CRIMES"" or ""THE HURRICANE"". My wife and me gave 8/10.",positive


=== distilbert-base-uncased ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the first half was ok, but the last half really, really disappointed. it's funny the producers even admitted they didn't have a clue for the ending, and it really showed. whats really sad is i have to write ten lines of comment minimum to be able to post this. i really didn't want to include spoilers to qualify my remarks since the show isn't really worth that effort. when battlestar galatica first came out i was really excited with the prospect of a better remake, it didn't happen that first season border on being space porn. they eventually cleaned it up a bit",negative
1,"what the * bliep * is it with this movie? couldn't they fiend a better script? all in all a'nice'movie, but... it has been done more than once... up till the end i thought it was okay, but... the going back to the past part... * barf * so corny... was waiting for the fairy god mother to appear... but wow, that didn't happen... which is good. < br / > < br / > i loved big with tom hanks, but to see",negative


=== monologg/electra-small-finetuned-imdb ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i should have known i was in trouble with casper van diem as the lead character. words cannot describe, nor do they do justice to just how terrible this movie was. but please allow me to try to describe it : horrible acting, terrible dialog, corny situations and through it all you get the feeling that you are being force - fed the beliefs and propeganda from the trinity broadcasting network. its a weak attempt at trying to show hollywood that a movie can be entertaining and have a deep, religious message attached to it. they failed miserably. it was clearly the worst movie i have seen in a long",negative
1,"had fun watching this film.. despite the feeling i got a lot of the time, that this film was almost copying monsters inc. there're quite a few things that are extremely similar between the two, the relationship between an animal / monster and a small child, other animals trying to break that relationship, etc. it felt like that pretty much throughout the film, to me. < br / > < br / > one of the redeeming features though, is scrat : ) very very funny character, even if he serves no purpose : )",positive


=== flaubert/flaubert_small_cased ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"I saw this movie recently. 2 hours later, my head still hurt from laughing. The plot was soo awful, the jokes were soo bad, but what I didn' t count on were : < br / > < br / > 1. the 2 scenes before and after the movie that had Pat and Jay posing ( that caused more than enough laughter ) < br / > < br / > 2. The kick through the windshield that decapitated the evil-doer. < b",positive
1,"Silverlake Life, The view from here, is an absolutely stunning movie about AIDS as well as about a gay love relationship. Some images are indeed really hard to take, especially when one is gay or fears about AIDS, and probably for any sensitive person watching it. It' s not easy to make a movie about such a terrible illness and its consequences about not only one, but two people' s lives. This movie teaches how to care for each other in such",positive


=== huggingface/funnel-small-base ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"being half - portuguese doesn't render me half - blind ( nor half - prejudiced ) when discussing portuguese films. not that i get to do that very often anyway. but this film was such a rush of adrenaline! yes, that's right - it was mostly accurate as far as history went / goes - but it pulled no punches on venturing beyond usual portuguese - film territory : things like using real locations in the middle of traffic - congested lisbon and recruiting a real crowd to stand in for the real crowd of almost 30 years ago. and by god did they get it right! ok, to sum it up",positive
1,ko to tamo peva is the best comedy of all times. believe me i saw a lot of movies and comedies but tell me which one make you smile every time you watching it. but truth is that the humour in this comedy is special. it is caratherisic for serbia. and all former republic of yugoslavia know it very well!!! so i think the rest of audience ( for example : in europe ) can't enjoy it so much. because the subtitles ruin the hole thing. but they should at least try!!!! yes it is ironic! this is the best flick in serbian history,positive


=== gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Such great actors such a disappointment. Marlon Brando plays and awful character, the movie is not funny at all, a subconscious message can be seen ""IT IS A DAMN CRAP!!!"", ""IT SUUCKS SO BADLY!!"", ""THROW YOUR TV THROUGH WINDOW"", and so on. It is simply disgusting and irksome. In addition to foolish plot, sense of humor, there is something else. The way the rooms are decorated, the colors. It makes me sick, everything is so colourful that it might cause epilepsy. Usually I do not care about the decoration in movie but this from",negative
1,"I was very excited about seeing this film, anticipating a visual excursus on the relation of artistic beauty and nature, containing the kinds of wisdom the likes of ""Rivers and Tides."" However, that's not what I received. Instead, I get a fairly uninspired film about how human industry is bad for nature. Which is clearly a quite unorthodox claim.<br /><br />The photographer seems conflicted about the aesthetic qualities of his images and the supposed ""ethical"" duty he has to the workers occasionally peopling the images, along the periphery. And frankly, the images were not generally that impressive. And according to this",negative


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"This is a very old and cheaply made film--a typical low-budget B-Western in so many ways. Gary Cooper was not yet a star and this film is highly reminiscent of the early films of John Wayne that were done for ""poverty row"" studios. With both actors, their familiar style and persona were still not completely formed. This incarnation of Gary Cooper doesn't seem exactly like the Cooper of just a few years later (he talks faster in this early film, among other things).<br /><br />However, unlike the average B-movie of the era, there are at least a few interesting elements",negative
1,"I first encountered this show when I was staying in Japan for six months last year. I found it in the internet when I was looking for sub-titled dramas to help me with my Japanese. My host mother warned me to stay away from it because she thought it was weird, but I found it delightful! Koyuki showed such conflicting character traits and Matsujun's spirit made my day every time I tuned in! I first saw him on ""Hana Yori Dango"", but I liked him much better in this!<br /><br />Although the characters are interesting and well-developed, I was",positive


=== allenai/led-base-16384 ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"My mother worked with Dennis L. Raider for eleven years, not to mention shared an office with him. When it was announced he was BTK, she was shocked. The whole day was just her telling stories about how she never would have seen him as the Wichita Killer. I've heard her re-tell them many times. I've inquired her about a lot of things, and gone to all the interviews that she was asked to go to. I've read the entire book written about Raider, Wichita is my hometown and I was surprised that such a thing could happen in Kansas.<br /><br />There was another BT",negative
1,"As anyone old enough knows, South Africa long suffered under the vile, racist oppression of apartheid, which completely subjugated the black population. One of the most famous anti-apartheid activists was Steve Biko, who was murdered in jail. Following the murder, reporter Donald Woods sought to get Biko's message out to the world.<br /><br />In ""Cry Freedom"", Woods (Kevin Kline) befriends Biko (Denzel Washington) before the latter is arrested on trumped up charges. When Woods attempts to spread Biko's word, he and his family begin living under threat of attack, and",positive


=== microsoft/layoutlm-base-uncased ===

architecture:	layoutlm
tokenizer:	LayoutLMTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"dominion tank police is without a shell of a doubt, one of the most amazing shows ever produced, but not just in the field of animation. while the first part ( acts 1 and 2 ) mostly consists of action and fun, the second part is more serious and one should not treat the second part in the exact same way as first part. the subtleties are truly out of this world and the characterization is beyond brilliant. you must have an extra degree of intelligence to appreciate the intricacies of the second part ( acts - 3 and 4 ). i do have some complaints though. in the first part, the tank bonaparte",positive
1,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative


=== allenai/longformer-base-4096 ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"The first half was OK, but the last half really, really disappointed. It's funny the producers even admitted they didn't have a clue for the ending, and it really showed. Whats really sad is i have to write ten lines of comment minimum to be able to post this. I really didn't want to include spoilers to qualify my remarks since the show isn't really worth that effort. When Battlestar galatica first came out I was really excited with the prospect of a better remake, it didn't happen that first season border on being space porn. They eventually cleaned it up a bit and actually had some pretty",negative
1,"Anita and Me seems to be little more than an excuse for Meera Syal, the author of the novel and screenplay, to air her prejudices, grievances and general antipathy towards the English. The general sentiment of Indian superiority over the English in this film is foul.<br /><br />The English people in this film are portrayed as overweight, violent, foul-mouthed, promiscuous, engaging in child neglect, stupid, uneducated, racist, ugly, eating poor food, and dim-witted -- tellingly, only by turning to Indian culture can the local priest be ""redeemed"" at",negative


=== sshleifer/tiny-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Watching It Lives By Night makes you wonder, just who in the world greenlit this crap. A newlywed couple go spelunking on their honeymoon, get attacked by bats and the husband starts to run around in his pajamas attacking various people. And where exactly are they? They're in the desert, then they're skiing, then they're in a small town that looks like it has mountains nearby. The town is run by a sheriff who likes to watch and has a personal vendetta against whiny doctor boy. The ski hospital",negative
1,"Thin story concerns two small town brothers and their struggles over family honor. David Morse is the responsible, straight-laced cop and 'good' brother; Viggo Mortensen, the 'bad' boy, is a former soldier and ex-convict. As an actor (particularly in his earliest years), Sean Penn seems to have modulated his performances under the Method. Turning first-time writer and director for this arty, obtuse drama, he works his script and characters out through the same methodical process, slowing the pacing down to",negative


=== microsoft/mpnet-base ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the only redeeming part of this movie was the price i paid. at least all i lost was $ 3. 00 and the time elapsed sitting through this bomb. the crew member who was in charge of continuity missed the boat. when the female lead and the fbi guy went to the alleged killers location, mr. fbi handed the female a revolver. when the alleged killer came out the door, the revolver has magically transformed into an automatic. one is left to ponder would an fbp agent hand a weapon to a civilian? i think not. ms. xavier appears to be a very attractive female. it",negative
1,"i was excited to view a cataluna´s film in the berlin´s competition. but after the presentation i was total disappointed and furious. too much blood, too much time, too much themes for nothing. the spanish civil war, like every war, was horrible. the revenge, a very human behavior, not pretty at all, is shown in uncountable films and plays, as well as the relations between homosexuals and the scepticism in spain about catholicism. but what mr villaronga try, is a pseudo tragedy that can belongs to the worst of the film´s history. it is really a pity",negative


=== google/mobilebert-uncased ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"have you ever sat watching a movie when 20 or 30 minutes have gone by and suddenly you realize that you have actually seen the movie before? that happened to me with "" the young graduates "". the cover of the video box, if you can find the video, is extremely deceiving. i'd swear that the two women on the cover aren't even in the film. < br / > < br / > anyway, i was either born a decade too late to appreciate the finer points of this film or... it is simply pointless junk. i'm heavily leaning toward the latter but i guess some",negative
1,"this is the most disturbing film i have ever seen. it makes "" requiem for a dream "" look like a disney film. although, technically, it is reasonably well made, acting, cinematography, music, directing, etc., are good. however, the concluding gang rape scene is the most appalling and violent thing i have ever seen and i really wish i had not seen it. i am afraid that it will haunt me for the rest of my life. although i think anyone would find the film extremely disturbing, my wife and some of her friends were victimized in a very similar manner and i really didn't",negative


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i was required to watch the movie for my work, so i didn't pay for it ( on the contrary, i got paid ), but i still found the movie to suck far more than average. the jokes were lame, the two lead actresses... well, to use the "" first wives club "" division of women's ages in hollywood, they are no longer in their "" hot chick "" age but more in their "" district attorney "" age. what angered me most about the movie was the main plot line, which pretty much completely plagiarized "" beavis & butthead do america "" ( in which the",negative
1,"want to watch a scary horror film? then steer clear of this one. there's not enough beer in the world to make this film enjoyable. < br / > < br / > however, there is enough scotch. single - malt, if you can manage it. < br / > < br / > if the previous comments weren't enough to keep you from watching this film sober, allow me to assist. nasa sends one man and two unpaid extras into space to orbit saturn. a really big solar flare causes colonel steve west to bleed from the nose. things go downhill from there, and wackiness ensues.",negative


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Does any one know what the 2 sports cars were? I think Robert Stack's might have been a Masseratti.Rock Hudson's character told his father he was taking a job in Iraq,isn't that timely? I have had Dorthy Malone in my spank bank most of my life,maybe this was the film that impressed me.Loren Bacall sure did have some chops in this film and probably out-acted Malone but Malones's part made a more sensational impact so she got the Oscar for best supporting role.Was Loren's part considered a leading role?Old man Hadley character was was probably a",positive
1,"Despite pretty bad reviews, I just had to give this film a go  it does, after all, star HK super-babe Shu Qi plus 6 other oriental lovelies as a team of all-action cat-burglars. Surely that's worth checking out? Well, as babe-fests go, Martial Angels is hard to beat. The eye candy is top quality. Shu Qi looks as fantastic as always, and of the rest of the girls, Rosemary Vandebrouck and Amanda Strang caught my roving eye in particular.<br /><br />Unfortunately, if one is to",negative


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"this was a horrible film! i gave it 2 points, one for angelina jolie and a second one for the beautiful porsche in the beginning... other than that the story just plain sucked and cars racing through cities wasn't so new in 1970. the happyend was probably what annoyed me the most, seldomly seen anything so constructed!",negative
1,"ah, a kelly / sinatra sailor - suit musical. so familiar, right? yes, but this isn't the one you usually hear about. on the town's that - a - way. but if you stick around, you might learn something. okay, probably not. anyway, anchors aweigh tells the story of two sailors on a three - or four - day leave. joe is the "" sea wolf "" and clarence, the bookish type, begs joe to get him a "" dame "". now, after they're picked up by the coppers they get little donald home. that's where they",positive


=== transfo-xl-wt103 ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"The case of the Scorpion's tail is a highly stylish giallo directed by Sergio Martino, who appears to be a giallo master second only to Dario Argento. < br / > < br / > Ernesto Gastaldi wrote this fabulous who-, quite complex but ultimately very satisfying and entertaining murder mystery. It also makes sense in the end, a big plus, 'cause that's not always the case for these giallo's, as they tend to stretch credibility with their endless red-herrings and ultimate solutions. Here, the less you know about the plot, the better. < br / > < br / > Pure giallo trademarks present here are the beautiful",positive
1,"John Carradine, John Ireland, and Faith who as players all saw better days in better films got together for this Grade G horror film about life imitating art in a mysterious mansion. < br / > < br / > For Carradine it was in those last two decades of his career that he appeared in anything on the theory it was better to keep working no matter what you did and get those paychecks coming in. With that magnificent sonorous voice of his, Carradine was always in great demand for horror pictures and the man did not discriminate in the least in what he appeared in. < br / > < br / > He plays the caretaker of an",negative


=== xlm-mlm-en-2048 ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i just watched this movie and have to say, i was very impressed. it's very creepy and has numerous moments that will make you jump out of seat! i had to smoke several "" emergency "" cigarettes along the way to calm my nerves! if i had to criticise, i 'd say that perhaps if anything, there were too many jump moments. it got to the point where every single new scene climaxed with a jump and this gradually wore away the startling effect, because you kind of new what was coming. < br / > < br / > although it contains virtually every cliche in the ghost genre,",positive
1,"del - "" you are the dumbest smart person i've ever met. "" < br / > < br / > calvin- "" well, i had a brain, but they lost it in the re-writes. "" < br / > < br / > i think what i find most egregious about this bastardization of asimov's work was how the character of susan calvin was portrayed. in the books, she was actually one of the first strong female protagonists, able to think her way through a problem. here she's just a damsel in distress, waiting to be rescued by wil smith.",negative


=== xlm-roberta-base ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"'Stanley and Iris' show the triumph of the human spirit. For Stanley, it's the struggle to become literate and realize his potential. For Iris, it's to find the courage to love again after becoming a widow. The beauty of the movie is the dance that Robert DeNiro and Jane Fonda do together, starting and stopping, before each has the skills and courage to completely trust each other and move on. In that sense it very nicely gives us a good view of how life often is, thus being credible. Unlike some other reviewers I found the characters each",positive
1,"If you loved Long Way Round you will enjoy this nearly as much. It is educational, funny, interesting and tense. Charley shares the screen with two interesting teammates, two tired mechanics, two excellent cameramen and too much Russ. Ewan makes a few appearances but Charley really pulls it off alone. He is funny, engaging and still a puddle of stress and doubt. Great stuff!<br /><br />The series wraps up in 7 episodes. Like LWR, the preparation is nearly as interesting as the race. Though they cover the ins and out",positive


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,This movie was the most out of line and liberally fed movie i have ever seen in my life. (Besides Farenheit 9/11). All of the information was only supported on the opinion of FIVE scientists while 80% of the Asssociated Press highly criticize the science promoted be Gore. Global Warming is a Mass Media Hysteria and nothing more. Most of the information in the movie was either misquoted or it was wrong all together. THis movie has been investigated over and over again and has been shown evidence against that prove its lies were nothing but lies.<br />,negative
1,"Ho-hum. An inventor's(Horst Buchholz)deadly biological weapon is in danger of falling into the wrong hands. Unknowingly his son(Luke Perry)has been working on the antedote all along. Enter CIA agent Olivia d'Abo and the cat-and-mouse car chases and gunfire begins. Also in the cast are:Tom Conti, Hendrick Haese and an aging Roger Moore. Moore seems to haggardly move through this mess definitely not one of his better efforts. Perry fans will be accepting. d'Abo is wrong for",negative


In [None]:
# hide_input
# Tabulate the accumulated `test_results` rows (one row per entry, five fields each) and render the table.
# NOTE(review): `test_results` is populated earlier in the notebook, outside this cell — confirm its row layout matches the column order below.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,albert-base-v1,PASSED,
1,bart,BartTokenizerFast,facebook/bart-base,PASSED,
2,bert,BertTokenizerFast,bert-base-uncased,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,ctrl,CTRLTokenizer,sshleifer/tiny-ctrl,PASSED,
5,camembert,CamembertTokenizerFast,camembert-base,PASSED,
6,convbert,ConvBertTokenizerFast,sarnikowski/convbert-medium-small-da-cased,PASSED,
7,deberta,DebertaTokenizerFast,microsoft/deberta-base,PASSED,
8,deberta_v2,DebertaV2Tokenizer,microsoft/deberta-v2-xlarge,PASSED,
9,distilbert,DistilBertTokenizerFast,distilbert-base-uncased,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental bits for all data preprocessing tasks.

In [None]:
# hide
# Final cell of the notebook: run nbdev's notebook-to-script export step.
from nbdev.export import notebook2script

# Called with no arguments; in the recorded run this reported every project notebook
# (00_utils.ipynb through index.ipynb) as "Converted".
# NOTE(review): presumably this writes the `# export` cells into the library modules — confirm against the nbdev docs.
notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
