In [None]:
#|default_exp text.data.core
#|default_cls_lvl 3


In [None]:
#| nbflags skip_exec


In [None]:
#|hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# text.data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to turn your raw datasets into modelable `DataLoaders` for text/NLP tasks

In [None]:
#|export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from datasets import Dataset, load_dataset, concatenate_datasets
from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    PretrainedConfig,
    PreTrainedTokenizerBase,
    PreTrainedModel,
    logging,
)

from blurr.text.utils import get_hf_objects

logging.set_verbosity_error()


In [None]:
#| echo: false
import pdb

from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock, ItemGetter, RandomSplitter
from fastcore.test import *
from nbdev.showdoc import show_doc

from blurr.text.utils import BlurrText
from blurr.utils import print_versions

# Helper instance from `blurr.text.utils`, bound to a short module-level name
NLP = BlurrText()


# NOTE(review): presumably set to silence the Hugging Face tokenizers parallelism/fork warning — confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Record the library versions this documentation was generated with
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.12.0
fastai: 2.7.7
transformers: 4.20.1


In [None]:
#|hide
#|cuda
# NOTE(review): the GPU index is hard-coded to 1 — this presumably matches the author's
# multi-GPU machine; confirm/adjust the index before running on a different setup.
torch.cuda.set_device(1)
# Report which device was selected
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: NVIDIA GeForce RTX 3080


## Setup

We'll use a subset of `imdb` to demonstrate how to configure BLURR for sequence classification tasks.

In [None]:
# Load the IMDB train/test splits (a list of two `Dataset`s) and tag each example with the
# split it came from so both can live in a single dataset
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Sample 1,000 training and 200 validation examples. The shuffles are seeded so the subset
# (and therefore every output shown below) is the same on each "Restart & Run All".
final_ds = concatenate_datasets(
    [
        raw_datasets[0].shuffle(seed=42).select(range(1000)),
        raw_datasets[1].shuffle(seed=42).select(range(200)),
    ]
)
imdb_df = pd.DataFrame(final_ds)
imdb_df.head()


Reusing dataset imdb (/home/dev/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,text,label,is_valid
0,"I guess when ""Beat Street"" made a national appearance, ""Flashdance"" came at the same time. The problem with ""Flashdance"" is that there was only one break dancing scene and the rest was jazz dance and ballet. That was one of the reasons why ""Beat Street"" was better. The only movie that could rival ""Beat Street"" seems to be ""Footloose"", because both movies focused on how dance had been used by people to express their utmost feelings.<br /><br />The break-dance scenes in ""Beat Street"" come just before the middle and at the end of the flick. And I loved all of them. Almost all of the break tri...",1,False
1,"I was lucky enough to watch this without any pre viewing hype. I was surprised at the resilience of the ghost's image in my mind the next day, and the day after that. I've watched it 3-4 times, and each time I appreciate it even more. The settings are gorgeous, the town at dusk has beautiful lighting effects, the marsh long shots, and the house itself is sufficiently grown with moss. The main hero is so likable and good natured, that he is easily sympathized with. To the person who complained that there wasn't enough 'spark' in this film, I'd say that it's because the whole fight against t...",1,False
2,"Really bad. Why anyone thinks this is a good film let alone funny is a true mystery. I like comedies as much as the next man and I LOVED ""A Christmas Story."" The fact that it has the same director and was based on the same writer's memoirs has me completely puzzled as to why this film is such a complete failure on every level. Charles Grodin is woefully miscast as the father for starters. For another it does not seem to have the same pacing -- it just doesn't flow well. Everything seems tired and forced. The joy of life that permeated the first film is completely absent here -- you just wa...",0,False
3,"I remember the original series vividly mostly due to it's unique blend of wry humor and macabre subject matter. Kolchak was hard-bitten newsman from the Ben Hecht school of big-city reporting, and his gritty determination and wise-ass demeanor made even the most mundane episode eminently watchable. My personal fave was ""The Spanish Moss Murders"" due to it's totally original storyline. A poor,troubled Cajun youth from Louisiana bayou country, takes part in a sleep research experiment, for the purpose of dream analysis. Something goes inexplicably wrong, and he literally dreams to life a swa...",1,False
4,"I LOVED this movie! I am biased seeing as I am a huge Disney fan, but I really enjoyed myself. The action takes off running in the beginning of the film and just keeps going! This is a bit of a departure for Disney, they don't spend quite as much time on character development (my husband pointed this out)and there are no musical numbers. It is strictly action adventure. I thoroughly enjoyed it and recommend it to anyone who loves Disney, be they young or old.",1,False


In [None]:
# Class names come from the dataset's "label" feature; the list position is the label id
labels = raw_datasets[0].features["label"].names
labels


['neg', 'pos']

In [None]:
# The Hugging Face auto-class used to build the sequence classification model
model_cls = AutoModelForSequenceClassification

# Checkpoint to load; the trailing comment keeps an alternative checkpoint name at hand
pretrained_model_name = "roberta-base"  # "bert-base-multilingual-cased"
n_labels = len(labels)

# Fetch the architecture name, config, tokenizer and model in one call; `num_labels` is
# passed through `config_kwargs` so the model is configured for our number of classes
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls, config_kwargs={"num_labels": n_labels}
)

# Display what we got back
hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)


loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, wil

('roberta',
 transformers.models.roberta.configuration_roberta.RobertaConfig,
 transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
 transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification)

## Preprocessing

Starting with version 2.0, `BLURR` provides a preprocessing base class that can be used to build task specific pre-processed datasets from pandas DataFrames or Hugging Face Datasets

### `Preprocessor` -

In [None]:
#|export
class Preprocessor:
    """Base class for building task-specific preprocessed datasets from DataFrames or Hugging Face Datasets"""

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The attribute holding the text
        text_attr: str = "text",
        # The attribute holding the text_pair
        text_pair_attr: Optional[str] = None,
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
    ):
        self.hf_tokenizer = hf_tokenizer
        self.batch_size = batch_size
        self.text_attr, self.text_pair_attr = text_attr, text_pair_attr
        self.is_valid_attr = is_valid_attr

        # Work on a private copy: writing the "truncation" default straight into `tok_kwargs` would
        # mutate the caller's dict and, worse, the shared `{}` default used by every other instance
        self.tok_kwargs = dict(tok_kwargs or {})
        self.tok_kwargs.setdefault("truncation", True)

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Returns a copy of `training_df`. If `validation_df` is given, it is appended to that copy and,
        when `is_valid_attr` is set, both parts are tagged with that column (False = training, True = validation).
        The inputs themselves are never modified.
        """
        df = training_df.copy()

        # concatenate the validation dataset if it is included
        if validation_df is not None:
            valid_df = validation_df.copy()
            # add an "is_valid_attr" column to both training/validation DataFrames to indicate what data is
            # part of the validation set
            if self.is_valid_attr:
                valid_df[self.is_valid_attr] = True
                df[self.is_valid_attr] = False

            df = pd.concat([df, valid_df])

        return df

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """
        Returns `training_ds` as is, or, if `validation_ds` is given, the concatenation of both datasets
        (each tagged with the `is_valid_attr` column when it is set).
        """
        ds = training_ds

        # concatenate the validation dataset if it is included
        if validation_ds is not None:
            # add an "is_valid_attr" column to both training/validation datasets to indicate what data is
            # part of the validation set
            if self.is_valid_attr:
                validation_ds = validation_ds.add_column(self.is_valid_attr, [True] * len(validation_ds))
                training_ds = training_ds.add_column(self.is_valid_attr, [False] * len(training_ds))

            ds = concatenate_datasets([training_ds, validation_ds])

        return ds

    def _tokenize_function(self, example):
        """Tokenizes the text (and text pair, if `text_pair_attr` is set) found in `example` using `tok_kwargs`"""
        txts = example[self.text_attr]
        txt_pairs = example[self.text_pair_attr] if self.text_pair_attr else None

        return self.hf_tokenizer(txts, txt_pairs, **self.tok_kwargs)


### `ClassificationPreprocessor` -

In [None]:
#|export
class ClassificationPreprocessor(Preprocessor):
    """
    Preprocesses DataFrames / Hugging Face Datasets for sequence classification: normalizes multi-label
    targets, optionally adds label names, and records the span of each text that survives tokenization.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # Whether the dataset should be processed for multi-label; if True, will ensure `label_attrs` are
        # converted to a value of either 0 or 1 indicating the existence of the class in the example
        is_multilabel: bool = False,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the text
        text_attr: str = "text",
        # The attribute holding the text_pair
        text_pair_attr: Optional[str] = None,
        # The attribute holding the label(s) of the example
        label_attrs: Union[str, List[str]] = "label",
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # A list indicating the valid labels for the dataset (optional, defaults to the unique set of labels
        # found in the full dataset)
        label_mapping: Optional[List[str]] = None,
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # the character offsets are required to rebuild the (possibly truncated) text; merging into a
        # new dict leaves the caller's `tok_kwargs` untouched
        tok_kwargs = {**tok_kwargs, "return_offsets_mapping": True}
        super().__init__(hf_tokenizer, batch_size, text_attr, text_pair_attr, is_valid_attr, tok_kwargs)

        self.is_multilabel = is_multilabel
        self.id_attr = id_attr
        self.label_attrs = label_attrs
        self.label_mapping = label_mapping

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Returns a new DataFrame (index reset) holding the combined training/validation rows plus, for
        each text attribute, `proc_{attr}`, `{attr}_start_char_idx` and `{attr}_end_char_idx` columns.
        A `{label}_name` column is added per label attribute when `label_mapping` is set.
        """
        df = super().process_df(training_df, validation_df)

        # convert even single "labels" to a list to make things easier
        label_cols = listify(self.label_attrs)

        # if "is_multilabel", convert all targets to an int, 0 or 1, rounding floats if necessary
        if self.is_multilabel:
            for label_col in label_cols:
                df[label_col] = df[label_col].apply(lambda v: int(bool(max(0, round(v)))))

        # if a "label_mapping" is included, add a "[label_col]_name" field with the label Ids converted to their label names
        if self.label_mapping:
            for label_col in label_cols:
                df[f"{label_col}_name"] = df[label_col].apply(lambda v: self.label_mapping[v])

        # process df in mini-batches of `batch_size` rows (grouped by row position, not by index) and
        # stitch the results together once; `DataFrame.append` in a loop is deprecated and quadratic
        batch_ids = np.arange(len(df)) // self.batch_size
        processed_batches = [self._process_df_batch(batch_df) for _, batch_df in df.groupby(batch_ids)]

        # nothing to process (empty input): `pd.concat` refuses an empty list
        if not processed_batches:
            return pd.DataFrame()

        return pd.concat(processed_batches, ignore_index=True)

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """Same as `process_df`, but takes and returns Hugging Face `Dataset` objects"""
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_df_batch(self, batch_df):
        """
        Tokenizes one mini-batch and adds, for each text attribute, the character span covered by its
        tokens along with the matching `proc_{attr}` text.
        """
        # positional index so the frame lines up with the per-example tokenizer outputs below
        batch_df = batch_df.reset_index(drop=True)

        # grab our inputs
        inputs = self._tokenize_function(batch_df.to_dict(orient="list"))

        for txt_seq_idx, txt_attr in enumerate([self.text_attr, self.text_pair_attr]):
            if txt_attr is None:
                break

            start_col, end_col = f"{txt_attr}_start_char_idx", f"{txt_attr}_end_char_idx"

            char_idxs = []
            for idx, offset_mapping in enumerate(inputs["offset_mapping"]):
                # keep only the offsets of tokens belonging to this sequence (special tokens are excluded)
                text_offsets = [offset_mapping[i] for i, seq_id in enumerate(inputs.sequence_ids(idx)) if seq_id == txt_seq_idx]

                if text_offsets:
                    char_idxs.append([min(text_offsets)[0], max(text_offsets)[1]])
                else:
                    # no tokens for this text (e.g., an empty string) => an empty span
                    char_idxs.append([0, 0])

            batch_df = pd.concat([batch_df, pd.DataFrame(char_idxs, columns=[start_col, end_col])], axis=1)

            # offsets are (start, end) with `end` exclusive, so a plain slice yields exactly the
            # characters covered by the tokens (slicing to `end + 1` leaked one extra character)
            batch_df.insert(
                0,
                f"proc_{txt_attr}",
                batch_df.apply(lambda r: r[txt_attr][r[start_col] : r[end_col]], axis=1),
            )

        return batch_df


Starting with version 2.0, `BLURR` provides a sequence classification preprocessing class that can be used to preprocess DataFrames or Hugging Face Datasets.

This class can be used for preprocessing both multiclass and multilabel classification datasets. It adds a `proc_{your_text_attr}` attribute and, optionally, a `proc_{your_text_pair_attr}` attribute containing your text as modified by tokenization (e.g., if you specify a `max_length`, `proc_{your_text_attr}` may contain truncated text).

**Note**: This class works for both slow and fast tokenizers

#### Using a `DataFrame`

In [None]:
# `max_length` is kept small so the effect of truncation on `proc_text` is easy to see
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels, tok_kwargs={"max_length": 24})
proc_df = preprocessor.process_df(imdb_df)

# preprocessing adds columns but must never add or drop examples
test_eq(len(proc_df), len(imdb_df))
proc_df.head(2)


  final_df = final_df.append(self._process_df_batch(batch_df))
  final_df = final_df.append(self._process_df_batch(batch_df))


Unnamed: 0,proc_text,text,label,is_valid,label_name,text_start_char_idx,text_end_char_idx
0,"I guess when ""Beat Street"" made a national appearance, ""Flashdance"" came at the same time.","I guess when ""Beat Street"" made a national appearance, ""Flashdance"" came at the same time. The problem with ""Flashdance"" is that there was only one break dancing scene and the rest was jazz dance and ballet. That was one of the reasons why ""Beat Street"" was better. The only movie that could rival ""Beat Street"" seems to be ""Footloose"", because both movies focused on how dance had been used by people to express their utmost feelings.<br /><br />The break-dance scenes in ""Beat Street"" come just before the middle and at the end of the flick. And I loved all of them. Almost all of the break tri...",1,False,pos,0,89
1,I was lucky enough to watch this without any pre viewing hype. I was surprised at the resilience of the ghost',"I was lucky enough to watch this without any pre viewing hype. I was surprised at the resilience of the ghost's image in my mind the next day, and the day after that. I've watched it 3-4 times, and each time I appreciate it even more. The settings are gorgeous, the town at dusk has beautiful lighting effects, the marsh long shots, and the house itself is sufficiently grown with moss. The main hero is so likable and good natured, that he is easily sympathized with. To the person who complained that there wasn't enough 'spark' in this film, I'd say that it's because the whole fight against t...",1,False,pos,0,109


#### Using a Hugging Face `Dataset`

In [None]:
# Same preprocessing, this time starting from (and returning) a Hugging Face `Dataset`
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
proc_ds


  final_df = final_df.append(self._process_df_batch(batch_df))
  final_df = final_df.append(self._process_df_batch(batch_df))


Dataset({
    features: ['proc_text', 'text', 'label', 'is_valid', 'label_name', 'text_start_char_idx', 'text_end_char_idx'],
    num_rows: 1200
})

## Mid-level API

Base tokenization, batch transform, and DataBlock methods

### `TextInput` -

In [None]:
#|export
class TextInput(TensorBase):
    """The base representation of your inputs; a `TensorBase` subclass the various fastai `show` methods can dispatch on"""


A `TextInput` object is returned from the `decodes` method of `BatchDecodeTransform` as a means to customize `@typedispatch`ed functions like `DataLoaders.show_batch` and `Learner.show_results`. The value will be your `input_ids`.

### `BatchTokenizeTransform` -

In [None]:
#|export
class BatchTokenizeTransform(Transform):
    """
    Tokenizes and collates a list of raw samples into model-ready inputs at batch time, optionally
    placing each sample's first target into its inputs dictionary under the "labels" key.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc.)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `BatchTokenizeTransform`
        **kwargs
    ):
        # the named arguments are stored on `self` via fastcore's `store_attr`; the extras are kept as given
        store_attr()
        self.kwargs = kwargs

    def encodes(self, samples, return_batch_encoding=False):
        """
        Tokenizes the raw inputs of `samples` as one mini-batch (on-the-fly, so no pre-tokenization of the
        full dataset is needed) and returns one `(inputs_dict, *targets)` tuple per sample. With
        `return_batch_encoding=True` the tokenizer's batch output is returned as a second value.
        """
        batch = L(samples)

        # inspect the first sample to learn what kind of inputs we were handed
        first_input = batch[0][0]
        has_dict_inputs = isinstance(first_input, dict)
        probe_text = first_input["text"] if has_dict_inputs else first_input
        # a list-like text that isn't pre-split words is treated as a (text, text_pair) couple
        has_text_pairs = is_listy(probe_text) and not self.is_split_into_words

        if has_text_pairs and has_dict_inputs:
            texts = [(item["text"][0], item["text"][1]) for item in batch.itemgot(0).items]
        elif has_text_pairs:
            texts = list(zip(batch.itemgot(0, 0), batch.itemgot(0, 1)))
        elif has_dict_inputs:
            texts = [item["text"] for item in batch.itemgot(0).items]
        else:
            texts = batch.itemgot(0).items

        encoding = self.hf_tokenizer(
            texts,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
            **self.tok_kwargs
        )

        # rebuild each sample as (tokenized inputs [+ extra dict entries] [+ labels], *targets)
        collated = []
        for idx, sample in enumerate(batch):
            model_inputs = {key: encoding[key][idx] for key in encoding.keys()}

            if has_dict_inputs:
                # carry over everything the input dictionary held besides the raw text
                extras = {k: v for k, v in sample[0].items() if k not in ["text"]}
                model_inputs = {**model_inputs, **extras}

            targets = sample[1:]
            if self.include_labels and len(targets) > 0:
                model_inputs["labels"] = targets[0]

            collated.append((model_inputs, *targets))

        return (collated, encoding) if return_batch_encoding else collated


Inspired by this [article](https://docs.fast.ai/tutorial.transformers.html), `BatchTokenizeTransform` inputs can come in as raw **text**, **a list of words** (e.g., tasks like Named Entity Recognition (NER), where you want to predict the label of each token), or as a **dictionary** that includes extra information you want to use during post-processing.

**On-the-fly Batch-Time Tokenization**: 

Part of the inspiration for this derives from the mechanics of Hugging Face tokenizers, in particular it can return a collated mini-batch of data given a list of sequences. As such, the collating required for our inputs can be done during tokenization ***before*** our batch transforms run in a `before_batch_tfms` transform (where we get a list of examples)! This allows users of BLURR to have everything done dynamically at batch-time without prior preprocessing with at least four potential benefits:
1. Less code
2. Faster mini-batch creation
3. Less RAM utilization and time spent tokenizing beforehand (this really helps with very large datasets)
4. Flexibility

### `BatchDecodeTransform` -

In [None]:
#|export
class BatchDecodeTransform(Transform):
    """Casts the `input_ids` of your inputs to `input_return_type` so fastai's `show` methods can work with them"""

    def __init__(
        self,
        # Used by typedispatched show methods
        input_return_type: Type = TextInput,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_arch: Optional[str] = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_config: Optional[PretrainedConfig] = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_tokenizer: Optional[PreTrainedTokenizerBase] = None,
        # A Hugging Face model (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_model: Optional[PreTrainedModel] = None,
        # Any other keyword arguments
        **kwargs
    ):
        # the named arguments are stored on `self` via fastcore's `store_attr`; the extras are kept as given
        store_attr()
        self.kwargs = kwargs

    def decodes(self, items: dict):
        """Wraps the "input_ids" entry of `items` in `input_return_type` for the show related fastai methods"""
        input_ids = items["input_ids"]
        return self.input_return_type(input_ids)


As of fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here, `BatchDecodeTransform`, (one that occurs "after" the batch has been created) that will do the decoding for us.

### `TextBlock` -

In [None]:
#|export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """This method is used by the `SortedDL` to ensure your dataset is sorted *after* tokenization"""
    # the input is the first element of the example; dictionary inputs keep their text under "text"
    first_item = example[0]
    text = first_item["text"] if isinstance(first_item, dict) else first_item

    # pre-split inputs are measured by their number of words ...
    if is_split_into_words:
        return len(text)

    # ... everything else by the number of tokens the tokenizer produces
    return len(hf_tokenizer.tokenize(text, **tok_kwargs))


In [None]:
show_doc(blurr_sort_func, title_level=3)


---

#### blurr_sort_func

>      blurr_sort_func (example, hf_tokenizer:transformers.tokenization_utils_ba
>                       se.PreTrainedTokenizerBase,
>                       is_split_into_words:bool=False, tok_kwargs:dict={})

This method is used by the `SortedDL` to ensure your dataset is sorted *after* tokenization

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| example |  |  |  |
| hf_tokenizer | PreTrainedTokenizerBase |  | A Hugging Face tokenizer |
| is_split_into_words | bool | False | The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
if your inputs are pre-tokenized (not numericalized) |
| tok_kwargs | dict | {} | Any other keyword arguments you want to include during tokenization |

In [None]:
#|export
class TextBlock(TransformBlock):
    """The core `TransformBlock` that wires Blurr's batch-time tokenization and decoding into fastai's `DataBlock` API"""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_arch: Optional[str] = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_config: Optional[PretrainedConfig] = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_tokenizer: Optional[PreTrainedTokenizerBase] = None,
        # A Hugging Face model (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_model: Optional[PreTrainedModel] = None,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # The before_batch_tfm you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `BatchTokenizeTransform`)
        batch_tokenize_tfm: Optional[BatchTokenizeTransform] = None,
        # The batch_tfm you want to decode your inputs into a type that can be used in the fastai show methods,
        # (defaults to BatchDecodeTransform)
        batch_decode_tfm: Optional[BatchDecodeTransform] = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: Optional[int] = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = TextInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: Optional[DataLoader] = None,
        # Any keyword arguments you want applied to your `batch_tokenize_tfm`
        batch_tokenize_kwargs: dict = {},
        # Any keyword arguments you want applied to your `batch_decode_tfm` (will be set as a fastai `batch_tfms`)
        batch_decode_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied when generating text
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `TextBlock`
        **kwargs
    ):
        # without a ready-made tokenization transform, one is built from the four Hugging Face objects,
        # all of which are then required
        if batch_tokenize_tfm is None:
            if not all([hf_arch, hf_config, hf_tokenizer, hf_model]):
                raise ValueError("You must supply an hf_arch, hf_config, hf_tokenizer, hf_model -or- a BatchTokenizeTransform")

            # the kwargs dictionaries are copied so the transform never shares them with the caller
            batch_tokenize_tfm = BatchTokenizeTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                include_labels=include_labels,
                ignore_token_id=ignore_token_id,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **batch_tokenize_kwargs.copy()
            )

        if batch_decode_tfm is None:
            batch_decode_tfm = BatchDecodeTransform(input_return_type=input_return_type, **batch_decode_kwargs.copy())

        # default to a `SortedDL` that sorts with `blurr_sort_func`, configured from whichever
        # tokenization transform ended up being used
        if dl_type is None:
            sort_func = partial(
                blurr_sort_func,
                hf_tokenizer=batch_tokenize_tfm.hf_tokenizer,
                is_split_into_words=batch_tokenize_tfm.is_split_into_words,
                tok_kwargs=batch_tokenize_tfm.tok_kwargs.copy(),
            )
            dl_type = partial(SortedDL, sort_func=sort_func)

        # tokenization runs as a "before_batch" transform, decoding as a regular batch transform
        super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": batch_tokenize_tfm}, batch_tfms=batch_decode_tfm)


A basic `DataBlock` for our inputs, `TextBlock` is designed with sensible defaults to minimize user effort in defining their transforms pipeline. It handles setting up your `BatchTokenizeTransform` and `BatchDecodeTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). 

**Note**: You must either pass in your own instance of a `BatchTokenizeTransform` class or the Hugging Face objects returned from `BLURR.get_hf_objects` (e.g., architecture, config, tokenizer, and model). The other args are optional.

We also include a `blurr_sort_func` that works with `SortedDL` to properly sort based on the number of tokens in each example.

## Utility classes and methods 

These methods are used internally for getting the Blurr transforms associated with your `DataLoaders`.

In [None]:
#|export
def get_blurr_tfm(
    # The transforms to search (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform class to look for
    tfm_class: Transform = BatchTokenizeTransform,
):
    """
    Return the first transform in `tfms_list` whose type is `tfm_class` (or a subclass of it),
    or `None` when `tfms_list` contains no such transform
    """
    for candidate in tfms_list:
        # compare on the concrete type so subclasses of `tfm_class` match as well
        if issubclass(type(candidate), tfm_class):
            return candidate

    return None


In [None]:
show_doc(get_blurr_tfm, title_level=3)


---

#### get_blurr_tfm

>      get_blurr_tfm (tfms_list:fastcore.transform.Pipeline, tfm_class:fastcore.
>                     transform.Transform=<class'__main__.BatchTokenizeTransform
>                     '>)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| tfms_list | Pipeline |  | A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...) |
| tfm_class | Transform | BatchTokenizeTransform | The transform to find |

In [None]:
#|export
def first_blurr_tfm(
    # Your fast.ai `DataLoaders`
    dls: DataLoaders,
    # The Blurr transform classes to look for, in priority order
    tfms: List[Transform] = [BatchTokenizeTransform, BatchDecodeTransform],
):
    """
    Return the first Blurr transform found on `dls`, or `None` if there is none. The classes in `tfms`
    are tried in order and, for each class, `dls.before_batch` is searched before `dls.after_batch`.
    Methods such as `show_batch` use the returned transform to reach the Hugging Face tokenizer
    needed to decode and 'show' inputs/targets
    """
    for wanted_cls in tfms:
        for pipeline_name in ("before_batch", "after_batch"):
            match = get_blurr_tfm(getattr(dls, pipeline_name), tfm_class=wanted_cls)
            if match:
                return match

    return None


In [None]:
show_doc(first_blurr_tfm, title_level=3)


---

#### first_blurr_tfm

>      first_blurr_tfm (dls:fastai.data.core.DataLoaders, tfms:List[fastcore.tra
>                       nsform.Transform]=[<class'__main__.BatchTokenizeTransfor
>                       m'>,<class'__main__.BatchDecodeTransform'>])

This convenience method will find the first Blurr transform required for methods such as

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| dls | DataLoaders |  | Your fast.ai `DataLoaders |
| tfms | typing.List[fastcore.transform.Transform] | [<class '__main__.BatchTokenizeTransform'>, <class '__main__.BatchDecodeTransform'>] | The Blurr transforms to look for in order |

### `show_batch` -

In [None]:
#|export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `TextInput` typed inputs
    x: TextInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. Required to reach the Blurr transform holding the Hugging Face tokenizer
    # that decodes the inputs into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """
    Display up to `max_n` examples as a DataFrame with a `text` column (the decoded `input_ids`, cut to
    `trunc_at` characters) followed by one column per target. Returns `ctxs` unchanged.
    """
    # the Blurr transform attached to the dataloaders carries the tokenizer ...
    blurr_tfm = first_blurr_tfm(dataloaders)
    tokenizer = blurr_tfm.hf_tokenizer

    # ... and, when a `labels` list was included in its kwargs, the names used to display the target(s)
    if "labels" in blurr_tfm.kwargs:
        label_names = blurr_tfm.kwargs["labels"]
    else:
        label_names = None

    rows = L()
    n_inp = dataloaders.n_inp

    for row_idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if row_idx >= max_n:
            break

        row = [tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]]

        # everything after the first `n_inp` elements of a sample is treated as a target
        # NOTE(review): tensor targets are read from `label` (the matching entry of `y`) rather than from
        # `item`, so with several tensor targets each column would show the same value - confirm intended
        for item in sample[n_inp:]:
            if torch.is_tensor(item):
                if is_listy(item.tolist()):
                    # list-valued target: show the names at the positions whose value equals 1
                    if label_names:
                        target = [label_names[pos] for pos, flag in enumerate(label.numpy().tolist()) if flag == 1]
                    else:
                        target = label.numpy()
                elif label_names:
                    target = label_names[label.item()]
                else:
                    target = label.item()
            elif label_names:
                target = label_names[int(item)]
            else:
                target = item

            row.append(target)
        rows.append(tuplify(row))

    # first target column is "target", any further ones are "target_1", "target_2", ...
    n_targets = len(rows[0]) - n_inp
    target_cols = [f"target_{i}" if i else "target" for i in range(n_targets)]
    display_df(pd.DataFrame(rows, columns=["text"] + target_cols)[:max_n])
    return ctxs


## Mid-level Examples

The following examples demonstrate several approaches to constructing your `DataBlock` for sequence classification tasks using the mid-level API.

### Batch-Time Tokenization

#### Step 1: Get your Hugging Face objects.

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `NLP`.

In [None]:
#| output: false
from transformers import AutoModelForSequenceClassification

model_cls = AutoModelForSequenceClassification

pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(pretrained_model_name, model_cls=model_cls)


loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuratio

####  Step 2: Create your `DataBlock`

In [None]:
# Inputs are handled by a `TextBlock`, targets by a `CategoryBlock`. The `labels` given in
# `batch_tokenize_kwargs` are forwarded to the `BatchTokenizeTransform` that `TextBlock` creates;
# `show_batch` looks for a "labels" entry in that transform's kwargs to display names instead of indices.
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)
# Inputs are read from the "text" column and targets from the "label" column.
# `ColSplitter()` is used with its defaults - presumably splitting on the `is_valid` column; confirm.
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


#### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


Token indices sequence length is longer than the specified maximum sequence length for this model (643 > 512). Running this sequence through the model will result in indexing errors


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

In [None]:
b[0]


{'input_ids': tensor([[    0,  8487,     6,  ...,     6,    38,     2],
         [    0, 25143,  3082,  ...,    30,  1599,     2],
         [    0,    38,  1548,  ...,  1206,   324,     2],
         [    0,    38,  1395,  ...,    54, 13119,     2]], device='cuda:1'),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]], device='cuda:1'),
 'labels': TensorCategory([0, 0, 1, 0], device='cuda:1')}

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Okay, so I'm not a big video game buff, but was the game House of the Dead really famous enough to make a movie from? Sure, they went as far as to actually put in quick video game clips throughout the movie, as though justifying any particular scene of violence, but there are dozens and dozens of games that look exactly the same, with the hand in the bottom on the screen, supposedly your own, holding whatever weapon and goo-ing all kinds of aliens or walking dead or snipers or whatever the case",neg
1,"MYRA BRECKINRIDGE is one of those rare films that established its place in film history immediately. Praise for the film was absolutely nonexistent, even from the people involved in making it. This film was loathed from day one. While every now and then one will come across some maverick who will praise the film on philosophical grounds (aggressive feminism or the courage to tackle the issue of transgenderism), the film has not developed a cult following like some notorious flops do. It's not h",neg


### Using a preprocessed dataset

Preprocessing your raw data is the more traditional approach to using Transformers. It is required, for example, when you want to work with documents longer than your model will allow. A preprocessed dataset is used in the same way a non-preprocessed dataset is.

#### Step 1a: Get your Hugging Face objects.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(pretrained_model_name, model_cls=model_cls)


loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuratio

#### Step 1b. Preprocess dataset

In [None]:
# Preprocess the raw Hugging Face dataset up front with the tokenizer loaded above; `labels` is passed
# as the preprocessor's `label_mapping`.
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
# Display the processed dataset (its columns include `proc_text`, used as the model input below)
proc_ds


  final_df = final_df.append(self._process_df_batch(batch_df))
  final_df = final_df.append(self._process_df_batch(batch_df))


Dataset({
    features: ['proc_text', 'text', 'label', 'is_valid', 'label_name', 'text_start_char_idx', 'text_end_char_idx'],
    num_rows: 1200
})

#### Step 2: Create your `DataBlock`

In [None]:
# Same blocks as in the batch-time tokenization example; `labels` is forwarded to the tokenize transform
# so that `show_batch` can display label names.
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)
# Items are read by key here: "proc_text" (the preprocessed text) for the inputs and "label" for the
# targets; a `RandomSplitter()` with default settings creates the train/validation split.
dblock = DataBlock(blocks=blocks, get_x=ItemGetter("proc_text"), get_y=ItemGetter("label"), splitter=RandomSplitter())


#### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_ds, bs=4)


In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"The 1930s saw a vogue for documentary films about remote corners of the world, with an emphasis on wild animals, exotic terrain and primitive people with unusual cultures. Despite the logistics of transporting a film crew to a distant and dangerous place, and then bringing 'em back alive (with the film footage), such films were often much cheaper to make than were conventional Hollywood features... because there were no expensive sets, costumes, or high-priced movie stars.<br /><br />The most s",neg
1,"I revisited Grand Canyon earlier this year when I set out to devise a ten best list of the 1990's. I first saw the film when I was 17 years old. How did I hear about it? It was reviewed, and recommended highly, by Siskel & Ebert in 1991, and I eventually caught it on video a year later.<br /><br />It's a great film, a powerful film, a healing film, about the power of listening, truly listening to one another. I've seen it six times now, and it entertains and inspires me with every subsequent vi",pos


### Passing extra information

As of v.2, `BLURR` also allows you to pass extra information alongside your inputs in the form of a dictionary.  If you use this approach, you must assign your text(s) to the `text` key of the dictionary.  This is a useful approach when splitting long documents into chunks, but wanting to score/predict by example rather than chunk (for example in extractive question answering tasks).

**Note**: A good place to access this extra information during training/validation is in the `before_batch` method of a `Callback`.

In [None]:
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)


def get_x(item):
    """Build the input dictionary for one row: its text under the `text` key plus an extra value passed alongside it."""
    return dict(text=item.text, another_val="testing123")


dblock = DataBlock(blocks=blocks, get_x=get_x, get_y=ColReader("label"), splitter=ColSplitter())


In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


Token indices sequence length is longer than the specified maximum sequence length for this model (643 > 512). Running this sequence through the model will result in indexing errors


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Okay, so I'm not a big video game buff, but was the game House of the Dead really famous enough to make a movie from? Sure, they went as far as to actually put in quick video game clips throughout the movie, as though justifying any particular scene of violence, but there are dozens and dozens of games that look exactly the same, with the hand in the bottom on the screen, supposedly your own, holding whatever weapon and goo-ing all kinds of aliens or walking dead or snipers or whatever the case",neg
1,"I approach films about talking animals with care. For every wonderful one like Babe, you get an equally poor one like the dreadful remake of Homeward Bound: The Incredible Journey. Or in the case of Cats & Dogs, you have a great idea for a film not living up to its potential. When I heard about Paulie, the premise of a wisecracking parrot didn't exactly fill me with confidence. But I found the film a pleasant surprise. And it manages to sneak its way into your heart without you realising.<br />",pos


## Low-level API

For working with PyTorch and/or fast.ai Datasets & DataLoaders, the low-level API allows you to get back fast.ai specific features such as `show_batch`, `show_results`, etc... when using plain ol' PyTorch Datasets, Hugging Face Datasets, etc...

### `TextBatchCreator` -

In [None]:
#|export
@dataclass
class TextBatchCreator:
    """
    A class that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to create batches that can be used in the Blurr library
    """

    # NOTE: `@dataclass` is kept for backward compatibility; the class declares no fields and
    # supplies its own `__init__`.

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        store_attr()
        # `store_attr` has already stored the raw argument; replace it with the default collator when none was given
        self.data_collator = data_collator if (data_collator) else DataCollatorWithPadding(tokenizer=hf_tokenizer)

    def __call__(self, features):
        """Collate `features` (a list of examples) with `self.data_collator`.

        Returns:
            * `(inputs, labels)` when the examples are dictionaries containing a `labels` key (as is the
              case for most Hugging Face datasets), where `inputs` is the collated batch as a plain `dict`
            * `inputs` alone when the examples are dictionaries without `labels`
            * the collator's output unchanged when the examples are not dictionaries
        """
        batch = self.data_collator(features)
        if isinstance(features[0], dict):
            inputs = dict(batch)
            # The branches are spelled out on purpose: written as `return a, b if cond else c`, the
            # conditional binds only to the second tuple element, so label-less batches used to come
            # back as `(inputs, inputs)` instead of `inputs`.
            if "labels" in features[0]:
                return inputs, batch["labels"]
            return inputs

        return batch


### `TextDataLoader` -

In [None]:
#|export
@delegates()
class TextDataLoader(TfmdDL):
    """
    A `TfmdDL` for Blurr's low-level API. Batches are collated by a `TextBatchCreator` (installed as
    `create_batch`) and a `BatchDecodeTransform` is installed as `after_batch`.
    From the fastai docs, a `TfmdDL` is "a DataLoader that creates Pipeline from a list of Transforms
    for the callbacks `after_item`, `before_batch` and `after_batch`. As a result, it can decode or
    show a processed batch."
    """

    def __init__(
        self,
        # A standard PyTorch Dataset or fast.ai `Datasets`
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture; passed to the default
        # `TextBatchCreator` and `BatchDecodeTransform`
        hf_arch: str,
        # A Hugging Face configuration object; passed to the default `TextBatchCreator` and
        # `BatchDecodeTransform`
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer; passed to `preproccesing_func` and to the default `TextBatchCreator`
        # and `BatchDecodeTransform`
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model; passed to `preproccesing_func` and to the default `TextBatchCreator`
        # and `BatchDecodeTransform`
        hf_model: PreTrainedModel,
        # An instance of `TextBatchCreator` or equivalent (defaults to a new `TextBatchCreator`)
        batch_creator: Optional[TextBatchCreator] = None,
        # The batch_tfm used to decode Blurr batches (defaults to a new `BatchDecodeTransform`)
        batch_decode_tfm: Optional[BatchDecodeTransform] = None,
        # Used by typedispatched show methods
        input_return_type: Type = TextInput,
        # (optional) A preprocessing function that will be applied to your dataset; it is called as
        # `preproccesing_func(dataset, hf_tokenizer, hf_model)` and must return the dataset to use
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # Keyword arguments for the default `BatchDecodeTransform` (unused when `batch_decode_tfm` is given)
        batch_decode_kwargs: dict = {},
        # Keyword arguments forwarded to `TfmdDL`
        **kwargs,
    ):
        # run the caller's preprocessing over the dataset first, if any was supplied
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch` and `after_batch` are always passed explicitly to the parent below, so any
        # values arriving through `kwargs` are discarded to avoid supplying them twice
        for reserved_name in ("create_batch", "after_batch"):
            kwargs.pop(reserved_name, None)

        # what happens when a batch is created (this is where collation happens)
        if not batch_creator:
            batch_creator = TextBatchCreator(hf_arch, hf_config, hf_tokenizer, hf_model)

        # the transform applied after the batch is created (used by the show methods); the kwargs are
        # copied so the (shared, mutable) default dictionary is never handed out directly
        if not batch_decode_tfm:
            decode_kwargs = batch_decode_kwargs.copy()
            batch_decode_tfm = BatchDecodeTransform(
                input_return_type, hf_arch, hf_config, hf_tokenizer, hf_model, **decode_kwargs
            )

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_decode_tfm, **kwargs)
        store_attr(names="hf_arch, hf_config, hf_tokenizer, hf_model")

    def new(
        self,
        # A standard PyTorch and fastai dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be "self" if None)
        cls: Type = None,
        # Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs,
    ):
        """
        Factory override (used, for example, by `show_results`) that re-supplies the Hugging Face objects
        stored on this instance, since `__init__` requires them; everything else is delegated to the
        parent's `new`.

        NOTE(review): `__init__` discards any `create_batch`/`after_batch` it receives through kwargs, so a
        loader made here appears to get a default `TextBatchCreator`/`BatchDecodeTransform` rather than the
        ones configured on this instance - confirm that is intended.
        """
        # always overrides any same-named entries the caller passed in
        kwargs.update(
            hf_arch=self.hf_arch,
            hf_config=self.hf_config,
            hf_tokenizer=self.hf_tokenizer,
            hf_model=self.hf_model,
        )

        return super().new(dataset, cls, **kwargs)


## Low-level Examples

The following example demonstrates how to use the low-level API with standard PyTorch/Hugging Face/fast.ai Datasets and DataLoaders.

### Step 1: Build your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Reusing dataset glue (/home/dev/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` values of `example` together with the notebook's `hf_tokenizer` (`truncation=True`)."""
    first, second = example["sentence1"], example["sentence2"]
    return hf_tokenizer(first, second, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


Loading cached processed dataset at /home/dev/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-6bcdb1a644e8c684.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/dev/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-86ae103db8d3e040.arrow


### Step 2: Dataset pre-processing (optional)

In [None]:
#|export
def preproc_hf_dataset(
    # A standard PyTorch Dataset or fast.ai Datasets
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer (not used here; part of the `preproccesing_func` call signature)
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model; the argument names of its `forward` method decide which columns are kept
    hf_model: PreTrainedModel,
):
    """This method can be used to preprocess most Hugging Face Datasets for use in Blurr and other training
    libraries: a `label` column is renamed to `labels`, every column that is not an argument of
    `hf_model.forward` is removed, and the format is set to "torch". Returns the resulting dataset.
    """
    has_label_col = "label" in dataset.column_names
    if has_label_col:
        dataset = dataset.rename_column("label", "labels")

    # a column survives only if the model's forward method has a parameter of the same name
    forward_params = inspect.signature(hf_model.forward).parameters
    unused_cols = {col for col in dataset.column_names if col not in forward_params}
    dataset = dataset.remove_columns(unused_cols)

    dataset.set_format("torch")
    return dataset


In [None]:
show_doc(preproc_hf_dataset, title_level=4)


---

#### preproc_hf_dataset

>      preproc_hf_dataset (dataset:Union[torch.utils.data.dataset.Dataset,fastai
>                          .data.core.Datasets], hf_tokenizer:transformers.token
>                          ization_utils_base.PreTrainedTokenizerBase,
>                          hf_model:transformers.modeling_utils.PreTrainedModel)

This method can be used to preprocess most Hugging Face Datasets for use in Blurr and other training

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| dataset | typing.Union[torch.utils.data.dataset.Dataset, fastai.data.core.Datasets] | A standard PyTorch Dataset or fast.ai Datasets |
| hf_tokenizer | PreTrainedTokenizerBase | A Hugging Face tokenizer |
| hf_model | PreTrainedModel | A Hugging Face model |

### Step 3: Build your `DataLoaders`.

Use `TextDataLoader` to build Blurr-friendly dataloaders from your datasets. Passing `{'labels': label_names}` as your `batch_decode_kwargs` will ensure that your label/target names are displayed in methods like `show_batch` and `show_results` (just as it works with the mid-level API).

In [None]:
# The class names of the raw dataset's `label` feature; handed to the decode transform below so the
# show methods can display names rather than indices
label_names = raw_datasets["train"].features["label"].names

# Training dataloader: the tokenized split is first run through `preproc_hf_dataset`, then batched
# (shuffled, 8 examples per batch)
trn_dl = TextDataLoader(
    tokenized_datasets["train"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_decode_kwargs={"labels": label_names},
    shuffle=True,
    batch_size=8,
)

# Validation dataloader: same setup without `shuffle=True` and with 16 examples per batch
val_dl = TextDataLoader(
    tokenized_datasets["validation"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_decode_kwargs={"labels": label_names},
    batch_size=16,
)

# Wrap both loaders in a fastai `DataLoaders`
dls = DataLoaders(trn_dl, val_dl)


In [None]:
b = dls.one_batch()
b[0]["input_ids"].shape


torch.Size([8, 68])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=800)


Unnamed: 0,text,target
0,DeVries did make one stop in town Wednesday - registering as a sex offender at the Soledad Police Department. Convicted child molester Brian DeVries spoke out in response to community outrage Wednesday afternoon after registering as a sex offender at the Soledad Police Department.,not_equivalent
1,"Already suffering with the nation's worst credit rating, the state is operating for the first time completely on borrowed money. The Davis administration says the state is operating for the first time completely on borrowed money.",not_equivalent


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
#|hide
[model_type for model_type in NLP.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'BloomForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'Data2VecAudioForSequenceClassification',
 'Data2VecTextForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 

In [None]:
#|hide
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-random-bart",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "google/bigbird-pegasus-large-arxiv",
    "hf-internal-testing/tiny-random-ctrl",
    "camembert-base",
    "hf-internal-testing/tiny-random-canine",
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "hf-internal-testing/tiny-random-deberta-v2",
    "hf-internal-testing/tiny-random-distilbert",
    "hf-internal-testing/tiny-electra",
    "google/fnet-base",
    "hf-internal-testing/tiny-random-flaubert",
    "hf-internal-testing/tiny-random-funnel",
    "hf-internal-testing/tiny-random-gpt2",
    "anton-l/gpt-j-tiny-random",
    "hf-internal-testing/tiny-random-gpt_neo",
    "kssteven/ibert-roberta-base",
    "hf-internal-testing/tiny-random-led",
    "hf-internal-testing/tiny-random-longformer",
    "hf-internal-testing/tiny-random-mbart",
    "hf-internal-testing/tiny-random-mpnet",
    # "nvidia/megatron-bert-cased-345m",                 could not test
    "hf-internal-testing/tiny-random-mobilebert",
    "openai-gpt",
    "google/reformer-crime-and-punishment",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    "hf-internal-testing/tiny-random-transfo-xl",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
#|hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
#|hide
# Load the IMDB train and test splits and tag each row with an `is_valid` flag
# (False for the train split, True for the test split)
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Keep a small subset: 1000 shuffled train rows plus 200 shuffled test rows, also exposed as a DataFrame
# NOTE(review): `shuffle()` is called without a seed, so the selected rows presumably differ between runs
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)


Reusing dataset imdb (/home/dev/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
#|hide
from transformers import RobertaTokenizer

In [None]:
#|hide
# Smoke-test `TextBlock` with the DataBlock API against every checkpoint in `pretrained_model_names`.
# Each checkpoint contributes exactly one row to `test_results`:
#   (architecture, tokenizer class name, checkpoint name, "PASSED" | "FAILED", error or "")
model_cls = AutoModelForSequenceClassification
bsz = 2  # batch size requested from the DataLoaders
seq_sz = 128  # every example is padded to this length (padding="max_length")

test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # the ibert checkpoint is loaded with an explicit RobertaTokenizer; all other checkpoints pass None
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    # NOTE: loading happens outside the `try` below, so an exception raised here propagates and stops the loop
    hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)
    tok_name = type(hf_tokenizer).__name__

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{tok_name}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        # read the id straight from the tokenizer instead of materializing the whole vocab dict
        hf_config.pad_token_id = hf_tokenizer.pad_token_id
        # the embedding matrix has to grow to cover the newly added token
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    try:
        blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        # a batch is an (inputs, targets) pair
        test_eq(len(b), 2)
        # inputs are keyed by name; `input_ids` must be exactly (bsz, seq_sz)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

        # record success only after every step (including `show_batch`) has completed, so a late
        # failure cannot leave both a PASSED and a FAILED row for the same checkpoint
        test_results.append((hf_arch, tok_name, model_name, "PASSED", ""))

    except Exception as err:
        test_results.append((hf_arch, tok_name, model_name, "FAILED", err))


loading configuration file https://huggingface.co/hf-internal-testing/tiny-albert/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/58e74e0b857853ff5f9ee7a575511635d23da669cfe61bcf3b672a36ea4ab454.9af053cb27b8f4424402f231c8e71b10e864ceed92a8b453a8177062fc021f84
Model config AlbertConfig {
  "_name_or_path": "hf-internal-testing/tiny-albert",
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 64,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 32,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 128,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 256,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 2,
  "num_hidden_groups": 1,
  "num_hidden_layers": 2,
  "num_memory_blocks": 0,
  "pad_token_id":

=== hf-internal-testing/tiny-albert ===



loading file https://huggingface.co/hf-internal-testing/tiny-albert/resolve/main/spiece.model from cache at /home/dev/.cache/huggingface/transformers/6e205c4ee29c198d999f17a26c33a64233c1c581bcf431b1127d01deb7510780.59ce0212608d6704aca57576f13aa361e91017641acf3e8c617a6e27eb8d2927
loading file https://huggingface.co/hf-internal-testing/tiny-albert/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/90d9ebedf8e582300430763b923aaee1c4777349092e3182d6e7ba05165e1db9.9df60c00984c1dc05cbb833ba48423b71a3639e3b10e2170225f8700b0f5a7c7
loading file https://huggingface.co/hf-internal-testing/tiny-albert/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-albert/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/e0f085d5ba6003dd56d84de7485216c332f557b7c0eb99b00ebeddbf6e91c106.15ed5b79b197b4fcc5f3f80b2ee89a5a3ad708dbd076575cd22cffd9e1a56284
loading file https://huggingf

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx(1995) is the developing world's answer to silence of the lambs. where silence' terrorized our peace of mind, citizen' exhausts and saddens us instead. this dramatization of the chikatilo case translates rather well, thanks to a westernized friendship between two rostov cops who become equals.br /br /citizenx may also argue against(!) the death penalty far better than",1
1,"chris rock deserves better than he gives himself in ""down to earth."" as directed by brothers chris & paul weitz of ""american pie"" fame, this uninspired remake of warren beatty's 1978 fantasy ""heaven can wait,"" itself a rehash of 1941's ""here comes mr. jordan,"" lacks the abrasively profane humor that won chris rock an emmy for his first hbo special. predictably, he spouts swear words from",0


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-bart/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/d05dbc265bc4edd13898dbd34979f1c8f8bdb813879d888b29613395918f0bd7.94f5f7396aae03ef1dfe169c751fe4cbc3e633b7c0da1447d12708954cc9260e
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
Model config BartConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-bart",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 16,
  "decoder_attention_heads": 4,
  "decoder_ffn_dim": 4,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 2,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 4,
  "encoder_ffn_dim": 4,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 2,
  "eos_token_id": 2,
  "forced_eos_token_id": 2

=== hf-internal-testing/tiny-random-bart ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-bart/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/2ae54909e15516ce24b748c6c697b88ef42930fb1e85714bf11ff092283d0863.c45ad9d7931b838f87a2793ae47dd5fd5edc1a6b0055b898d6e65c3c693ade29
loading file https://huggingface.co/hf-internal-testing/tiny-random-bart/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/e9728b60cbb5003f94bf08d3d0db74db5a7d288a621cdd2257e595711a3ca2cc.0f509d79aaf2540546fff94fa84c1f23aa8e983149cc0dfb587c82d00a3b2497
loading file https://huggingface.co/hf-internal-testing/tiny-random-bart/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/9388e9110cda2fcf2bbbe6a5e39976e14594c47d150b5bb5704c54b02de715f5.31a3adf1b4fbab82d0dce2f9fa890fad77a90174dc28bf62778ed5bdf884c5f4
loading file https://huggingface.co/hf-internal-testing/tiny-random-bart/resolve/main/added_tokens.json from cache at None
loading file https://hug

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become",1
1,"8 Simple Rules for Dating My Teenage Daughter had an auspicious start. The supremely-talented Tom Shadyac was involved in the project. This meant that the comedy would be nothing less of spectacular, and that's exactly what happened: the show remains one of the freshest, funniest, wittiest shows made in a very long time",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-bert/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/4ef0a64342c25365033ef9ffe134f930a8f9719be920a63634d932cfa28f7d25.a5a11219cf90aae61ff30e1658ccf2cb4aa84d6b6e947336556f887c9828dc6d
Model config BertConfig {
  "_name_or_path": "hf-internal-testing/tiny-bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Could not locate the tokenizer configuration file, will try to use the model config instead.


=== hf-internal-testing/tiny-bert ===



loading configuration file https://huggingface.co/hf-internal-testing/tiny-bert/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/4ef0a64342c25365033ef9ffe134f930a8f9719be920a63634d932cfa28f7d25.a5a11219cf90aae61ff30e1658ccf2cb4aa84d6b6e947336556f887c9828dc6d
Model config BertConfig {
  "_name_or_path": "hf-internal-testing/tiny-bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/hf-internal-testing/tiny-bert/resolve/main/vocab.txt from cache at /home/dev/.cache/hu

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and saddens us instead. this dramatization of the chikatilo case translates rather well, thanks to a westernized friendship between two rostov cops who become equals. < br / > < br / > citizenx may also argue against (! ) the death penalty far better than kevin spacey's the life of david gayle ( 2002 ). < br / > < br / > humans are machiavellian mammals, under",1
1,"8 simple rules for dating my teenage daughter had an auspicious start. the supremely - talented tom shadyac was involved in the project. this meant that the comedy would be nothing less of spectacular, and that's exactly what happened : the show remains one of the freshest, funniest, wittiest shows made in a very long time. every line, facial expression, casting choice, scene, all wreaked of perfection. there was not one episode after which i thought, "" man that wasn't as good as the rest "". each one was a standout. again, this is the",1


loading configuration file https://huggingface.co/google/bigbird-roberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/d7643b757353be56f05bdd19496d6e3fb5bb9edfdf5f9e5eca88d6f479e32324.dc98375bb3e19a644a5cadd5c305949ec470186fcc20bd8c8b959a43dcc3ff21
Model config BigBirdConfig {
  "_name_or_path": "google/bigbird-roberta-base",
  "architectures": [
    "BigBirdForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "r

=== google/bigbird-roberta-base ===



loading configuration file https://huggingface.co/google/bigbird-roberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/d7643b757353be56f05bdd19496d6e3fb5bb9edfdf5f9e5eca88d6f479e32324.dc98375bb3e19a644a5cadd5c305949ec470186fcc20bd8c8b959a43dcc3ff21
Model config BigBirdConfig {
  "_name_or_path": "google/bigbird-roberta-base",
  "architectures": [
    "BigBirdForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "r

architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />Humans are Machiavellian mammals, under which lie limbic brains",1
1,"8 Simple Rules for Dating My Teenage Daughter had an auspicious start. The supremely-talented Tom Shadyac was involved in the project. This meant that the comedy would be nothing less of spectacular, and that's exactly what happened: the show remains one of the freshest, funniest, wittiest shows made in a very long time. Every line, facial expression, casting choice, scene, all wreaked of perfection. There was not one episode after which I thought, ""Man that wasn't as good as the rest"". Each one was a standout. Again, this is the kind of perfectionism",1


loading configuration file https://huggingface.co/google/bigbird-pegasus-large-arxiv/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/0c5c2a21485ba0e75fd41928cbb901586887479c8fad3f3965b9bcae7632825b.c65855e5554b00a37b55e85d3a9f9dd66ca2c3f276ee79e8daea2165fe581bbf
Model config BigBirdPegasusConfig {
  "_name_or_path": "google/bigbird-pegasus-large-arxiv",
  "activation_dropout": 0.0,
  "activation_function": "gelu_new",
  "architectures": [
    "BigBirdPegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 2,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "gradient_checkpointing":

=== google/bigbird-pegasus-large-arxiv ===



loading configuration file https://huggingface.co/google/bigbird-pegasus-large-arxiv/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/0c5c2a21485ba0e75fd41928cbb901586887479c8fad3f3965b9bcae7632825b.c65855e5554b00a37b55e85d3a9f9dd66ca2c3f276ee79e8daea2165fe581bbf
Model config BigBirdPegasusConfig {
  "_name_or_path": "google/bigbird-pegasus-large-arxiv",
  "activation_dropout": 0.0,
  "activation_function": "gelu_new",
  "architectures": [
    "BigBirdPegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 2,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "gradient_checkpointing":

architecture:	bigbird_pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where Silence' terrorized our peace of mind, Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.br />br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).br />br />Humans are Machiavellian mammals, under which lie limbic brains (lizard-logic). Why did",1
1,"Within the realm of Science Fiction, two particular themes consistently elicit interest, were initially explored in the literature of a pre-cinematic era, and have since been periodically revisited by filmmakers and writers alike, with varying degrees of success. The first theme, that of time travel, has held an unwavering fascination for fans of film, as well as the written word, most recently on the screen with yet another version of the H.G. Wells classic, The Time Machine.' The second theme, which also manages to hold audiences in thrall, is that of invisibility, which sparks the imagination with it's seemingly endless",0


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-ctrl/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/5108b80520fd143e3b01355b64f8dbaa9f74545878c29b7dc535487b560be93e.5bc39104fccb1ee2915d4ff47bf59231f41d7da372b6e0f2e0a6d617d0115a9f
Model config CTRLConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-ctrl",
  "attn_pdrop": 0.1,
  "dff": 8192,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-06,
  "model_type": "ctrl",
  "n_ctx": 512,
  "n_embd": 32,
  "n_head": 4,
  "n_layer": 5,
  "n_positions": 512,
  "pad_token_id": 98,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 246534
}



=== hf-internal-testing/tiny-random-ctrl ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-ctrl/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/6396dd9d415009ddd4888d67b871335174453bdb7fcefb6a2e9b1801a9e53d40.9446203d3fdccf9052d8f3b374e389456bca56e99546a077c7793df9efd43b7c
loading file https://huggingface.co/hf-internal-testing/tiny-random-ctrl/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/2c3686335ac7f0cd2edee08ebdfe2f35e9c42fe7ca65ed6f895ef65fd3e92795.e4a0758785dd792eafcbabf01da7f1e7e83f088336f093f96c9fb57c0507a588
loading file https://huggingface.co/hf-internal-testing/tiny-random-ctrl/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-ctrl/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/a589cbcf0a32229eb0d4b767924a4c3bb2fc2a4e7c97f2bdd88bbb9230e606a4.3766b2880f7f2c1cf6aa2e79dec8b09b08c625bfeb6ae8cfaeaf2f1a1f3ee79d
loading file ht

architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />Humans are Machiavellian mammals, under which lie limbic brains (lizard-logic). Why did two kids, who knew better, stone to death a toddler",1
1,"The majority of Stephen King's short stories are little gems, with original ideas that don't take a long time to develop; basically lean and mean--he sets them up quickly in a scarce number of pages, you read 'em, and you're finished before you know you've begun. They're like the equivalent of a carton of McDonald's fries--they taste Really good and you know there's not much nutritional value in them (re: from a literary standpoint, they don't say much about the universal human condition), but you're still gonna scarf 'em down, just don't be a pig and go for the extra-super-sized portion and fill up on too much grease (""too much grease"" is a metaphor for the prose in",0


loading configuration file https://huggingface.co/camembert-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/f459e43c5ebb871abbf9209195563bff6a11547fd9532047739667c394833221.e23d229c54bcc6f67d337b8b2dd111b0e3dc01fa854bfecd3efdeb8c955749e6
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32005
}

Could not locate the 

=== camembert-base ===



loading configuration file https://huggingface.co/camembert-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/f459e43c5ebb871abbf9209195563bff6a11547fd9532047739667c394833221.e23d229c54bcc6f67d337b8b2dd111b0e3dc01fa854bfecd3efdeb8c955749e6
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32005
}

loading file https://

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where Silence' terrorized our peace of mind, Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also",1
1,"You have to respect this movie. It may be ""just a dumb kid's movie"" but it's the #1 most frequently requested film title in online movie forums, requested by people who remember the story but can't remember the title. Therefore what follows is a much-needed, detailed plot description, since I haven't been able to find such a description anywhere else on the Internet.<br /><br />A typical 2-story",0


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-canine/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/e2d3a979053c6d984551e0adf9608d704f05998c5714cbde9f9cf77235212dd9.80427d5b9704102be5692d2db6d762f592a571601f7ec30d24087aa0564927f8
Model config CanineConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-canine",
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 57344,
  "downsampling_rate": 4,
  "eos_token_id": 57345,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 32,
  "initializer_range": 0.02,
  "intermediate_size": 37,
  "layer_norm_eps": 1e-12,
  "local_transformer_stride": 128,
  "max_position_embeddings": 512,
  "model_type": "canine",
  "num_attention_heads": 4,
  "num_hash_buckets": 16384,
  "num_hash_functions": 8,
  "num_hidden_layers": 5,
  "pad_token_id": 0,
  "transformers_version": "4.20.1",
  "type_vocab_size": 16,
  "upsampling_kernel_size": 4,
  "use_cache": true,

=== hf-internal-testing/tiny-random-canine ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-canine/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-canine/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/7874bde67e82e1457ee0b9622c5c03ab41dee8dda2d7192cfbef8511ed8d4155.ab71f530366fe02e2834427e7b90198bfd0d573bc4279bfafdb2b95fe2b46dde
loading file https://huggingface.co/hf-internal-testing/tiny-random-canine/resolve/main/tokenizer_config.json from cache at /home/dev/.cache/huggingface/transformers/5e81923ebcc227b7d1081886dfde93c2b50118e9eb203e53eaecd3e90cb928b8.7ea4d1bb4eecbb67a9cb64fce7c6d3262759915c4f3dd93eef8d89df46d6656d
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
U

architecture:	canine
tokenizer:	CanineTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citize",1
1,"""It's like hard to like describe just how like exciting it is like to make a relationship like drama like with all the like po",0


loading configuration file https://huggingface.co/YituTech/conv-bert-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/7651fc6ae3906f28c62923bc7c76b0436327540c1ebb62a60b454ec79e102dd1.2a398d65585c12446cf5e632a1839e1754dc16cbbf6b87ccf28ba24c8536394e
Model config ConvBertConfig {
  "_name_or_path": "YituTech/conv-bert-base",
  "architectures": [
    "ConvBertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "conv_kernel_size": 9,
  "embedding_size": 768,
  "eos_token_id": 2,
  "head_ratio": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "convbert",
  "num_attention_heads": 12,
  "num_groups": 1,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

Could not

=== YituTech/conv-bert-base ===



loading configuration file https://huggingface.co/YituTech/conv-bert-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/7651fc6ae3906f28c62923bc7c76b0436327540c1ebb62a60b454ec79e102dd1.2a398d65585c12446cf5e632a1839e1754dc16cbbf6b87ccf28ba24c8536394e
Model config ConvBertConfig {
  "_name_or_path": "YituTech/conv-bert-base",
  "architectures": [
    "ConvBertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "conv_kernel_size": 9,
  "embedding_size": 768,
  "eos_token_id": 2,
  "head_ratio": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "convbert",
  "num_attention_heads": 12,
  "num_groups": 1,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading f

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and saddens us instead. this dramatization of the chikatilo case translates rather well, thanks to a westernized friendship between two rostov cops who become equals. < br / > < br / > citizenx may also argue against (! ) the death penalty far better than kevin spacey's the life of david gayle ( 2002 ). < br / > < br / > humans are machiavellian mammals, under",1
1,"you have to respect this movie. it may be "" just a dumb kid's movie "" but it's the # 1 most frequently requested film title in online movie forums, requested by people who remember the story but can't remember the title. therefore what follows is a much - needed, detailed plot description, since i haven't been able to find such a description anywhere else on the internet. < br / > < br / > a typical 2 - story house is shown in suburbia. 7 - year - old bridget narrates about suspecting something is going on since she and her 11 - year -",0


loading configuration file https://huggingface.co/hf-internal-testing/tiny-deberta/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/d90b8881b292ab1d856f82690bfb74616d8feee6aef667f6749ac4095fdbd87d.26341b6108474ff148f1434f7c1a0533463b09f1cc722a5f4b0423afcc0d1af6
Model config DebertaConfig {
  "_name_or_path": "hf-internal-testing/tiny-deberta",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 32,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 32,
  "initializer_range": 0.02,
  "intermediate_size": 64,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 128,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pooler_size": 32,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false

=== hf-internal-testing/tiny-deberta ===



loading file https://huggingface.co/hf-internal-testing/tiny-deberta/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/9e9891bc9eff8fce1ef75d86e233fa84f917fee11401a568eb0b813ad3c2ed2e.ceb913d3853aed91e0e70fdddcd6b626583c4691e06f784d358bfb34cbc052f0
loading file https://huggingface.co/hf-internal-testing/tiny-deberta/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/19b5b27ce2721786a27d85d07a84444ade35c288d81668671a8ffb0babc93822.fc6bdd4f1130eb74d9b44c5e17acac631c2cd787f5b17b8644cfdb6f47c6f9b4
loading file https://huggingface.co/hf-internal-testing/tiny-deberta/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/42fa1a1118c2212599265fe2e58cca1727068a57280dc5a220ce8a3eb621bc90.9beb5f062a1f5851b0cf95a3a31ee4842c0b4942ebc12d19cb55b09ed6b97eb4
loading file https://huggingface.co/hf-internal-testing/tiny-deberta/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-i

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where Silence' terrorized our peace of mind, Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equ",1
1,"I felt duty bound to watch the 1983 Timothy Dalton / Zelah Clarke adaptation of ""Jane Eyre,"" because I'd just written an article about the 2006 BBC ""Jane Eyre"" for TheScreamOnline.br /br /So, I approached watching this the way I'd approach doing homework.br /br /I was irritated at first. The l",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-deberta-v2/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/541d892494a26636c1408e4a20cb0b256cece4f0e3963071c797ffff5dead2fc.2903e5d8304dfff195ba79d158504b0e2fdad0e7b95f162303517c2bc8b6193a
Model config DebertaV2Config {
  "_name_or_path": "hf-internal-testing/tiny-random-deberta-v2",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 32,
  "initializer_range": 0.02,
  "intermediate_size": 37,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "num_attention_heads": 4,
  "num_hidden_layers": 5,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 32,
  "pos_att_type": [
    "none"
  ],
  "position_biased_input": true,
  "relative_attention": false,
  "transformers_version": "4.20.1",
  "type_vocab_

=== hf-internal-testing/tiny-random-deberta-v2 ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-deberta-v2/resolve/main/spm.model from cache at /home/dev/.cache/huggingface/transformers/d3a1698ff4c94cf60924ad8a43f22ae17f2e892d1a9c3efc217c10f8f937cfe9.f97515c3cb091c0baf4e9ee1964253062c621d2883289e472def9c11d475bd8e
loading file https://huggingface.co/hf-internal-testing/tiny-random-deberta-v2/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-deberta-v2/resolve/main/added_tokens.json from cache at /home/dev/.cache/huggingface/transformers/5b39f326e25439bab07e207c58cbd684e7f7184db1446e509113a90b73d9585f.31e96e41adc634e4e1fc60fd4c62af3893566f8238e340201b943701fcc41497
loading file https://huggingface.co/hf-internal-testing/tiny-random-deberta-v2/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/8ef692e7d3e97f959567960103c0f683844947f3344492a71ab12171a7f8bf6f.f886166424e457f0fc75f92e81205faabe843b2dbbbef6b25f9d8

architecture:	deberta_v2
tokenizer:	DebertaV2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />Humans are Machiavellian mammals, under which lie limbic brains (lizard-logic). Why did two kids, who knew better,",1
1,"He's stocky, sweaty, slightly cross-eyed and restless. He stands in front of us and calls himself a pervert. He claims that we the film viewers perceive the screen as a toilet bowl, and are all secretly wishing for all the s**t to explode from the inside. He's unpredictable and scary. Well? Come on, you could have guessed by now: he's one of the leading philosophers of our age.<br /><br />Slavoj iek is both a narrator and a subject of Sophie Fiennes' extraordinary new film, A Pervert",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-distilbert/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/825fc05fc9603996d64ce61190f3b2eadb28e7a2f4db70d2cb44c8b5457deab6.5cd95c0833b01050f80cc06f242975c6b324790c205343941ec863daed8f33c8
Model config DistilBertConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-distilbert",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 32,
  "dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dim": 37,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 4,
  "n_layers": 5,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.20.1",
  "vocab_size": 1124
}



=== hf-internal-testing/tiny-random-distilbert ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-distilbert/resolve/main/vocab.txt from cache at /home/dev/.cache/huggingface/transformers/cb7356c60bdb1cab21d94b9477e70ec540df332cc5e6e6bb48031b57eef4f0b5.c606291473543140252dbd13b15c0a043aff71a9e326df21137eb5be66f05d35
loading file https://huggingface.co/hf-internal-testing/tiny-random-distilbert/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/4dfb06510b8aabc27c695ec83c49c93d00d88ef1ea1058336193e98f9c51a83c.1676f8feaaeeb0fa9bf638daa980e785b6e7d5eae2962691b3ca89571e69434e
loading file https://huggingface.co/hf-internal-testing/tiny-random-distilbert/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-distilbert/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/7efbae476db8a6c6fd323850039c973bfb28f8e2949ea6e5eb5b27118fe49a65.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed5

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and sadde",1
1,"i was at first disgusted with director sun - woo jang because i had felt that he cheated me. jang had the potential to create a strong, deeply emotional fi",0


loading configuration file https://huggingface.co/hf-internal-testing/tiny-electra/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/2565ece384f57db5fa4a119b223bd9125202ad317452aafc7d65808808aca2cc.90f5fb85fe0424979cbdaf9cfbbdd6961b3beb7fec9dd8beeb61e0850f25178b
Model config ElectraConfig {
  "_name_or_path": "hf-internal-testing/tiny-electra",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 64,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 64,
  "initializer_range": 0.02,
  "intermediate_size": 64,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "torch_dtype": "float16

=== hf-internal-testing/tiny-electra ===



loading file https://huggingface.co/hf-internal-testing/tiny-electra/resolve/main/vocab.txt from cache at /home/dev/.cache/huggingface/transformers/2dfcd50bd9ce17da99dac8d72b04e96e7d21c47c79e6f101439a2d966f21157d.68a48e717528e12ec2e7adfa6bf6d28826aa788a839d25a03361966e97f4aebc
loading file https://huggingface.co/hf-internal-testing/tiny-electra/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/9a7484c23278ab95d7f82ed7a0a90fe2f309a74e0ac4579d6bfcfe51aa6d532c.4f97d4959694d14f04b59e78ba08725b7a9fb43321115cc242c4395a96f89af6
loading file https://huggingface.co/hf-internal-testing/tiny-electra/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-electra/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/1a1dd252117227d357ba22bef0724029f1c6818c7d7a3235fdc9a882204171a2.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://hugging

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and saddens us instead. this dramatization of the chikatilo case translates rather well, thanks to a westernized friendship between two rostov cops who become equals. < br / > < br / > citizenx may also argue against (! ) the",1
1,"you have to respect this movie. it may be "" just a dumb kid's movie "" but it's the # 1 most frequently requested film title in online movie forums, requested by people who remember the story but can't remember the title. therefore what follows is a much - needed, detailed plot description, since i haven't been able to find such a description anywhere else on the internet. < br / > < br / > a typical 2",0


loading configuration file https://huggingface.co/google/fnet-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/c5d03090bf0732b6b4184a02a7c5b75b2d422fd6adbbde4637c8167a592589e0.ffa423946401d97c1a1e1b769e1ecb93e4c7b16573df3eff9c641a7ed95133a8
Model config FNetConfig {
  "_name_or_path": "google/fnet-base",
  "actual_seq_length": 512,
  "architectures": [
    "FNetForPreTraining"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "fnet",
  "num_hidden_layers": 12,
  "pad_token_id": 3,
  "torch_dtype": "float32",
  "tpu_short_seq_length": 512,
  "transformers_version": "4.20.1",
  "type_vocab_size": 4,
  "use_fft": true,
  "use_latest": false,
  "use_tpu_fourier_optimizations": false,
  "vocab_size": 32000
}



=== google/fnet-base ===



loading file https://huggingface.co/google/fnet-base/resolve/main/spiece.model from cache at /home/dev/.cache/huggingface/transformers/a7dcc5f9d312fb571fbcdc57a0deb1b01c469fc268d77b52f59328da3c361345.8369e3dce319313b58980019b27a823776dc36c2870dff154e0e207e4e2da50e
loading file https://huggingface.co/google/fnet-base/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/000083fddb46c8c0f78a7d30de170c4602ffdf9b135302f5a10a7942935c026a.ea39e895a4484b873e2d48be9b5f74b1b57b35a0b82baacc189ebc4ccb313f72
loading file https://huggingface.co/google/fnet-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/google/fnet-base/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/f87f635dbe5f16cc29521ebe2f76f7f143db03ea08f0ab7b6cd5d648dea10c82.0b14ebfed591df99cabf37e503a2d455ad6e67f46730feb8ba9e5683772872b5
loading file https://huggingface.co/google/fnet-base/resolve/main/tokenizer_config.json f

architecture:	fnet
tokenizer:	FNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<",1
1,"Chris Rock deserves better than he gives himself in ""Down To Earth."" As directed by brothers Chris & Paul Weitz of ""American Pie"" fame, this uninspired remake of Warren Beatty's 1978 fantasy ""Heaven Can Wait,"" itself a rehash of 1941's ""Here Comes Mr. Jordan,"" lacks the abrasively profane humor that won Chris Rock an Emmy for his first HBO special. Predictably, he spouts swear words from A to Z, but he consciously avoids the F-word. Anybody",0


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-flaubert/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/6b234456e0e642cf79fd4c5c6866c64a0932e5c094abe1407d1392bf2dfd4ae2.2a89755cf12744339c4a53d2d98902798870556733fd2dbd3256c22137c2be23
Model config FlaubertConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-flaubert",
  "asm": false,
  "attention_dropout": 0.1,
  "bos_index": 0,
  "bos_token_id": 0,
  "causal": false,
  "dropout": 0.1,
  "emb_dim": 32,
  "embed_init_std": 0.02209708691207961,
  "end_n_top": 5,
  "eos_index": 1,
  "gelu_activation": true,
  "init_std": 0.02,
  "initializer_range": 0.02,
  "is_encoder": true,
  "lang_id": 0,
  "layer_norm_eps": 1e-12,
  "layerdrop": 0.0,
  "mask_index": 5,
  "mask_token_id": 0,
  "max_position_embeddings": 512,
  "model_type": "flaubert",
  "n_heads": 4,
  "n_langs": 2,
  "n_layers": 5,
  "n_special": 0,
  "pad_index": 2,
  "pad_token_id": 2,
  "pre_norm": false,


=== hf-internal-testing/tiny-random-flaubert ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-flaubert/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/00fef64f0042385e6d4b33e425c81af4a7e9074ace835cf7fb6c0b6a43a13e7d.96f350d77dfb312f78c88393edcf97dfe20d317cd2b697f959ab0547af3b91c5
loading file https://huggingface.co/hf-internal-testing/tiny-random-flaubert/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/fb50bf1c835d440510c62e606156397af0aa74b3796504499d2040f2072abfe7.9a950d10c5797f9bd7ddace781fb7c0139b42c9df4c86b319b0bdcd7f701488e
loading file https://huggingface.co/hf-internal-testing/tiny-random-flaubert/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-flaubert/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/aa041369a61e4ecb08169cf15dd3c63a06f3b53125ee14d9b2b4d13a6d4c2bfb.ca56ffb79f370a3d33e0dd7ed83b1de4a5e5187c8dcb3d8b8b4a945738fb7819

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX ( 1995 ) is the developing world' s answer to Silence of the Lambs. Where'Silence'terrorized our peace of mind,'Citizen'exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals. < br / > < br / > CitizenX may also argue against (! ) the death penalty far better than Kevin Spacey' s",1
1,"I was at first disgusted with director Sun-Woo Jang because I had felt that he cheated me. Jang had the potential to create a strong, deeply emotional film about sex and its effects on people, but instead chose to focus his strength on the pornography element more than the actual human element. I couldn' t see the characters at first and his sloppy introduction which blended both realism and cinema together was amateurish at best yet this film remained",0


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-funnel/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/fdf233c855c919615eea4564486f2b5003177a93cb9b11ac383c3f48b4065965.893bc8a50891863b59f64a57083a2e8cb301323a754564a40f3861bab0145211
Model config FunnelConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-funnel",
  "activation_dropout": 0.0,
  "attention_dropout": 0.1,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    1,
    1
  ],
  "block_sizes": [
    1,
    1,
    2
  ],
  "d_head": 8,
  "d_inner": 37,
  "d_model": 32,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.1,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 4,
  "num_decoder_layers": 1,
  "pool_q_only": true,
  "pooling_type": "mean",
  "separate_cls": true,
  "transformers_version": "4.20.1",
  "truncate_seq": true,
  "type

=== hf-internal-testing/tiny-random-funnel ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-funnel/resolve/main/vocab.txt from cache at /home/dev/.cache/huggingface/transformers/247d9cc22d44243b642d59d7d75db2416e5ef2a2d6588e6b7c599ce85e66aed8.1fdd55620a8938abd0aae89c089853d3d9339a94f0c82b8c135b3e8aaf47d8ef
loading file https://huggingface.co/hf-internal-testing/tiny-random-funnel/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/19a27fbdf5c15ac380d0bd5695ff6f84887d1451d0e6b17fc77bbaa60686de1f.405a128004d59f90f85ac4fb1cb1390c419016a8d20fa210c302d86478bfff60
loading file https://huggingface.co/hf-internal-testing/tiny-random-funnel/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-funnel/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/3fcafef5b0323d0ede2b3876d5e61ab4bccaaa2fe2d3d4dd62bd15562cb8dd3f.34a22f495fc6b4fddbf5d6b2c62637ae42a7204b6355bbd999c44fee4001336d
load

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and sadde",1
1,"the majority of stephen king's short stories are little gems, with original ideas that don't take a long time to develop ; basically lean and mean - - he s",0


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-gpt2/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/e54a39799ae101f61741e1df6b7eeba26bcd03bea449be2d459c4e8a2966829b.7561c647f82885e386e46900764dd6b21825061b5c56c3614ecd874113243004
Model config GPT2Config {
  "_name_or_path": "hf-internal-testing/tiny-random-gpt2",
  "activation_function": "gelu_new",
  "attention_probs_dropout_prob": 0.1,
  "attn_pdrop": 0.1,
  "bos_token_id": 98,
  "embd_pdrop": 0.1,
  "eos_token_id": 98,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "initializer_range": 0.02,
  "intermediate_size": 37,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 512,
  "n_embd": 32,
  "n_head": 4,
  "n_inner": null,
  "n_layer": 5,
  "n_positions": 512,
  "pad_token_id": 98,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": tru

=== hf-internal-testing/tiny-random-gpt2 ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-gpt2/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/b8b6c0058f45cbbd8912517eec3805150dc3c372b3bcb65e71e453e433d51b2b.80bde3fb870e0b1632f2c438dd0e0577f17c033d37c700f7395952539ca37c60
loading file https://huggingface.co/hf-internal-testing/tiny-random-gpt2/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/8eb9d0d2d37f1583d9d9a21417f73ca4f6bd25be0482d68ab9a3ebcf1c33ed45.94b5692cbb69a8f1a0c0f4b8fa214fbd2ae58380e61dbcb8717243807531390b
loading file https://huggingface.co/hf-internal-testing/tiny-random-gpt2/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/3281c60ddf902d7c6513f4fb220a1a0f41279287801fbb60f364545cb130832c.a326a1f04fc818beeef2fc3d92c43c42db411fe2d07c53a3cb65430eff1a7c4f
loading file https://huggingface.co/hf-internal-testing/tiny-random-gpt2/resolve/main/added_tokens.json from cache at None
loading file https://hug

architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equ",1
1,"It all starts with a suicide. Or is it a car crash? I guess it all depends on whether you choose to start at the beginning or the end. Director Gabriele Muccino gives you the ability to enter his new film Seven Pounds whichever way you prefer as he starts at the end and works his way back to the beginning, showing us the course of ev",1


loading configuration file https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/1acd57a0409bc29eae1668be6294f24499c3f10a6f87aff688dd924614543989.502e1005452c9e71c3cddf2e4d6156cda93856c20a73cd64ecafa4d78bf5f9a2
Model config GPTJConfig {
  "_name_or_path": "anton-l/gpt-j-tiny-random",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTJForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gptj",
  "n_ctx": 2048,
  "n_embd": 512,
  "n_head": 4,
  "n_inner": null,
  "n_layer": 8,
  "n_positions": 2048,
  "resid_pdrop": 0.0,
  "rotary_dim": 64,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_s

=== anton-l/gpt-j-tiny-random ===



loading file https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/d6a58e5957db481459fe5f4c20cb6f5cc7b904bb937f9fcf06d7c9e8201d076c.a1b97b074a5ac71fad0544c8abc1b3581803d73832476184bde6cff06a67b6bb
loading file https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/2f8a468408a0914873ad860a73e7987e44a08044672710e460c5691e44ac230b.f5b91da9e34259b8f4d88dbc97c740667a0e8430b96314460cdb04e86d4fc435
loading file https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/41bf7e0b2b68da898e4f29af68b1b0e26092c08119f0812a057845336f67f779.3d0a96d28f7abb81fd01697a611f67edbcc1a0ab0522c492c42e9048a927b40f
loading file https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/added_tokens.json from cache at /home/dev/.cache/huggingface/transformers/3e432440a0043910e1123d2edbcf931

architecture:	gptj
tokenizer:	GPT2TokenizerFast



Token indices sequence length is longer than the specified maximum sequence length for this model (1353 > 1024). Running this sequence through the model will result in indexing errors


*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />Humans are Machiavellian mammals, under which lie limbic brains (",1
1,"It all starts with a suicide. Or is it a car crash? I guess it all depends on whether you choose to start at the beginning or the end. Director Gabriele Muccino gives you the ability to enter his new film Seven Pounds whichever way you prefer as he starts at the end and works his way back to the beginning, showing us the course of events that led us to that heartbreaking 911 call. This is one powerful movie; maybe that is because I'm a softy when it comes to dramas of this ilk, dripping with weighty moments and chock full of devastating performances, but either way, a",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-gpt_neo/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/bf961325156a76e2f32c49873244bc41a042a0ccc0129055d47bca8bb62ecf5f.4fcbfa8ab40a34d125f9f15f7c3610ecddc69cd707c9d26718bfa7efc52378fa
Model config GPTNeoConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-gpt_neo",
  "activation_function": "gelu_new",
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      2
    ]
  ],
  "bos_token_id": 98,
  "embed_dropout": 0.0,
  "eos_token_id": 98,
  "gradient_checkpointing": false,
  "hidden_size": 32,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 512,
  "model_type": "gpt_neo",
  "num_heads": 4,
  "num_layers": 4,
  "pad_token_id": 98,
  "resid_dropout": 0.0,
  "sum

=== hf-internal-testing/tiny-random-gpt_neo ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-gpt_neo/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/88e6e05b0878e807f336c523b262f4f57df6502ba03d954f3ee06b965e07802f.80bde3fb870e0b1632f2c438dd0e0577f17c033d37c700f7395952539ca37c60
loading file https://huggingface.co/hf-internal-testing/tiny-random-gpt_neo/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/694d878cb33288a2555dbfe9e5f291d43e654d2f8ce86a7511518df04a7e27a8.94b5692cbb69a8f1a0c0f4b8fa214fbd2ae58380e61dbcb8717243807531390b
loading file https://huggingface.co/hf-internal-testing/tiny-random-gpt_neo/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/3fed0181f37696a24107fbaf750cb9ad524e5dfdbadaafa6ef3999aab5322a12.a326a1f04fc818beeef2fc3d92c43c42db411fe2d07c53a3cb65430eff1a7c4f
loading file https://huggingface.co/hf-internal-testing/tiny-random-gpt_neo/resolve/main/added_tokens.json from cache at None
loading file

architecture:	gpt_neo
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equ",1
1,"You have to respect this movie. It may be ""just a dumb kid's movie"" but it's the #1 most frequently requested film title in online movie forums, requested by people who remember the story but can't remember the title. Therefore what follows is a much-needed, detailed plot description, since I haven't been able to find such",0


loading configuration file https://huggingface.co/kssteven/ibert-roberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/cfb510f67e8b7caa315edb63cf273dcabea566cc7c79256c9279b9aabfabc1e2.6e328a8b48a360bcdc4fa4628970901425656415d87641bc286c517e3f274c05
Model config IBertConfig {
  "_name_or_path": "kssteven/ibert-roberta-base",
  "architectures": [
    "IBertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "force_dequant": "none",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "ibert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "quant_mode": false,
  "tokenizer_class": "RobertaTokenizer",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "vocab_size":

=== kssteven/ibert-roberta-base ===



loading file https://huggingface.co/kssteven/ibert-roberta-base/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/673a127a1efd88da2f306da00117064b72a5cc86bcca7be220d81c9c74369858.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05
loading file https://huggingface.co/kssteven/ibert-roberta-base/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/5422ece434498216e797c6ef1ecef875e497cb88774ffb57341843b89567fa70.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/kssteven/ibert-roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/kssteven/ibert-roberta-base/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/c2fbe5b3fb8f721fc3f4d0e8e2b8d2ed77db69bef5aaf6779bbee917930b1c95.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0
loading file https://huggingface.co/kssteven/ibert-

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />Humans are Machiavellian mammals, under which lie limbic brains",1
1,"8 Simple Rules for Dating My Teenage Daughter had an auspicious start. The supremely-talented Tom Shadyac was involved in the project. This meant that the comedy would be nothing less of spectacular, and that's exactly what happened: the show remains one of the freshest, funniest, wittiest shows made in a very long time. Every line, facial expression, casting choice, scene, all wreaked of perfection. There was not one episode after which I thought, ""Man that wasn't as good as the rest"". Each one was a standout. Again, this is the kind of perfectionism",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-led/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/e171c86ab27ab74c8a41930dc7c8db43032c0e11ffecbd2d39cc30f0463da471.3f37e6d260352c00a840364dc5458bec8e32025edc714e6a579f63753b03f051
Model config LEDConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-led",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "attention_dropout": 0.1,
  "attention_window": 4,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 16,
  "decoder_attention_heads": 4,
  "decoder_ffn_dim": 4,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 2,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 4,
  "encoder_ffn_dim": 4,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 2,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_decoder_position_embeddings": 1024,
  "max_encoder_position_em

=== hf-internal-testing/tiny-random-led ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-led/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/b96100da8f192372e452a0a2a28ba58786a628ac570ffe7ed47fb5be0110893c.c45ad9d7931b838f87a2793ae47dd5fd5edc1a6b0055b898d6e65c3c693ade29
loading file https://huggingface.co/hf-internal-testing/tiny-random-led/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/e737c0fd0559f2048e8ee281c4b0ce447c4d643253d228ab92e4c0bfda7d8087.0f509d79aaf2540546fff94fa84c1f23aa8e983149cc0dfb587c82d00a3b2497
loading file https://huggingface.co/hf-internal-testing/tiny-random-led/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/35e261f171e347450705f7724f6dc4c3d2f6020fdd89918be457de08cc7b4f18.6df0972d7138f4fc432df7e74a3d8cb7db67aa4401bd58b6a7598b7707692475
loading file https://huggingface.co/hf-internal-testing/tiny-random-led/resolve/main/added_tokens.json from cache at None
loading file https://hugging

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become",1
1,"I felt duty bound to watch the 1983 Timothy Dalton / Zelah Clarke adaptation of ""Jane Eyre,"" because I'd just written an article about the 2006 BBC ""Jane Eyre"" for TheScreamOnline.<br /><br />So, I approached watching this the way I'd approach doing homework.<br /><",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-longformer/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/81708081a8082a3dee0e4ff2a0dcc8a8a6ad3bf4f9aff009a805b9d7a478fa31.da7d874b35acfd69e29620b3f87f366a726067f620b17c894607146de775fef1
Model config LongformerConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 4,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 32,
  "initializer_range": 0.02,
  "intermediate_size": 37,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "num_attention_heads": 4,
  "num_hidden_layers": 5,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.20.1",
  "type_vocab_size": 16,
  "use_c

=== hf-internal-testing/tiny-random-longformer ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-longformer/resolve/main/vocab.json from cache at /home/dev/.cache/huggingface/transformers/eed36a714785cf42451a72bf4f9b77fd87742d9dc1311da0530a6cd557e79001.c45ad9d7931b838f87a2793ae47dd5fd5edc1a6b0055b898d6e65c3c693ade29
loading file https://huggingface.co/hf-internal-testing/tiny-random-longformer/resolve/main/merges.txt from cache at /home/dev/.cache/huggingface/transformers/f451b0dc9ba9bda1973da374aea24045e0a188e082a669014b22c4b8c80ac4f7.0f509d79aaf2540546fff94fa84c1f23aa8e983149cc0dfb587c82d00a3b2497
loading file https://huggingface.co/hf-internal-testing/tiny-random-longformer/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/024232e00fe3fdbdd0e663a21bea15c890159cd38ef7197ba16bd5cd0fd9aa2e.31a3adf1b4fbab82d0dce2f9fa890fad77a90174dc28bf62778ed5bdf884c5f4
loading file https://huggingface.co/hf-internal-testing/tiny-random-longformer/resolve/main/added_tokens.json from cache at None


architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become",1
1,"8 Simple Rules for Dating My Teenage Daughter had an auspicious start. The supremely-talented Tom Shadyac was involved in the project. This meant that the comedy would be nothing less of spectacular, and that's exactly what happened: the show remains one of the freshest, funniest, wittiest shows made in a very long time",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-mbart/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/922349f7fa5af9c9cf0c8e4ee912664a0b36c79752350f7e56aeb29909ded451.c3b3a430fd159282ee6be3a2b33ba09820c3dd59903a75b75fe40083b6958bfe
Model config MBartConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-mbart",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 16,
  "decoder_attention_heads": 4,
  "decoder_ffn_dim": 4,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 4,
  "encoder_ffn_dim": 4,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 2,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_position_embeddings": 100,
  "model_type": "mbart",
  "num_hidden_layers": 2,
  "pad

=== hf-internal-testing/tiny-random-mbart ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-mbart/resolve/main/sentencepiece.bpe.model from cache at /home/dev/.cache/huggingface/transformers/ee84fa04b4ac155fc71a0b3636feffa9a4995e7348e7e18bac6e6f776887f261.23157302a65b38857b36a1190b1e0d6130eab98c0885f45aed5df1e5d6d906f9
loading file https://huggingface.co/hf-internal-testing/tiny-random-mbart/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/21bc0b1cf71ea2d003cc0a558ac33daf1e7dbd75ea875ee67dad5980f257bb43.05b321ed794244ec380dcc72e05f12fd3383edbf9f07ef7b4de70e7707fea72a
loading file https://huggingface.co/hf-internal-testing/tiny-random-mbart/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-mbart/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/2384606ae1ffb46dba9a6d83357724a2179b0d710e09110919955e708209712d.e30c9fb7d9f6e7b1c3279df9995750097a8135dfe5aa55f9596ce80e7d3

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Citizen(15) is the developing world's answer to Silence of the Lambs. Where Silence' terrorized our peace of mind, Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between",1
1,"The release of TARAN THE APE MAN, in 12, caused a sensation. It may be hard to believe, 0 years later, but the film had much of the same kind of impact as THE MATRI, or THE LORD OF THE RINGS has achieved, at a time when movies and radio were the major so",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-mpnet/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/d54e5bf3e86b76e9c28657d3c43bb028d1e9a97570cb75463e6a9be832581ea8.cb5cbdb7fbab594ea91faa392666d1f7d58705fa38a497c66e37af7dd3d9ac50
Model config MPNetConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-mpnet",
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 64,
  "initializer_range": 0.02,
  "intermediate_size": 64,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mpnet",
  "num_attention_heads": 4,
  "num_hidden_layers": 5,
  "pad_token_id": 1,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.20.1",
  "vocab_size": 1125
}



=== hf-internal-testing/tiny-random-mpnet ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-mpnet/resolve/main/vocab.txt from cache at /home/dev/.cache/huggingface/transformers/38f9d83ccba3884762e6934a5e9c3b76405143ab42f120d36cf25af42c9b587f.02b166fb3942e04a8c97454890d839105fe103e72d8fc23d1f57ac37480589b4
loading file https://huggingface.co/hf-internal-testing/tiny-random-mpnet/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/653e07e84a4df690308acec39f9fe2b24e2bc5fcfe92c16f4f007286ee743e1a.b772630b3633429246cd4773a61a0f8b3bca7b15fa9b5375d28bcb243fab889a
loading file https://huggingface.co/hf-internal-testing/tiny-random-mpnet/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-mpnet/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/ce3f963ed6320a016db84f22bedd2ff06a63b5ae816bd6c3b3900cd0a5a47b54.18ebceb237d999d8f1cb15935e35b314f3e73dd6c4f65e119f4790fa226c9236
loading 

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and sadde",1
1,"by 1987 hong kong had given the world such films as sammo hung's ` encounters of the spooky kind'chow yun fat in john woo's iconic ` a better tomorrow ', ` z",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-mobilebert/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/7d4862953e5ffac5b9a850accf13bd9f9d14110adabf9a90798f9e8e5aafb9c6.72b4f91db66e830123ce06c4caa311cf151539a2860ac0a9f3f53ede5404cf0d
Model config MobileBertConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-mobilebert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": true,
  "classifier_dropout": null,
  "embedding_size": 32,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 64,
  "initializer_range": 0.02,
  "intermediate_size": 37,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 5,
  "pad_token_id": 0,
  "transformers_version": "4.20.1",
 

=== hf-internal-testing/tiny-random-mobilebert ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-mobilebert/resolve/main/vocab.txt from cache at /home/dev/.cache/huggingface/transformers/9e0862fee2bea252beff5e9b94de8cb1a7dc5d173c7da10ddeb5ea2cfc1051f1.1d55127e14ebde4e25453f50c0f2cc32f52bb8757ce34c4636ac2e4964fffd58
loading file https://huggingface.co/hf-internal-testing/tiny-random-mobilebert/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/2ce2bded53fee5496012defc7b4a28bc9a770ffc9ece071b4e97e642996aaf2d.9948d01bf1c3e913d498d87f5fd019e127e074e8813b7610be5f6c3ca87db501
loading file https://huggingface.co/hf-internal-testing/tiny-random-mobilebert/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-mobilebert/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/b6b9a0b695e9ec03596e2e04c05d72b6429693e5615b28ae2e7fce98bfb5806b.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed5

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and sadde",1
1,"chris rock deserves better than he gives himself in "" down to earth. "" as directed by brothers chris & paul weitz of "" american pie "" fame, this uninspired",0


loading configuration file https://huggingface.co/openai-gpt/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/bebb46f5735701bc248ef9faa26f12577944fa7fc8e9be1a774b94d4cb8b79b6.ba6f10a5446f364b92311c09e55e49aa27024a4aeefc1ea50fd733b77bcd997d
Model config OpenAIGPTConfig {
  "_name_or_path": "openai-gpt",
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.20.1

=== openai-gpt ===



loading configuration file https://huggingface.co/openai-gpt/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/bebb46f5735701bc248ef9faa26f12577944fa7fc8e9be1a774b94d4cb8b79b6.ba6f10a5446f364b92311c09e55e49aa27024a4aeefc1ea50fd733b77bcd997d
Model config OpenAIGPTConfig {
  "_name_or_path": "openai-gpt",
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.20.1

architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast



Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and saddens us instead. this dramatization of the chikatilo case translates rather well, thanks to a westernized friendship between two rostov cops who become equals. < br / > < br / > citizenx may also argue against (! ) the death penalty far better than kevin spacey's the life of david gayle ( 2002 ). < br / > < br / > humans are machiavellian mammals, under which lie limbic",1
1,"i was at first disgusted with director sun - woo jang because i had felt that he cheated me. jang had the potential to create a strong, deeply emotional film about sex and its effects on people, but instead chose to focus his strength on the pornography element more than the actual human element. i couldn't see the characters at first and his sloppy introduction which blended both realism and cinema together was amateurish at best yet this film remained in my mind for days after i viewed it. what stayed with me wasn't the story, it wasn't the characters, nor was it the apparent pornographic nature of",0


loading configuration file https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/fb4dd7d2aa0fd618977b8007a9482d065c0e2a316a29d921b00f8c55973876c3.ef7496bdec4f3ac0f715459076cb51059bdf3c754a951e02c0d43bab967a152f
Model config ReformerConfig {
  "_name_or_path": "google/reformer-crime-and-punishment",
  "architectures": [
    "ReformerModelWithLMHead"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "attn_layers": [
    "local",
    "lsh",
    "local",
    "lsh",
    "local",
    "lsh"
  ],
  "axial_norm_std": 1.0,
  "axial_pos_embds": true,
  "axial_pos_embds_dim": [
    64,
    192
  ],
  "axial_pos_shape": [
    512,
    1024
  ],
  "chunk_size_lm_head": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "feed_forward_size": 512,
  "hash_seed": null,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.05,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate

=== google/reformer-crime-and-punishment ===



loading configuration file https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/fb4dd7d2aa0fd618977b8007a9482d065c0e2a316a29d921b00f8c55973876c3.ef7496bdec4f3ac0f715459076cb51059bdf3c754a951e02c0d43bab967a152f
Model config ReformerConfig {
  "_name_or_path": "google/reformer-crime-and-punishment",
  "architectures": [
    "ReformerModelWithLMHead"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "attn_layers": [
    "local",
    "lsh",
    "local",
    "lsh",
    "local",
    "lsh"
  ],
  "axial_norm_std": 1.0,
  "axial_pos_embds": true,
  "axial_pos_embds_dim": [
    64,
    192
  ],
  "axial_pos_shape": [
    512,
    1024
  ],
  "chunk_size_lm_head": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "feed_forward_size": 512,
  "hash_seed": null,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.05,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate

architecture:	reformer
tokenizer:	ReformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Citizen( is the developing worlds answer to Silence of the Lambs. Where Silence terrorized our peace of mind, Citizen exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, tha",1
1,"By Hong Kong had given the world such films as Sammo Hungs Encounters of the Spooky Kind Chow Yun Fat in ohn Woos iconic A Better Tomorrow, Zu Warriors and the classic Mr ampire. ackie C",1


loading configuration file https://huggingface.co/google/rembert/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/1d4f8b58603e2714a98447b0de319bf42eaada64567b08be0202aa36989ec31f.61fd5e9ebb7b22b0d19c50c73bcf5471e19b1e77d27f4e7d1e62981e9c2f8a06
Model config RemBertConfig {
  "_name_or_path": "google/rembert",
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 312,
  "classifier_dropout_prob": 0.1,
  "embedding_dropout_prob": 0,
  "embedding_size": 256,
  "eos_token_id": 313,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 1152,
  "initializer_range": 0.02,
  "input_embedding_size": 256,
  "intermediate_size": 4608,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "rembert",
  "num_attention_heads": 18,
  "num_hidden_layers": 32,
  "output_embedding_size": 1664,
  "pad_token_id": 0,
  "tie_word_embeddings": false,
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_s

=== google/rembert ===



loading file https://huggingface.co/google/rembert/resolve/main/sentencepiece.model from cache at /home/dev/.cache/huggingface/transformers/7ca0279022753d7b063992b91da7fc84ccc88203cd5595663f84d903f43fe301.bd5b05922d485604855403b3bc218ca924b9543483bd08debfc9bc7d2ffb5d11
loading file https://huggingface.co/google/rembert/resolve/main/tokenizer.json from cache at /home/dev/.cache/huggingface/transformers/aecbfd30ac098cd64a7ef62afbbb949a7bf80cb8804189d74e1c9cc1f252627f.d25565a9dc594a409d4526c522844b49afdb8c27349e04d079968d8a95f61391
loading file https://huggingface.co/google/rembert/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/google/rembert/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/d0d5cf448e7367ce69a8cbb48980c788a66b736ec136a0d3061fd26b5c1b25f0.f886166424e457f0fc75f92e81205faabe843b2dbbbef6b25f9d8ec69f64bc7d
loading file https://huggingface.co/google/rembert/resolve/main/tokenizer_config.json from

architecture:	rembert
tokenizer:	RemBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />Humans are Machiavellian mammals, under which lie limbic",1
1,"Antonioni, by making this film, had assumed the role of Papa Smurf to all the little long-haired, American, radical student-Smurfs. He had taken them under the guiding protection of his European communist wings, showing appreciation and support for their confused American ways. (These Smurfs are red and wear blue, not the other way around.) The radical Smurfs were happy to get the guidance of a wise old man with gray hair who regularly preys to the God of all long-haired Smurfs, Lenin the Communist - another wise old man whose beard",0


=== junnyu/roformer_chinese_sim_char_ft_small ===



loading configuration file https://huggingface.co/junnyu/roformer_chinese_sim_char_ft_small/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/d2e42d2d9c083050e4ce532ca0cfe4a130976aa9946518bbdcb3b28ef7e150fe.be5a0dc6d5ebeb31f02c23667d0d315be5533770f9211e18d23fc8c74eac5e91
Model config RoFormerConfig {
  "_name_or_path": "junnyu/roformer_chinese_sim_char_ft_small",
  "architectures": [
    "RoFormerForCausalLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "embedding_size": 384,
  "eos_token_id": 102,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "is_decoder": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roformer",
  "num_attention_heads": 6,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_activation": "linear",
  "rotary_value": false,
  "transformers_version": "4.20.1",
  "type_vocab_size": 2

architecture:	roformer
tokenizer:	RoFormerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where silence'terrorized our peace of mind, citizen'exhausts and saddens us instead. this dramatization of the chikatilo case translates rather well, thanks to a westernized friendship between two rostov cops who become equals. < br / > < br / > citizenx may also",1
1,"within the realm of science fiction, two particular themes consistently elicit interest, were initially explored in the literature of a pre - cinematic era, and have since been periodically revisited by filmmakers and writers alike, with varying degrees of success. the first theme, that of time travel, has held an unwavering fascination for fans of film, as well as the wri",0


loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, wil

=== roberta-base ===



loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/roberta-base/resol

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />Humans are Machiavellian mammals, under which lie limbic brains",1
1,"The majority of Stephen King's short stories are little gems, with original ideas that don't take a long time to develop; basically lean and mean--he sets them up quickly in a scarce number of pages, you read 'em, and you're finished before you know you've begun. They're like the equivalent of a carton of McDonald's fries--they taste Really good and you know there's not much nutritional value in them (re: from a literary standpoint, they don't say much about the universal human condition), but you're still gonna scarf 'em down, just don't be a pig and go for the",0


loading configuration file https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/3953e1a6f4509cd931511911edbb30b9651fcc668edd98df1afc2acc42f1c3aa.0f40271b2d963efad02f5495564fb1a105ec6ea0a484cc97c7d6ffb3385ce4b5
Model config SqueezeBertConfig {
  "_name_or_path": "squeezebert/squeezebert-uncased",
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_groups": 4,
  "intermediate_size": 3072,
  "k_groups": 4,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "squeezebert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_groups": 4,
  "pad_token_id": 0,
  "post_attention_groups": 1,
  "q_groups": 4,
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "v_groups": 4,
  "vocab_size": 30528
}

Could not locate the tokenizer config

=== squeezebert/squeezebert-uncased ===



loading configuration file https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/3953e1a6f4509cd931511911edbb30b9651fcc668edd98df1afc2acc42f1c3aa.0f40271b2d963efad02f5495564fb1a105ec6ea0a484cc97c7d6ffb3385ce4b5
Model config SqueezeBertConfig {
  "_name_or_path": "squeezebert/squeezebert-uncased",
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_groups": 4,
  "intermediate_size": 3072,
  "k_groups": 4,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "squeezebert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_groups": 4,
  "pad_token_id": 0,
  "post_attention_groups": 1,
  "q_groups": 4,
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "v_groups": 4,
  "vocab_size": 30528
}

loading file https://huggingface.co/s

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where ` silence'terrorized our peace of mind, ` citizen'exhausts and saddens us instead. this dramatization of the chikatilo case translates rather well, thanks to a westernized friendship between two rostov cops who become equals. < br / > < br / > citizenx may also argue against (! ) the death penalty far better than kevin spacey's the life of david gayle ( 2002 ). < br / > < br / > humans are machiavellian mammals, under",1
1,"by 1987 hong kong had given the world such films as sammo hung's ` encounters of the spooky kind'chow yun fat in john woo's iconic ` a better tomorrow ', ` zu warriors'and the classic ` mr vampire '. jackie chan was having international success on video, but it was with ` a chinese ghost story'that hk cinema had its first real crossover theatrical hit in the west for many years. < br / > < br / > western filmgoers had never seen anything like it. it was a film that took various ingredients that hk cinema had used for years ( flying swordsman",1


loading configuration file https://huggingface.co/hf-internal-testing/tiny-random-transfo-xl/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/56c7c46d5e64ca137303d4f67c605e9c918c624fba287ed03583b812793e7206.3dcbe74a6aa772d2b92743c75d5ec6e321150a8760477beb4c906a983fc0554d
Model config TransfoXLConfig {
  "_name_or_path": "hf-internal-testing/tiny-random-transfo-xl",
  "adaptive": true,
  "attn_type": 0,
  "clamp_len": 15,
  "cutoffs": [
    10,
    50,
    80
  ],
  "d_embed": 32,
  "d_head": 8,
  "d_inner": 128,
  "d_model": 32,
  "div_val": 2,
  "dropatt": 0.0,
  "dropout": 0.1,
  "eos_token_id": 0,
  "init": "normal",
  "init_range": 0.01,
  "init_std": 0.02,
  "layer_norm_epsilon": 1e-05,
  "mem_len": 30,
  "model_type": "transfo-xl",
  "n_head": 4,
  "n_layer": 5,
  "pad_token_id": 98,
  "pre_lnorm": false,
  "proj_init_std": 0.01,
  "same_length": true,
  "sample_softmax": -1,
  "tie_projs": [
    false,
    true,
    true,
    true
  ],
  "transfor

=== hf-internal-testing/tiny-random-transfo-xl ===



loading file https://huggingface.co/hf-internal-testing/tiny-random-transfo-xl/resolve/main/vocab.pkl from cache at /home/dev/.cache/huggingface/transformers/292b5a7b077fd9ef8a928d85c8012b568f01b9bbf0ae7926e99630a881c38f91.fdda7f8f560b0223cea8e83236dab189ca8f7822e8f44342d3ca22e0d894dee1
loading file https://huggingface.co/hf-internal-testing/tiny-random-transfo-xl/resolve/main/vocab.bin from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-transfo-xl/resolve/main/vocab.txt from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-transfo-xl/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/hf-internal-testing/tiny-random-transfo-xl/resolve/main/special_tokens_map.json from cache at /home/dev/.cache/huggingface/transformers/f1e14f51f41c9ef29a252f2305ab96489a9a5ee07f0649984449b321aa198753.bc4a156c14335d3e6946d8ab8c583772687dca53c2d0a1595b8325215927a136
loading file https://huggingface.co/hf-

architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(1995) is the developing world's answer to Silence of the Lambs. Where 'Silence' terrorized our peace of mind, 'Citizen' exhausts and saddens us instead. This dramatization of the case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals. < br / > < br / > may also argue against (!) the death penalty far better than Kevin Spacey's The Life of David Gayle (2002). < br / > < br / > Humans are Machiavellian mammals, under which lie limbic brains (lizard-logic). Why did two kids, who knew better, stone to death",1
1,"8 Simple Rules for Dating My Teenage Daughter had an auspicious start. The supremely-talented Tom Shadyac was involved in the project. This meant that the comedy would be nothing less of spectacular, and that's exactly what happened: the show remains one of the freshest, funniest, wittiest shows made in a very long time. Every line, facial expression, casting choice, scene, all wreaked of perfection. There was not one episode after which I thought, ""Man that wasn't as good as the rest."" Each one was a standout. Again, this is the kind of perfectionism that we've come to expect from Tom. For those who don",1


loading configuration file https://huggingface.co/xlm-mlm-en-2048/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/871962fa7b55c55e16a9f67271c0e834c21d65aea30e4891f49854833f9b7ca2.f0e67c8211991e2f6e18881d243fc96e2df93077c1c00806d44bf3375b7947ab
Model config XLMConfig {
  "_name_or_path": "xlm-mlm-en-2048",
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "bos_index": 0,
  "bos_token_id": 0,
  "causal": false,
  "dropout": 0.1,
  "emb_dim": 2048,
  "embed_init_std": 0.02209708691207961,
  "end_n_top": 5,
  "eos_index": 1,
  "gelu_activation": true,
  "init_std": 0.02,
  "is_encoder": true,
  "lang_id": 0,
  "layer_norm_eps": 1e-12,
  "mask_index": 5,
  "mask_token_id": 0,
  "max_position_embeddings": 512,
  "model_type": "xlm",
  "n_heads": 16,
  "n_langs": 1,
  "n_layers": 12,
  "pad_index": 2,
  "pad_token_id": 2,
  "sinusoidal_embeddings": false,
  "start_n_top": 5,
  "summary_activation": null,
  "summar

=== xlm-mlm-en-2048 ===



loading configuration file https://huggingface.co/xlm-mlm-en-2048/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/871962fa7b55c55e16a9f67271c0e834c21d65aea30e4891f49854833f9b7ca2.f0e67c8211991e2f6e18881d243fc96e2df93077c1c00806d44bf3375b7947ab
Model config XLMConfig {
  "_name_or_path": "xlm-mlm-en-2048",
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "bos_index": 0,
  "bos_token_id": 0,
  "causal": false,
  "dropout": 0.1,
  "emb_dim": 2048,
  "embed_init_std": 0.02209708691207961,
  "end_n_top": 5,
  "eos_index": 1,
  "gelu_activation": true,
  "init_std": 0.02,
  "is_encoder": true,
  "lang_id": 0,
  "layer_norm_eps": 1e-12,
  "mask_index": 5,
  "mask_token_id": 0,
  "max_position_embeddings": 512,
  "model_type": "xlm",
  "n_heads": 16,
  "n_langs": 1,
  "n_layers": 12,
  "pad_index": 2,
  "pad_token_id": 2,
  "sinusoidal_embeddings": false,
  "start_n_top": 5,
  "summary_activation": null,
  "summar

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"citizenx ( 1995 ) is the developing world's answer to silence of the lambs. where'silence'terrorized our peace of mind,'citizen'exhausts and saddens us instead. this dramatization of the chikatilo case translates rather well, thanks to a westernized friendship between two rostov cops who become equals. < br / > < br / > citizenx may also argue against (! ) the death penalty far better than kevin spacey's the life of david gayle ( 2002 ). < br / > < br / > humans are machiavellian mammals, under which lie",1
1,"the majority of stephen king's short stories are little gems, with original ideas that don 't take a long time to develop ; basically lean and mean--he sets them up quickly in a scarce number of pages, you read'em, and you're finished before you know you've begun. they're like the equivalent of a carton of mcdonald's fries--they taste really good and you know there's not much nutritional value in them ( re : from a literary standpoint, they don 't say much about the universal human condition ), but you're still gonna scarf'em down, just don 't be a",0


loading configuration file https://huggingface.co/xlm-roberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.dfaaaedc7c1c475302398f09706cbb21e23951b73c6e2b3162c1c8a99bb3b62a
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

Could not lo

=== xlm-roberta-base ===



loading configuration file https://huggingface.co/xlm-roberta-base/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.dfaaaedc7c1c475302398f09706cbb21e23951b73c6e2b3162c1c8a99bb3b62a
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading file

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />Humans are Machiavellian mamma",1
1,"8 Simple Rules for Dating My Teenage Daughter had an auspicious start. The supremely-talented Tom Shadyac was involved in the project. This meant that the comedy would be nothing less of spectacular, and that's exactly what happened: the show remains one of the freshest, funniest, wittiest shows made in a very long time. Every line, facial expression, casting choice, scene, all wreaked of perfection. There was not one episode after which I thought, ""Man that wasn't as good as the rest"". Each one was a",1


loading configuration file https://huggingface.co/xlnet-base-cased/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/06bdb0f5882dbb833618c81c3b4c996a0c79422fa2c95ffea3827f92fc2dba6b.da982e2e596ec73828dbae86525a1870e513bd63aae5a2dc773ccc840ac5c346
Model config XLNetConfig {
  "_name_or_path": "xlnet-base-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {


=== xlnet-base-cased ===



loading configuration file https://huggingface.co/xlnet-base-cased/resolve/main/config.json from cache at /home/dev/.cache/huggingface/transformers/06bdb0f5882dbb833618c81c3b4c996a0c79422fa2c95ffea3827f92fc2dba6b.da982e2e596ec73828dbae86525a1870e513bd63aae5a2dc773ccc840ac5c346
Model config XLNetConfig {
  "_name_or_path": "xlnet-base-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {


architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"CitizenX(1995) is the developing world's answer to Silence of the Lambs. Where `Silence' terrorized our peace of mind, `Citizen' exhausts and saddens us instead. This dramatization of the Chikatilo case translates rather well, thanks to a Westernized friendship between two Rostov cops who become equals.<br /><br />CitizenX may also argue against(!) the death penalty far better than Kevin Spacey's The Life of David Gayle(2002).<br /><br />",1
1,"You have to respect this movie. It may be ""just a dumb kid's movie"" but it's the #1 most frequently requested film title in online movie forums, requested by people who remember the story but can't remember the title. Therefore what follows is a much-needed, detailed plot description, since I haven't been able to find such a description anywhere else on the Internet.<br /><br />A typical 2-story house is shown in suburbia. 7-year-old Bridget narrates about suspecting something is going on since she and her 11",0


In [None]:
#| echo: false
# Tabulate the outcome of each architecture/tokenizer check and render it.
# NOTE(review): `test_results` and `display_df` are defined outside this cell -- presumably
# `test_results` holds one 5-field record per pretrained model tested; confirm against the test loop.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,hf-internal-testing/tiny-albert,PASSED,
1,bart,BartTokenizerFast,hf-internal-testing/tiny-random-bart,PASSED,
2,bert,BertTokenizerFast,hf-internal-testing/tiny-bert,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,bigbird_pegasus,PegasusTokenizerFast,google/bigbird-pegasus-large-arxiv,PASSED,
5,ctrl,CTRLTokenizer,hf-internal-testing/tiny-random-ctrl,PASSED,
6,camembert,CamembertTokenizerFast,camembert-base,PASSED,
7,canine,CanineTokenizer,hf-internal-testing/tiny-random-canine,PASSED,
8,convbert,ConvBertTokenizerFast,YituTech/conv-bert-base,PASSED,
9,deberta,DebertaTokenizerFast,hf-internal-testing/tiny-deberta,PASSED,


## Export -

The `text.data.core` module contains the fundamental bits for all data preprocessing tasks

In [None]:
#|hide
# Run nbdev's export step from inside the notebook (cell is hidden from the rendered docs).
import nbdev

nbdev.nbdev_export()
