In [None]:
from functools import partial
import pandas as pd

from transformers import *
from datasets import *

In [None]:
# Hub checkpoint identifiers the author grouped under the "classification" task.
# The list is merged with the other per-task lists into `model_names`, and each
# entry is ultimately passed to `AutoTokenizer.from_pretrained`.
# Commented-out entries are checkpoints the author excluded; the trailing note
# records the reason given.
pretrained_model_names_classification = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-random-bart",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "google/bigbird-pegasus-large-arxiv",
    "hf-internal-testing/tiny-random-ctrl",
    "camembert-base",
    "hf-internal-testing/tiny-random-canine",
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "hf-internal-testing/tiny-random-deberta-v2",
    "hf-internal-testing/tiny-random-distilbert",
    "hf-internal-testing/tiny-electra",
    "google/fnet-base",
    "hf-internal-testing/tiny-random-flaubert",
    "hf-internal-testing/tiny-random-funnel",
    "hf-internal-testing/tiny-random-gpt2",
    "anton-l/gpt-j-tiny-random",
    "hf-internal-testing/tiny-random-gpt_neo",
    "kssteven/ibert-roberta-base",
    "hf-internal-testing/tiny-random-led",
    "hf-internal-testing/tiny-random-longformer",
    "hf-internal-testing/tiny-random-mbart",
    "hf-internal-testing/tiny-random-mpnet",
    # "nvidia/megatron-bert-cased-345m",                # could not test
    "hf-internal-testing/tiny-random-mobilebert",
    "openai-gpt",
    "google/reformer-crime-and-punishment",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    "hf-internal-testing/tiny-random-transfo-xl",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]

In [None]:
# Hub checkpoint identifiers the author grouped under the "token classification"
# task. The list is merged with the other per-task lists into `model_names`.
# NOTE(review): the `# word_ids` markers are the original author's notes;
# presumably they flag checkpoints whose tokenizer has a `word_ids` limitation
# -- TODO confirm.
# Commented-out entries are checkpoints the author excluded; the trailing note
# records the reason given.
pretrained_model_names_token_classification = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "camembert-base",
    "google/canine-s",  # word_ids
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "microsoft/deberta-v2-xlarge",  # word_ids
    "sshleifer/tiny-distilbert-base-cased",
    "hf-internal-testing/tiny-electra",
    # "google/fnet-base",                               # forward() got an unexpected keyword argument 'output_attentions'
    "flaubert/flaubert_small_cased",  # word_ids
    "huggingface/funnel-small-base",
    "sshleifer/tiny-gpt2",
    "hf-internal-testing/tiny-layoutlm",
    "allenai/longformer-base-4096",
    "microsoft/mpnet-base",
    "kssteven/ibert-roberta-base",
    # "nvidia/megatron-bert-cased-345m",                # could not test
    "google/mobilebert-uncased",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    "xlm-mlm-en-2048",  # word_ids
    "xlm-roberta-base",
    "xlnet-base-cased",
]

In [None]:
# Hub checkpoint identifiers the author grouped under the "summarization" task.
# The list is merged with the other per-task lists into `model_names`.
pretrained_model_names_summarization = [
    "facebook/bart-base",
    "facebook/blenderbot_small-90M",
    "allenai/led-base-16384",
    "google/mt5-small",
    "google/pegasus-cnn_dailymail",
    "t5-small",
    "microsoft/prophetnet-large-uncased",
    "microsoft/xprophetnet-large-wiki100-cased",  # XLMProphetNet
]

# Hub checkpoint identifiers the author grouped under the "translation" task.
# The list is merged with the other per-task lists into `model_names`; entries
# that also appear in another list are de-duplicated at that point.
pretrained_model_names_translation = [
    "facebook/bart-base",
    "facebook/wmt19-de-en",  # FSMT
    "Helsinki-NLP/opus-mt-de-en",  # MarianMT
    "sshleifer/tiny-mbart",
    "google/mt5-small",
    "t5-small",
]

In [None]:
# Union of every per-task checkpoint list: duplicates collapse in the set, and
# `sorted` returns the unique names as an alphabetically ordered list.
model_names = sorted(
    {
        *pretrained_model_names_classification,
        *pretrained_model_names_summarization,
        *pretrained_model_names_token_classification,
        *pretrained_model_names_translation,
    }
)

In [None]:
# model_names

In [None]:
# Load the IMDB "train" and "test" splits; the result is indexed and assigned
# to below like a two-element list (0 = train, 1 = test).
raw_datasets = load_dataset("imdb", split=["train", "test"])

# Record which split each row came from before merging: rows from the train
# split get is_valid=False, rows from the test split get is_valid=True.
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Merge both splits into a single dataset and give every row a positional id.
final_ds = concatenate_datasets([raw_datasets[0], raw_datasets[1]])
# `add_column` is documented to take a list or NumPy array, so materialize the
# range instead of passing the lazy `range` object.
final_ds = final_ds.add_column("example_id", list(range(len(final_ds))))

# Only the last expression of a cell is rendered, so a bare `len(final_ds)`
# in the middle of the cell would be silently discarded; print it instead.
print(len(final_ds))
final_ds[1]

Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


{'label': 1,
 'text': 'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets 

In [None]:
def preprocess_data(examples, tokenizer):
    """Tokenize a batch of texts and report per-example size statistics.

    Args:
        examples: Mapping with a "text" entry holding the batch of strings.
        tokenizer: Callable invoked on the batch of texts with truncation and
            padding disabled and `return_length=True`.

    Returns:
        A dict restricted to the keys "length", "n_chars" and "text" that are
        present in the tokenizer output after "n_chars" (the character count
        of each text) has been added to it.
    """
    encoding = tokenizer(
        examples["text"],
        truncation=False,
        padding=False,
        return_length=True,
        verbose=False,
    )

    # One character count per encoded example, aligned with "input_ids".
    encoding["n_chars"] = [
        len(examples["text"][idx]) for idx, _ in enumerate(encoding["input_ids"])
    ]

    wanted_keys = ("length", "n_chars", "text")
    return {key: value for key, value in encoding.items() if key in wanted_keys}

In [None]:
# Tokenize the whole dataset once per *distinct* tokenizer class and gather the
# per-example statistics produced by `preprocess_data` into one long DataFrame.
seen_tokenizers = set()  # tokenizer class names that have already been processed
tokenizer_frames = []  # one DataFrame per distinct tokenizer class

for model_name in model_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Several checkpoints resolve to the same tokenizer class; process each
    # class only once. A set lookup replaces re-scanning the ever-growing frame
    # on every iteration, and removes the `len(full_df) > 1` guard, which was
    # off by one for a single-row frame.
    tokenizer_name = tokenizer.__class__.__name__
    if tokenizer_name in seen_tokenizers:
        continue
    seen_tokenizers.add(tokenizer_name)

    tokenized_datasets = final_ds.map(
        partial(preprocess_data, tokenizer=tokenizer), batched=True
    )

    df = pd.DataFrame(tokenized_datasets)
    df["tokenizer"] = tokenizer_name
    df["is_fast"] = tokenizer.is_fast
    tokenizer_frames.append(df)

# Concatenate once at the end: growing a DataFrame with `pd.concat` inside the
# loop re-copies all accumulated rows on every iteration. `pd.concat` raises on
# an empty list, so fall back to an empty frame when nothing was processed.
# Row indices are kept per-tokenizer (not reset), as before.
full_df = pd.concat(tokenizer_frames) if tokenizer_frames else pd.DataFrame()

  0%|          | 0/50 [00:00<?, ?ba/s]

thread '<unnamed>' panicked at 'no entry found for key', /tmp/pip-req-build-no1wrsbk/tokenizers-lib/src/models/mod.rs:36:66
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


  0%|          | 0/50 [00:00<?, ?ba/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/493 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/247k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/494 [00:00<?, ?B/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [None]:
# Preview the first rows of the combined per-tokenizer statistics frame.
full_df.head()

Unnamed: 0,example_id,is_valid,label,length,n_chars,text,tokenizer,is_fast
0,0,False,1,290,806,Bromwell High is a cartoon comedy. It ran at t...,MarianTokenizer,False
1,1,False,1,914,2366,Homelessness (or Houselessness as George Carli...,MarianTokenizer,False
2,2,False,1,318,841,Brilliant over-acting by Lesley Ann Warren. Be...,MarianTokenizer,False
3,3,False,1,244,663,This is easily the most underrated film inn th...,MarianTokenizer,False
4,4,False,1,244,647,This is not the typical Mel Brooks film. It wa...,MarianTokenizer,False


In [None]:
# Distinct tokenizer class names recorded in `full_df`, in order of first appearance.
full_df["tokenizer"].unique().tolist()

['MarianTokenizer',
 'ConvBertTokenizerFast',
 'LEDTokenizerFast',
 'LongformerTokenizerFast',
 'GPT2TokenizerFast',
 'CamembertTokenizerFast',
 'BartTokenizerFast',
 'BlenderbotSmallTokenizer',
 'FSMTTokenizer',
 'FlaubertTokenizer',
 'PegasusTokenizerFast',
 'BigBirdTokenizerFast',
 'CanineTokenizer',
 'FNetTokenizerFast',
 'MobileBertTokenizerFast',
 'T5TokenizerFast',
 'ReformerTokenizerFast',
 'RemBertTokenizerFast',
 'AlbertTokenizerFast',
 'BertTokenizerFast',
 'DebertaTokenizerFast',
 'ElectraTokenizerFast',
 'LayoutLMTokenizerFast',
 'CTRLTokenizer',
 'DebertaV2Tokenizer',
 'DistilBertTokenizerFast',
 'FunnelTokenizerFast',
 'MBartTokenizerFast',
 'MPNetTokenizerFast',
 'TransfoXLTokenizer',
 'RoFormerTokenizerFast',
 'RobertaTokenizerFast',
 'ProphetNetTokenizer',
 'XLMProphetNetTokenizer',
 'OpenAIGPTTokenizerFast',
 'SqueezeBertTokenizerFast',
 'XLMTokenizer',
 'XLMRobertaTokenizerFast',
 'XLNetTokenizerFast']