In [None]:
%pip install -q transformers pandas

In [29]:
import pandas as pd
from transformers import AutoTokenizer

# Checkpoints whose tokenizers are loaded and compared in the table below.
model_names = [
    'bert-base-cased',
    'distilbert-base-uncased',  # lighter version of BERT for fast processing
    'microsoft/deberta-v3-small',  # improved over BERT

    'FacebookAI/roberta-base',
    'facebook/bart-base',
    'xlm-roberta-base',
    'allenai/longformer-base-4096',
    'facebook/bart-large',  # summarization tasks
    'roberta-base',  # robustly optimized BERT
    'Intel/neural-chat-7b-v3-3',  # Trained on Gaudi 2, DPO

    'gpt2',
    'google/flan-t5-base',
    't5-small',  # T5 model family, optimized for both translation and summarization
    'google/pegasus-xsum',
]

# Table column label -> tokenizer attribute that fills it.
# Insertion order here is the column order of the resulting DataFrame.
column_attributes = {
    'Model': 'name_or_path',
    # NOTE: this can be a huge placeholder integer rather than a real limit
    # (see the microsoft/deberta-v3-small row in the printed output).
    'Max Length': 'model_max_length',
    'Special Tokens': 'all_special_tokens',
    'Unknown Token': 'unk_token',
    'Beginning of Sequence Token': 'bos_token',
    'End of Sequence Token': 'eos_token',
    'Mask Token': 'mask_token',
    'Separator Token': 'sep_token',
    'Class Token': 'cls_token',
}

# Print the frame on a single wide line per row instead of wrapping columns.
pd.set_option('display.expand_frame_repr', False)

# Load each tokenizer in turn and record one row of properties for it.
data = []
for model_name in model_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    row = {}
    for label, attribute in column_attributes.items():
        row[label] = getattr(tokenizer, attribute)
    data.append(row)

df = pd.DataFrame(data)
print(df)



                           Model                       Max Length                                     Special Tokens  Unknown Token Beginning of Sequence Token End of Sequence Token Mask Token Separator Token Class Token
0                bert-base-cased                              512               [[UNK], [SEP], [PAD], [CLS], [MASK]]          [UNK]                        None                  None     [MASK]           [SEP]       [CLS]
1        distilbert-base-uncased                              512               [[UNK], [SEP], [PAD], [CLS], [MASK]]          [UNK]                        None                  None     [MASK]           [SEP]       [CLS]
2     microsoft/deberta-v3-small  1000000000000000019884624838656               [[CLS], [SEP], [UNK], [PAD], [MASK]]          [UNK]                       [CLS]                 [SEP]     [MASK]           [SEP]       [CLS]
3        FacebookAI/roberta-base                              512                  [<s>, </s>, <unk>, <pad>, <mask>]