In [3]:
import os, math, json, random
import torch
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from transformers import BertConfig, BertForSequenceClassification, AutoModel

# Seed Python's `random` module and PyTorch with the same fixed value.
SEED = 42
random.seed(SEED)
# Left as the last expression, so the cell displays the returned torch Generator.
torch.manual_seed(SEED)

<torch._C.Generator at 0x10ab9cf90>

# 1. AutoTokenizer basics (padding/truncation/stride/offsets/sentence pair)

## 1.1 padding/truncation/offsets/tensors

In [4]:
# Load the fast (Rust-backed) tokenizer for the checkpoint
name = "bert-base-uncased"
tok = AutoTokenizer.from_pretrained(name, use_fast=True)

text = "Hugging Face tokenziers make subwords easy & fast!"

# Encoding options, spelled out once and unpacked into the tokenizer call
encode_opts = {
    "padding": "max_length",         # pad up to max_length
    "truncation": True,              # cut inputs longer than max_length
    "max_length": 16,
    "return_offsets_mapping": True,  # also return per-token (start, end) offsets
    "return_tensors": "pt",          # PyTorch tensors as output
}
batch = tok(text, **encode_opts)     # input may be a string or a list of strings

print("\n[1.1] keys:", batch.keys())
# Shapes: ids and mask are [B=1, L]; offsets are [1, L, 2]
for label, field in (
    ("input_ids", "input_ids"),
    ("attention_mask", "attention_mask"),
    ("offsets", "offset_mapping"),
):
    print(f"[1.1] {label} shape:", batch[field].shape)
first_row_ids = batch["input_ids"][0]
print("[1.1] tokens:", tok.convert_ids_to_tokens(first_row_ids))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


[1.1] keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])
[1.1] input_ids shape: torch.Size([1, 16])
[1.1] attention_mask shape: torch.Size([1, 16])
[1.1] offsets shape: torch.Size([1, 16, 2])
[1.1] tokens: ['[CLS]', 'hugging', 'face', 'token', '##zier', '##s', 'make', 'sub', '##words', 'easy', '&', 'fast', '!', '[SEP]', '[PAD]', '[PAD]']


In [7]:
# Show which fields the tokenizer call returned.
batch.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [8]:
# Token ids for the single input, padded out to max_length=16.
batch["input_ids"]

tensor([[  101, 17662,  2227, 19204, 21548,  2015,  2191,  4942, 22104,  3733,
          1004,  3435,   999,   102,     0,     0]])

In [10]:
# Map the ids of the first (and only) row back to their token strings.
tok.convert_ids_to_tokens(batch["input_ids"][0])

['[CLS]',
 'hugging',
 'face',
 'token',
 '##zier',
 '##s',
 'make',
 'sub',
 '##words',
 'easy',
 '&',
 'fast',
 '!',
 '[SEP]',
 '[PAD]',
 '[PAD]']

In [11]:
# Segment ids; every position is 0 here because only one sentence was encoded.
batch["token_type_ids"]

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [12]:
# 1 marks a real token, 0 marks a [PAD] position.
batch["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])

In [13]:
# (start, end) character span per token; [CLS], [SEP] and [PAD] show up as (0, 0).
batch["offset_mapping"]

tensor([[[ 0,  0],
         [ 0,  7],
         [ 8, 12],
         [13, 18],
         [18, 22],
         [22, 23],
         [24, 28],
         [29, 32],
         [32, 37],
         [38, 42],
         [43, 44],
         [45, 49],
         [49, 50],
         [ 0,  0],
         [ 0,  0],
         [ 0,  0]]])

In [5]:
# Visualize which slice of the original text one token covers
i = 3
span = batch["offset_mapping"][0][i]   # (start, end) pair for token i of the first row
start, end = span.tolist()
covered_text = text[start:end]
print(f"[1.1] token[{i}] spans original text[{start}:{end}] ->", repr(covered_text))

[1.1] token[3] spans original text[13:18] -> 'token'


## 1.2 sentence pair: token_type_ids (segment ids, BERT)

In [6]:
premise = "A cat sits on the mat."
hypo = "An animal is resting indoors."

# Encode both sentences together as one sentence-pair input
pair = tok(
    text=premise,
    text_pair=hypo,
    padding="longest",
    truncation=True,
    return_token_type_ids=True,   # segment ids tell the two sentences apart (BERT)
    return_tensors="pt",
)

print("\n[1.2] pair keys:", pair.keys())
# token_type_ids: 0 = sentence A, 1 = sentence B
for field in ("input_ids", "token_type_ids"):
    print(f"[1.2] {field}:", pair[field])
pair_ids = pair["input_ids"][0]
print("[1.2] tokens:", tok.convert_ids_to_tokens(pair_ids))


[1.2] pair keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[1.2] input_ids: tensor([[  101,  1037,  4937,  7719,  2006,  1996, 13523,  1012,   102,  2019,
          4111,  2003,  8345, 24274,  1012,   102]])
[1.2] token_type_ids: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])
[1.2] tokens: ['[CLS]', 'a', 'cat', 'sits', 'on', 'the', 'mat', '.', '[SEP]', 'an', 'animal', 'is', 'resting', 'indoors', '.', '[SEP]']


In [14]:
# Token ids of the encoded (premise, hypo) pair.
pair["input_ids"]

tensor([[  101,  1037,  4937,  7719,  2006,  1996, 13523,  1012,   102,  2019,
          4111,  2003,  8345, 24274,  1012,   102]])

In [15]:
# Token strings for the pair: [CLS] premise [SEP] hypo [SEP].
tok.convert_ids_to_tokens(pair["input_ids"][0])

['[CLS]',
 'a',
 'cat',
 'sits',
 'on',
 'the',
 'mat',
 '.',
 '[SEP]',
 'an',
 'animal',
 'is',
 'resting',
 'indoors',
 '.',
 '[SEP]']

In [16]:
# Segment ids: 0 for sentence A (including [CLS] and the first [SEP]), 1 for sentence B.
pair["token_type_ids"]

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])

In [17]:
# All ones: with a single pair and padding="longest" nothing had to be padded.
pair["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

## 1.3 long text sliding window: stride + overflow 

In [18]:
# Repeat one short sentence 20 times, separated by single spaces, to get a long input
sentence = "Transformers are powerful."
long_text = " ".join(sentence for _ in range(20))
long_text

'Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful. Transformers are powerful.'

In [20]:
# Split the long text into fixed-size, overlapping windows
window_opts = dict(
    truncation=True,
    padding=True,
    max_length=16,
    stride=4,                         # overlap 4 tokens between windows
    return_overflowing_tokens=True,   # return multiple slices
    return_offsets_mapping=True,
    return_tensors="pt",
)
pack = tok(long_text, **window_opts)

window_ids = pack["input_ids"]        # [N_slices, L]
print("\n[1.3] num_overflowing_slices:", len(window_ids))
print("[1.3] shapes: ids", window_ids.shape, "offsets", pack["offset_mapping"].shape)
# overflow_to_sample_mapping: which input sample each slice came from
print("[1.3] overflow_to_sample_mapping:", pack["overflow_to_sample_mapping"])


[1.3] num_overflowing_slices: 8
[1.3] shapes: ids torch.Size([8, 16]) offsets torch.Size([8, 16, 2])
[1.3] overflow_to_sample_mapping: tensor([0, 0, 0, 0, 0, 0, 0, 0])


In [21]:
# Fields returned for the windowed encoding, including overflow_to_sample_mapping.
pack.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [22]:
# One row of token ids per window; each row starts with [CLS] (id 101).
pack["input_ids"]

tensor([[  101, 19081,  2024,  3928,  1012, 19081,  2024,  3928,  1012, 19081,
          2024,  3928,  1012, 19081,  2024,   102],
        [  101,  3928,  1012, 19081,  2024,  3928,  1012, 19081,  2024,  3928,
          1012, 19081,  2024,  3928,  1012,   102],
        [  101, 19081,  2024,  3928,  1012, 19081,  2024,  3928,  1012, 19081,
          2024,  3928,  1012, 19081,  2024,   102],
        [  101,  3928,  1012, 19081,  2024,  3928,  1012, 19081,  2024,  3928,
          1012, 19081,  2024,  3928,  1012,   102],
        [  101, 19081,  2024,  3928,  1012, 19081,  2024,  3928,  1012, 19081,
          2024,  3928,  1012, 19081,  2024,   102],
        [  101,  3928,  1012, 19081,  2024,  3928,  1012, 19081,  2024,  3928,
          1012, 19081,  2024,  3928,  1012,   102],
        [  101, 19081,  2024,  3928,  1012, 19081,  2024,  3928,  1012, 19081,
          2024,  3928,  1012, 19081,  2024,   102],
        [  101,  3928,  1012, 19081,  2024,  3928,  1012, 19081,  2024,  3928,
    

In [24]:
# Token strings for every window, one inner list per row of input_ids
[tok.convert_ids_to_tokens(row_ids) for row_ids in pack["input_ids"]]

[['[CLS]',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  '[SEP]'],
 ['[CLS]',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  '[SEP]'],
 ['[CLS]',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  '[SEP]'],
 ['[CLS]',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  '[SEP]'],
 ['[CLS]',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  'powerful',
  '.',
  'transformers',
  'are',
  '[SEP]'],
 ['[CLS]',
  'powerful',
  '.',
  'transfo

In [25]:
# Segment ids per window; all zeros because a single text (no sentence pair) was encoded.
pack['token_type_ids']

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [26]:
# Only the last window is shorter than max_length, so only it has trailing zeros (padding).
pack['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])

In [27]:
# Offsets are character positions in long_text, so consecutive windows repeat
# the spans of the tokens they share through the stride overlap.
pack['offset_mapping']

tensor([[[  0,   0],
         [  0,  12],
         [ 13,  16],
         [ 17,  25],
         [ 25,  26],
         [ 27,  39],
         [ 40,  43],
         [ 44,  52],
         [ 52,  53],
         [ 54,  66],
         [ 67,  70],
         [ 71,  79],
         [ 79,  80],
         [ 81,  93],
         [ 94,  97],
         [  0,   0]],

        [[  0,   0],
         [ 71,  79],
         [ 79,  80],
         [ 81,  93],
         [ 94,  97],
         [ 98, 106],
         [106, 107],
         [108, 120],
         [121, 124],
         [125, 133],
         [133, 134],
         [135, 147],
         [148, 151],
         [152, 160],
         [160, 161],
         [  0,   0]],

        [[  0,   0],
         [135, 147],
         [148, 151],
         [152, 160],
         [160, 161],
         [162, 174],
         [175, 178],
         [179, 187],
         [187, 188],
         [189, 201],
         [202, 205],
         [206, 214],
         [214, 215],
         [216, 228],
         [229, 232],
         

In [28]:
# Index of the source sample for each window; all 0 because one text was encoded.
pack['overflow_to_sample_mapping']

tensor([0, 0, 0, 0, 0, 0, 0, 0])

# 2. Train a custom BPE tokenizer and use it as a fast tokenizer

In [29]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer
from tokenizers.normalizers import Sequence as NormSeq, NFKC, Lowercase, Strip
from tokenizers.processors import TemplateProcessing
from tokenizers.decoders import BPEDecoder, WordPiece as WordPieceDecoder

In [31]:
# Tiny in-memory corpus the tokenizer is trained on
corpus = [
    "Transformers encode and decode sequences.",
    "Tokenizers split text into subwords efficiently.",
    "We can train BPE or WordPiece or Unigram models.",
    "Hugging Face provides fast tokenization.",
    "Subword methods handle out-of-vocabulary words."
]

# Subword model. Alternatives (pair them with the matching trainer below):
#   model = WordPiece(unk_token="[UNK]")
#   model = Unigram()
model = BPE(unk_token="[UNK]")
tok_bpe = Tokenizer(model)

# Text pipeline in front of the model: Whitespace pre-tokenizer, and a
# normalizer chain of NFKC -> Lowercase -> Strip
tok_bpe.pre_tokenizer = Whitespace()
tok_bpe.normalizer = NormSeq([NFKC(), Lowercase(), Strip()])

# Special tokens and trainer. Alternatives for the other models:
#   trainer = WordPieceTrainer(vocab_size=2000, special_tokens=specials)
#   trainer = UnigramTrainer(vocab_size=2000, special_tokens=specials)
specials = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer_opts = {
    "vocab_size": 2000,
    "min_frequency": 1,
    "special_tokens": specials,
}
trainer = BpeTrainer(**trainer_opts)

# Train; this has to happen before token_to_id is used below
tok_bpe.train_from_iterator(corpus, trainer=trainer)

# Post-processor (BERT style): [CLS] A [SEP] (optional B) [SEP]
template_specials = [
    (marker, tok_bpe.token_to_id(marker)) for marker in ("[CLS]", "[SEP]")
]
tok_bpe.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B [SEP]",
    special_tokens=template_specials,
)

# Decoder
# NOTE(review): BPEDecoder is built with its defaults while neither BPE nor
# BpeTrainer above configures an end-of-word suffix -- confirm that decoded
# text keeps its word boundaries before relying on decode().
tok_bpe.decoder = BPEDecoder()

# Save to disk (this cell only saves; nothing is loaded back here)
save_dir = "day040_bpe_tok"
os.makedirs(save_dir, exist_ok=True)
tokenizer_json_path = os.path.join(save_dir, "tokenizer.json")
tok_bpe.save(tokenizer_json_path)

# Wrap the trained tokenizer so it can be used through the transformers API
role_tokens = dict(
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
fast = PreTrainedTokenizerFast(tokenizer_object=tok_bpe, **role_tokens)

# Try it on a small batch
texts = ["Tokenizers are great!", "We train a tiny BPE tokenizer today."]
batch = fast(
    texts,
    padding="longest",
    truncation=True,
    max_length=24,
    return_special_tokens_mask=True,
    return_offsets_mapping=True,
    return_tensors="pt",
)
print("\n[2] custom BPE batch keys:", batch.keys())
bpe_ids = batch["input_ids"]   # [B, L]
print("[2] input_ids shape:", bpe_ids.shape)
print("[2] special_tokens_mask:", batch["special_tokens_mask"])
print("[2] tokens[0]:", fast.convert_ids_to_tokens(bpe_ids[0]))





[2] custom BPE batch keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'offset_mapping'])
[2] input_ids shape: torch.Size([2, 16])
[2] special_tokens_mask: tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
[2] tokens[0]: ['[CLS]', 'tokenizers', 'ar', 'e', 'g', 'r', 'e', 'at', '[UNK]', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


# 3. Aligning tokenizer with a model (vocab_size, special token IDs, resize embeddings)

## 3.1 Method 1: match vocab size

In [34]:
# Take the vocabulary size from the custom tokenizer so the model config matches it
vocab_size = fast.backend_tokenizer.get_vocab_size()
cfg = BertConfig(
    pad_token_id=fast.pad_token_id,
    vocab_size=vocab_size,
    hidden_size=128,
    intermediate_size=256,
    num_hidden_layers=2,
    num_attention_heads=4,
)

model_fresh = BertForSequenceClassification(cfg)

# Run one forward pass as a smoke test
check_texts = ["just a quick check.", "and another sentence."]
inputs = fast(
    check_texts,
    padding=True,
    truncation=True,
    max_length=16,
    return_tensors="pt",
)
out = model_fresh(**inputs)   # ok: embedding size is aligned with the vocab
logits_shape = out.logits.shape
print("\n[C1] fresh model forward ok; logits shape:", logits_shape)


[C1] fresh model forward ok; logits shape: torch.Size([2, 2])


## 3.2 Method 2: load a pretrained model, then expand the vocab (requires download)

In [35]:
# Load the pretrained tokenizer and model. Only this loading step is
# best-effort: if it fails, the demo below is skipped with a message.
# The try block is deliberately limited to the two from_pretrained calls so
# that a failure in the resize / forward steps is NOT misreported as
# "cannot load pretrained model".
try:
    base_tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    base_model = AutoModel.from_pretrained("bert-base-uncased")
except Exception as e:
    print("[3.2] (skipped) cannot load pretrained model:", repr(e))
else:
    # Add a new token (example); add_tokens returns the number of newly added tokens
    num_added = base_tok.add_tokens(["[NEW_TAG]"])
    # Adjust the embedding size to the tokenizer's new length
    base_model.resize_token_embeddings(len(base_tok))
    # Do one more forward pass, this time with the new token in the input
    tmp = base_tok("hello [NEW_TAG] world", return_tensors="pt")
    _ = base_model(**tmp)
    print("[3.2] pretrained + resize embeddings ok; vocab size:", len(base_tok))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 