In [1]:
from datasets import load_dataset

# Load the TinyStories dataset by its repository id.
DATASET_ID = 'roneneldan/TinyStories'
dataset = load_dataset(DATASET_ID)

In [2]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Sequence
from transformers import PreTrainedTokenizerFast
from tqdm import tqdm

def get_all_texts(split, column="text"):
    """Return the full contents of one column of a dataset split.

    Args:
        split: Name of the split to read from the module-level ``dataset``
            (this notebook uses ``'train'`` and ``'validation'``).
        column: Name of the field that holds the raw text. Defaults to
            ``"text"``; pass another name when the dataset stores its text
            under a different field.

    Returns:
        Whatever ``dataset[split][column]`` evaluates to, i.e. the whole
        column for that split.
    """
    return dataset[split][column]

# Fetch the texts of the train and validation splits, in that order.
train_texts, validation_texts = (
    get_all_texts(split_name) for split_name in ("train", "validation")
)

# Cell output: number of texts in each split.
(len(train_texts), len(validation_texts))

(2119719, 21990)

In [3]:
def iter_with_progress():
    """Yield every item of the module-level ``train_texts`` in order.

    The items are routed through ``tqdm`` so a progress bar labelled
    "Training tokenizer" advances as the consumer pulls them.
    """
    yield from tqdm(train_texts, desc="Training tokenizer")


# Vocabulary sizes to compare; 50257 is the default vocab size for GPT-2.
vocab_sizes = [1000, 2000, 4000, 8000, 16000, 32000, 50257]

# Special tokens, declared once so the trainer, the BPE model and the
# PreTrainedTokenizerFast wrapper all agree on the same strings.
PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN = "[PAD]", "[UNK]", "<s>", "</s>"

# Probe sentences tokenized after each training run. They do not depend on
# the vocabulary size, so the list is built once instead of on every pass.
sentences = [
    "Once upon a time, there was a cat.",
    "The cat loved to chase butterflies.",
    "One day, it caught a butterfly and became friends with it.",
    "huggingface transformers is a great library for NLP tasks.",
    "The ephemeral zephyr whispered through the desolate moor at twilight.",
]

for vocab_size in vocab_sizes:
    print(f"Training tokenizer with vocab size: {vocab_size}")

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,  # target vocabulary size for this run
        special_tokens=[PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN],
    )

    # Tell the BPE model which token is its unknown token. The trainer only
    # reserves "[UNK]" in the vocabulary; without `unk_token` the model has
    # nothing to emit for characters that are missing from the vocabulary.
    tokenizer = Tokenizer(models.BPE(unk_token=UNK_TOKEN))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    # Normalization: Unicode NFD, then lowercase, then strip accents.
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.train_from_iterator(iter_with_progress(), trainer=trainer)

    # Persist the raw tokenizer; the size label is the vocab size in whole
    # thousands (integer division, so 50257 becomes "50K").
    filename = f"tinystories_bpe_tokenizer_{vocab_size//1000}K.json"
    tokenizer.save(filename)

    # Reload the saved file through the transformers wrapper and save it in
    # the `save_pretrained` directory layout, one directory per size.
    hf_tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=filename,
        unk_token=UNK_TOKEN,
        pad_token=PAD_TOKEN,
        bos_token=BOS_TOKEN,
        eos_token=EOS_TOKEN,
    )
    hf_tokenizer.save_pretrained(f"tinystories_bpe_tokenizer/{vocab_size//1000}K")

    # Show how this vocabulary size splits each probe sentence.
    for sentence in sentences:
        encoded = tokenizer.encode(sentence)
        print(f"Tokens for '{sentence}':", encoded.tokens)

Training tokenizer with vocab size: 1000


Training tokenizer: 100%|██████████| 2119719/2119719 [00:48<00:00, 44119.40it/s]





Tokens for 'Once upon a time, there was a cat.': ['once', 'upon', 'a', 'time', ',', 'there', 'was', 'a', 'cat', '.']
Tokens for 'The cat loved to chase butterflies.': ['the', 'cat', 'loved', 'to', 'ch', 'ase', 'butter', 'f', 'li', 'es', '.']
Tokens for 'One day, it caught a butterfly and became friends with it.': ['one', 'day', ',', 'it', 'ca', 'ught', 'a', 'butter', 'fly', 'and', 'became', 'friends', 'with', 'it', '.']
Tokens for 'huggingface transformers is a great library for NLP tasks.': ['hug', 'g', 'ing', 'face', 't', 'ran', 's', 'for', 'm', 'ers', 'is', 'a', 'great', 'li', 'br', 'ar', 'y', 'for', 'n', 'l', 'p', 't', 'ask', 's', '.']
Tokens for 'The ephemeral zephyr whispered through the desolate moor at twilight.': ['the', 'ep', 'he', 'm', 'er', 'al', 'z', 'ep', 'h', 'y', 'r', 'w', 'his', 'per', 'ed', 'through', 'the', 'de', 'so', 'l', 'ate', 'mo', 'or', 'at', 't', 'w', 'il', 'ight', '.']
Training tokenizer with vocab size: 2000


Training tokenizer: 100%|██████████| 2119719/2119719 [00:47<00:00, 44643.08it/s]





Tokens for 'Once upon a time, there was a cat.': ['once', 'upon', 'a', 'time', ',', 'there', 'was', 'a', 'cat', '.']
Tokens for 'The cat loved to chase butterflies.': ['the', 'cat', 'loved', 'to', 'ch', 'ase', 'butter', 'fli', 'es', '.']
Tokens for 'One day, it caught a butterfly and became friends with it.': ['one', 'day', ',', 'it', 'caught', 'a', 'butterfly', 'and', 'became', 'friends', 'with', 'it', '.']
Tokens for 'huggingface transformers is a great library for NLP tasks.': ['hug', 'g', 'ing', 'face', 't', 'ran', 's', 'for', 'm', 'ers', 'is', 'a', 'great', 'li', 'br', 'ary', 'for', 'n', 'l', 'p', 't', 'asks', '.']
Tokens for 'The ephemeral zephyr whispered through the desolate moor at twilight.': ['the', 'ep', 'he', 'm', 'er', 'al', 'z', 'ep', 'h', 'y', 'r', 'whis', 'per', 'ed', 'through', 'the', 'de', 'so', 'late', 'mo', 'or', 'at', 'tw', 'il', 'ight', '.']
Training tokenizer with vocab size: 4000


Training tokenizer: 100%|██████████| 2119719/2119719 [00:50<00:00, 42195.96it/s]





Tokens for 'Once upon a time, there was a cat.': ['once', 'upon', 'a', 'time', ',', 'there', 'was', 'a', 'cat', '.']
Tokens for 'The cat loved to chase butterflies.': ['the', 'cat', 'loved', 'to', 'chase', 'butterflies', '.']
Tokens for 'One day, it caught a butterfly and became friends with it.': ['one', 'day', ',', 'it', 'caught', 'a', 'butterfly', 'and', 'became', 'friends', 'with', 'it', '.']
Tokens for 'huggingface transformers is a great library for NLP tasks.': ['hug', 'ging', 'face', 't', 'ran', 's', 'form', 'ers', 'is', 'a', 'great', 'library', 'for', 'n', 'l', 'p', 't', 'asks', '.']
Tokens for 'The ephemeral zephyr whispered through the desolate moor at twilight.': ['the', 'ep', 'he', 'mer', 'al', 'z', 'ep', 'h', 'y', 'r', 'whispered', 'through', 'the', 'de', 'so', 'late', 'mo', 'or', 'at', 'tw', 'il', 'ight', '.']
Training tokenizer with vocab size: 8000


Training tokenizer: 100%|██████████| 2119719/2119719 [00:56<00:00, 37723.07it/s]





Tokens for 'Once upon a time, there was a cat.': ['once', 'upon', 'a', 'time', ',', 'there', 'was', 'a', 'cat', '.']
Tokens for 'The cat loved to chase butterflies.': ['the', 'cat', 'loved', 'to', 'chase', 'butterflies', '.']
Tokens for 'One day, it caught a butterfly and became friends with it.': ['one', 'day', ',', 'it', 'caught', 'a', 'butterfly', 'and', 'became', 'friends', 'with', 'it', '.']
Tokens for 'huggingface transformers is a great library for NLP tasks.': ['hugging', 'face', 'trans', 'form', 'ers', 'is', 'a', 'great', 'library', 'for', 'n', 'l', 'p', 'tasks', '.']
Tokens for 'The ephemeral zephyr whispered through the desolate moor at twilight.': ['the', 'ep', 'he', 'mer', 'al', 'z', 'ep', 'h', 'y', 'r', 'whispered', 'through', 'the', 'de', 'so', 'late', 'mo', 'or', 'at', 'tw', 'il', 'ight', '.']
Training tokenizer with vocab size: 16000


Training tokenizer: 100%|██████████| 2119719/2119719 [00:44<00:00, 47379.25it/s]





Tokens for 'Once upon a time, there was a cat.': ['once', 'upon', 'a', 'time', ',', 'there', 'was', 'a', 'cat', '.']
Tokens for 'The cat loved to chase butterflies.': ['the', 'cat', 'loved', 'to', 'chase', 'butterflies', '.']
Tokens for 'One day, it caught a butterfly and became friends with it.': ['one', 'day', ',', 'it', 'caught', 'a', 'butterfly', 'and', 'became', 'friends', 'with', 'it', '.']
Tokens for 'huggingface transformers is a great library for NLP tasks.': ['hugging', 'face', 'transform', 'ers', 'is', 'a', 'great', 'library', 'for', 'n', 'l', 'p', 'tasks', '.']
Tokens for 'The ephemeral zephyr whispered through the desolate moor at twilight.': ['the', 'ep', 'he', 'mer', 'al', 'z', 'ep', 'hy', 'r', 'whispered', 'through', 'the', 'de', 'so', 'late', 'mo', 'or', 'at', 'twilight', '.']
Training tokenizer with vocab size: 32000


Training tokenizer: 100%|██████████| 2119719/2119719 [01:02<00:00, 33744.54it/s]





Tokens for 'Once upon a time, there was a cat.': ['once', 'upon', 'a', 'time', ',', 'there', 'was', 'a', 'cat', '.']
Tokens for 'The cat loved to chase butterflies.': ['the', 'cat', 'loved', 'to', 'chase', 'butterflies', '.']
Tokens for 'One day, it caught a butterfly and became friends with it.': ['one', 'day', ',', 'it', 'caught', 'a', 'butterfly', 'and', 'became', 'friends', 'with', 'it', '.']
Tokens for 'huggingface transformers is a great library for NLP tasks.': ['hugging', 'face', 'transform', 'ers', 'is', 'a', 'great', 'library', 'for', 'n', 'l', 'p', 'tasks', '.']
Tokens for 'The ephemeral zephyr whispered through the desolate moor at twilight.': ['the', 'ep', 'he', 'mer', 'al', 'z', 'ep', 'hy', 'r', 'whispered', 'through', 'the', 'desolate', 'mo', 'or', 'at', 'twilight', '.']
Training tokenizer with vocab size: 50257


Training tokenizer: 100%|██████████| 2119719/2119719 [00:51<00:00, 41517.29it/s]





Tokens for 'Once upon a time, there was a cat.': ['once', 'upon', 'a', 'time', ',', 'there', 'was', 'a', 'cat', '.']
Tokens for 'The cat loved to chase butterflies.': ['the', 'cat', 'loved', 'to', 'chase', 'butterflies', '.']
Tokens for 'One day, it caught a butterfly and became friends with it.': ['one', 'day', ',', 'it', 'caught', 'a', 'butterfly', 'and', 'became', 'friends', 'with', 'it', '.']
Tokens for 'huggingface transformers is a great library for NLP tasks.': ['hugging', 'face', 'transform', 'ers', 'is', 'a', 'great', 'library', 'for', 'n', 'l', 'p', 'tasks', '.']
Tokens for 'The ephemeral zephyr whispered through the desolate moor at twilight.': ['the', 'ep', 'he', 'mer', 'al', 'zephyr', 'whispered', 'through', 'the', 'desolate', 'moor', 'at', 'twilight', '.']
