In [1]:
import json
from tqdm import tqdm
train_data = []
with open("tokenizer_train.jsonl", "r") as fr:
    for line in tqdm(fr):
        line = json.loads(line)
        train_data.append(line)
def yield_text():
    for line in train_data:
        yield line["text"]

600000it [00:07, 75028.33it/s] 


In [2]:
train_data[0]

{'text': '好的。现在请你将这个文本中的所有的逗号都替换成空格。 好的，请稍等一下，现在我会将文本中的所有逗号替换为空格。处理后文本为："这是一个句子 目的是看看是否可以正确地从这个句子中删除关键词。"。处理结果如何？'}

In [None]:
import os
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

special_tokens = ["<unk>", "<s>", "</s>"]
trainer = BpeTrainer(
    vocab_size=8000,
    special_tokens=special_tokens,  # 确保这三个token被包含
    show_progress=True,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)

texts = yield_text()
tokenizer.train_from_iterator(texts, trainer=trainer)
tokenizer.train()

In [8]:
# 设置解码器
tokenizer.decoder = decoders.ByteLevel()

# 检查特殊token的索引
assert tokenizer.token_to_id("<unk>") == 0
assert tokenizer.token_to_id("<s>") == 1
assert tokenizer.token_to_id("</s>") == 2

In [15]:
tokenizer_dir = "./my_tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer.save(os.path.join(tokenizer_dir, "tokenizer.json"))
tokenizer.model.save("./my_tokenizer/")

['./my_tokenizer/vocab.json', './my_tokenizer/merges.txt']

In [16]:
    # 手动创建配置文件
    config = {
        "add_bos_token": False,
        "add_eos_token": False,
        "add_prefix_space": True,
        "added_tokens_decoder": {
            "0": {
                "content": "<unk>",
                "lstrip": False,
                "normalized": False,
                "rstrip": False,
                "single_word": False,
                "special": True
            },
            "1": {
                "content": "<s>",
                "lstrip": False,
                "normalized": False,
                "rstrip": False,
                "single_word": False,
                "special": True
            },
            "2": {
                "content": "</s>",
                "lstrip": False,
                "normalized": False,
                "rstrip": False,
                "single_word": False,
                "special": True
            }
        },
        "additional_special_tokens": [],
        "bos_token": "<s>",
        "clean_up_tokenization_spaces": False,
        "eos_token": "</s>",
        "legacy": True,
        "model_max_length": 1000000000000000019884624838656,
        "pad_token": None,
        "sp_model_kwargs": {},
        "spaces_between_special_tokens": False,
        "tokenizer_class": "PreTrainedTokenizerFast",
        "unk_token": "<unk>",
        "use_default_system_prompt": False,
        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<s>user\\n' + content + '</s>\\n<s>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}"
    }

    # 保存配置文件
    with open(os.path.join(tokenizer_dir, "tokenizer_config.json"), "w", encoding="utf-8") as config_file:
        json.dump(config, config_file, ensure_ascii=False, indent=4)

In [4]:
## 验证tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("./my_tokenizer")
print(tokenizer)

messages = [
    {"role": "system", "content": "你是一个优秀的聊天机器人，总是给我正确的回应！"},
    {"role": "user", "content": '你来自哪里？'},
    {"role": "assistant", "content": '我来自地球'}
]
new_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False
)
print(new_prompt)

# 获取实际词汇表长度（包括特殊符号）
actual_vocab_size = len(tokenizer)
print('tokenizer实际词表长度：', actual_vocab_size)

model_inputs = tokenizer(new_prompt)
print('encoder长度：', len(model_inputs['input_ids']))

input_ids = model_inputs['input_ids']
response = tokenizer.decode(input_ids)
print(response)
print('decoder和原始文本是否一致：', response == new_prompt)

PreTrainedTokenizerFast(name_or_path='./my_tokenizer', vocab_size=8000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)
你是一个优秀的聊天机器人，总是给我正确的回应！<s>user
你来自哪里？</s>
<s>assistant
我来自地球</s>

tokenizer实际词表长度： 8000
encoder长度： 33
 你是一个优秀的聊天机器人，总是给我正确的回应！<s> user
你来自哪里？</s> 
<s> assistant
我来自地球</s> 

decoder和原始文本是否一致： False
