In [15]:
from transformers import PreTrainedTokenizerFast

# Relative location of the saved custom tokenizer that every cell below shares.
TOKENIZER_DIR = "dataset/tokenizer/custom_tokenizer"

tokenizer = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_DIR)


def test_tokenize_single_line(tokenizer, line: str):
    tokens = tokenizer.tokenize(line)
    print("原始句子:", line)
    print("分词结果:", tokens)
    return tokens


def test_token_to_ids(tokenizer, line: str):
    token_ids = tokenizer.encode(line)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    print("输入:", line)
    print("tokens:", tokens)
    print("token_ids:", token_ids)


def test_encode_decode(tokenizer, line: str):
    token_ids = tokenizer.encode(line)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    decoded = tokenizer.decode(token_ids, skip_special_tokens=True)
    print("原始:", line)
    print("编码:", token_ids)
    print("解码:", decoded)


def test_unknown_token(tokenizer, token: str):
    token_ids = tokenizer.encode(token)
    token_out = tokenizer.convert_ids_to_tokens(token_ids)
    print("测试 token:", token)
    print("编码 ID:", token_ids)
    print("映射 token:", token_out)
    if "[UNK]" in token_out:
        print("✅ 未知词成功映射为 [UNK]")
    else:
        print("✅ 词汇在词表中")

In [5]:
# Tokenize a sample line of three space-separated groups and keep the returned tokens.
tokens = test_tokenize_single_line(tokenizer, "0103F4F4 FA02F5F8 FC05FBFD")

原始句子: 0103F4F4 FA02F5F8 FC05FBFD
分词结果: ['0103F4F4', 'FA02F5F8', 'FC05FBFD']


In [7]:
# Show the token strings and their ids for the same three-group sample line.
test_token_to_ids(tokenizer, "0103F4F4 FA02F5F8 FC05FBFD")

输入: 0103F4F4 FA02F5F8 FC05FBFD
tokens: ['0103F4F4', 'FA02F5F8', 'FC05FBFD']
token_ids: [8462, 141499, 155624]


In [14]:
# Round-trip the sample line through encode/decode and print both sides for comparison.
test_encode_decode(tokenizer, "0103F4F4 FA02F5F8 FC05FBFD")

原始: 0103F4F4 FA02F5F8 FC05FBFD
编码: [8462, 141499, 155624]
解码: 0103F4F4 FA02F5F8 FC05FBFD


In [16]:
# Same sample as the earlier cells except the last group ends in "XA" instead of "FD".
test_unknown_token(tokenizer, "0103F4F4 FA02F5F8 FC05FBXA")


测试 token: 0103F4F4 FA02F5F8 FC05FBXA
编码 ID: [8462, 141499, 1]
映射 token: ['0103F4F4', 'FA02F5F8', '[UNK]']
✅ 未知词成功映射为 [UNK]
