In [1]:
from tokenizers import Tokenizer

from tokenizers.models import BPE

# Start from an untrained BPE model; "[UNK]" is the token emitted for
# anything the learned vocabulary cannot represent.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)

In [2]:
from tokenizers.trainers import BpeTrainer

# Special tokens reserved in the vocabulary by the trainer. The recorded
# lookup further down shows "[SEP]" (third in this list) receiving id 2,
# so the list order appears to fix the ids — keep it stable.
SPECIAL_TOKENS = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
trainer = BpeTrainer(special_tokens=SPECIAL_TOKENS)


In [3]:
from tokenizers.pre_tokenizers import Whitespace

# Training corpus: a single text file (path is relative to the notebook's
# working directory).
files = "./dataset/standard_hex.txt"

# Pre-tokenize with Whitespace() before BPE is applied; in the offsets
# recorded later in this notebook no token spans a space.
tokenizer.pre_tokenizer = Whitespace()

# Learn the BPE vocabulary/merges from the corpus using the trainer
# configured earlier.
tokenizer.train([files], trainer)






In [4]:
# Persist the freshly trained tokenizer as a JSON file.
trained_tokenizer_path = "dataset/standard_hex.json"
tokenizer.save(trained_tokenizer_path)

In [5]:
# Smoke-test the trained tokenizer on eight space-separated 8-character
# hex words.
sample_text = "0101EFF0 0105F6FA 010DFC0C FF00F7FC 0006F902 010EFB0C 080AF8FD 0111FC10"
output = tokenizer.encode(sample_text)

In [8]:
# Vocabulary ids of the tokens produced for the sample above
# (displayed as the cell's value).
output.ids

[9840, 9246, 6453, 10580, 4066, 9905, 443, 214, 18318]

In [9]:
# (start, end) character spans of each token in the encoded string.
# In the recorded output the seventh word (chars 54-62) is covered by two
# spans, (54, 58) and (58, 62), i.e. it was split into two tokens.
output.offsets

[(0, 8),
 (9, 17),
 (18, 26),
 (27, 35),
 (36, 44),
 (45, 53),
 (54, 58),
 (58, 62),
 (63, 71)]

In [10]:
# Look up the vocabulary id assigned to the "[SEP]" special token
# (the recorded result is 2).
tokenizer.token_to_id("[SEP]")

2

In [11]:
from tokenizers.processors import TemplateProcessing
# Resolve the ids of the two special tokens the templates refer to.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

# Wrap encodings BERT-style:
#   single sequence -> [CLS] A [SEP]
#   sequence pair   -> [CLS] A [SEP] B [SEP]
# Per the TemplateProcessing docs, the ":1" suffix assigns type id 1 to
# that piece; pieces without a suffix get type id 0.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

In [12]:
# Encode a single sequence now that the template post-processor is set.
single_text = "14140710 F90FF20F F0F2ECEC 0014ED13 F612F20D F501F2F6 F9FDEFF3 F904F4F7"
output = tokenizer.encode(single_text)

In [13]:
# Show the tokens and, on the next line, their type ids. For this single
# sequence the recorded output is wrapped in [CLS] ... [SEP] with every
# type id equal to 0.
print(output.tokens, output.type_ids, sep="\n")

['[CLS]', '1414', '0710', 'F9', '0FF20F', 'F0F2ECEC', '0014', 'ED13', 'F612', 'F20D', 'F501', 'F2F6', 'F9FD', 'EFF3', 'F904F4F7', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [3]:
# Encode a pair of sequences (first and second argument of encode()).
# NOTE(review): the recorded output below contains no [CLS]/[SEP] tokens
# and this cell's execution count restarts at 3, so it looks like it was
# last run against a tokenizer without the template post-processor —
# re-run the notebook top to bottom to refresh the output.
first_sequence = "14140710 F90FF20F F0F2ECEC 0014ED13"
second_sequence = "F612F20D F501F2F6 F9FDEFF3 F904F4F7"
output = tokenizer.encode(first_sequence, second_sequence)
print(output.tokens)

['1414', '0710', 'F9', '0FF20F', 'F0F2ECEC', '0014', 'ED13', 'F612', 'F20D', 'F501', 'F2F6', 'F9FD', 'EFF3', 'F904F4F7']


In [4]:
# Type ids of the most recent encoding. In the recorded output the first
# seven tokens carry 0 and the last seven carry 1, matching the two
# sequences of the pair.
print(output.type_ids)

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [16]:
# Save the tokenizer again under a separate file name, after the template
# post-processor has been attached, leaving the earlier save untouched.
tokenizer.save("dataset/with_post_process_tokenizer.json")

In [6]:
# Encode several independent sequences in one call; `output` becomes a
# list with one encoding per input string.
# NOTE(review): the recorded output below has no [CLS]/[SEP] tokens, so it
# appears to predate (or bypass) the template post-processor — confirm by
# re-running from a fresh kernel.
batch_texts = [
    "14140710 F90FF20F F0F2ECEC 0014ED13",
    "F612F20D F501F2F6 F9FDEFF3 F904F4F7",
]
output = tokenizer.encode_batch(batch_texts)
print(output[0].tokens)

['1414', '0710', 'F9', '0FF20F', 'F0F2ECEC', '0014', 'ED13']


In [8]:
# Turn the ids of the first batch item back into text. In the recorded
# output the tokens come back joined by single spaces, so sub-word splits
# (e.g. "1414 0710") are not merged back into the original words.
first_ids = output[0].ids
out = tokenizer.decode(first_ids)
print(out)

1414 0710 F9 0FF20F F0F2ECEC 0014 ED13


In [19]:
# Inspect the tokenizer's normalizer. None of the visible cells assigns
# one, and no output is recorded for this cell.
tokenizer.normalizer

In [2]:
# Reload a tokenizer from disk, replacing the in-memory `tokenizer`.
# NOTE(review): the visible cells only save to "dataset/standard_hex.json"
# and "dataset/with_post_process_tokenizer.json"; nothing here writes
# "dataset/bpe_standard_hex.json", so this load depends on a file produced
# elsewhere. The execution count ([2]) is also out of sequence — confirm
# the intended path and move this cell to where it belongs in run order.
tokenizer = Tokenizer.from_file("dataset/bpe_standard_hex.json")