In [None]:
# import libraries
import os

from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

In [None]:
# Load the tokenizer definition serialized as JSON on disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact workspace layout exists — consider a configurable base dir.
tokenizer_json_path = "/workspaces/ner_news_malay/tokenizer/tokenizer_model/malay_news_bpe_tokenizer.json"
bpe_tokenizer = Tokenizer.from_file(tokenizer_json_path)

In [6]:
# Wrap the raw tokenizer in the transformers fast-tokenizer interface and
# declare which literal token strings play each special-token role.
# [CLS] is used for both bos/cls and [SEP] for both eos/sep.
special_token_roles = {
    "bos_token": "[CLS]",
    "eos_token": "[SEP]",
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
}

wrap_bpe_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=bpe_tokenizer,
    **special_token_roles,
)

In [None]:
# Save the wrapped tokenizer so it can be reloaded from a folder later.
# NOTE(review): the folder name is relative, so the files land under whatever
# the kernel's current working directory is when this cell runs.
tokenizer_wrapped_folder = 'tokenizer_wrapped_model'

# Make sure the target folder exists; exist_ok=True keeps re-runs from failing.
os.makedirs(tokenizer_wrapped_folder, exist_ok=True)

# Kept as the last expression so the cell displays the written file paths.
wrap_bpe_tokenizer.save_pretrained(tokenizer_wrapped_folder)

('tokenizer_wrapped_model/tokenizer_config.json',
 'tokenizer_wrapped_model/special_tokens_map.json',
 'tokenizer_wrapped_model/tokenizer.json')

In [None]:
# Smoke-test the wrapped tokenizer on a single sentence.
test_sentence = "Harga minyak sawit meningkat 15% di pasaran antarabangsa"

# Pad/truncate to exactly 32 ids and request PyTorch tensors as output.
encode_options = dict(
    padding="max_length",
    max_length=32,
    truncation=True,
    return_tensors="pt",
)
encoding = wrap_bpe_tokenizer(test_sentence, **encode_options)

# Index 0 selects the ids belonging to the one sentence that was encoded.
first_row_ids = encoding['input_ids'][0]
decoded_tokens = wrap_bpe_tokenizer.convert_ids_to_tokens(first_row_ids)

# NOTE(review): the recorded output maps the leading "H" and the "%" sign to
# [UNK] and shows no [CLS]/[SEP] around the sequence — worth checking the
# tokenizer's vocabulary/normalization and whether a post-processor is set.
print("Tokenization test:")
print(f"Original: {test_sentence}")
print(f"Tokens: {decoded_tokens}")

Tokenization test:
Original: Harga minyak sawit meningkat 15% di pasaran antarabangsa
Tokens: ['[UNK]', 'ar', 'ga', 'minyak', 'sa', 'w', 'it', 'mening', 'kat', '1', '5', '[UNK]', 'di', 'pasaran', 'antarabangsa', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
