In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Build the tokenizer around a BPE model configured with "[UNK]" as its unk_token.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)

In [2]:
from tokenizers.trainers import BpeTrainer

# Special tokens handed to the BPE trainer, in this order.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
trainer = BpeTrainer(special_tokens=special_tokens)

In [3]:
from tokenizers.pre_tokenizers import Whitespace

# Give the tokenizer a Whitespace pre-tokenizer before it is trained.
whitespace_pre_tokenizer = Whitespace()
tokenizer.pre_tokenizer = whitespace_pre_tokenizer

In [4]:
# Train the tokenizer on the two corpus files using `trainer`.
corpus_files = ["/content/roman_01.txt", "/content/roman_02.txt"]
tokenizer.train(corpus_files, trainer)

In [5]:
# Persist the trained tokenizer to disk.
tokenizer_path = "/content/tokenizers.json"
tokenizer.save(tokenizer_path)

In [9]:
# Rebuild the tokenizer from the file it was saved to.
saved_tokenizer_path = "/content/tokenizers.json"
tokenizer = Tokenizer.from_file(saved_tokenizer_path)

## Normalization: cleaning up the input text

In [10]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

# Chain two normalizers into one Sequence: NFD, then StripAccents.
normalization_steps = [NFD(), StripAccents()]
normalizer = normalizers.Sequence(normalization_steps)

In [11]:
# Try the normalizer on a sample string containing accented characters.
accented_text = "Héllò hôw are ü?"
normalizer.normalize_str(accented_text)

'Hello how are u?'

In [12]:
# Attach the custom normalizer to the tokenizer. The assignment target must be
# `tokenizer`: assigning to `normalizer.normalizer` never touches the tokenizer,
# so the normalizer would not be used by it.
tokenizer.normalizer = normalizer

## PreTokenization

In [14]:
from tokenizers.pre_tokenizers import Whitespace

# Standalone Whitespace pre-tokenizer; the call below displays
# (piece, (start, end)) pairs for the sample sentence.
pre_tokenizer = Whitespace()
sample_sentence = "Hello there how are you?"
pre_tokenizer.pre_tokenize_str(sample_sentence)

[('Hello', (0, 5)),
 ('there', (6, 11)),
 ('how', (12, 15)),
 ('are', (16, 19)),
 ('you', (20, 23)),
 ('?', (23, 24))]

In [15]:
# several pre-tokenizers can be combined into a single Sequence
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Digits

# Sequence of a Whitespace pre-tokenizer followed by Digits(individual_digits=True).
pre_tokenizer = pre_tokenizers.Sequence(
    [
        Whitespace(),
        Digits(individual_digits=True),
    ]
)

In [16]:
# Mixed digits-and-letters input to show how the combined pre-tokenizer splits it.
digits_and_letters = "12345667890hello0293"
pre_tokenizer.pre_tokenize_str(digits_and_letters)

[('1', (0, 1)),
 ('2', (1, 2)),
 ('3', (2, 3)),
 ('4', (3, 4)),
 ('5', (4, 5)),
 ('6', (5, 6)),
 ('6', (6, 7)),
 ('7', (7, 8)),
 ('8', (8, 9)),
 ('9', (9, 10)),
 ('0', (10, 11)),
 ('hello', (11, 16)),
 ('0', (16, 17)),
 ('2', (17, 18)),
 ('9', (18, 19)),
 ('3', (19, 20))]

In [18]:
# Replace the tokenizer's pre-tokenizer with the combined Sequence built earlier.
tokenizer.pre_tokenizer = pre_tokenizer

Once the input texts are normalized and pre-tokenized, the `Tokenizer` applies the model to the pre-tokens. The model is the part of the pipeline that is trained on our corpus.

## Available tokenizer models:
- BPE
- Unigram
- WordLevel
- WordPiece