In [None]:
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

In [None]:
# Sample sentence fed to the Treebank tokenizer in the cells below.
s = " ".join(("OpenAI", "tokenization", "lowerd."))

In [None]:
# Build the Treebank tokenizer (`t`) and its matching detokenizer (`d`) once,
# so later cells can reuse them.
t, d = TreebankWordTokenizer(), TreebankWordDetokenizer()

In [None]:
# Split the sample sentence into Treebank-style word tokens.
toks = t.tokenize(text=s)

In [None]:
# Show the token list produced by the Treebank tokenizer.
toks

['OpenAI', 'tokenization', 'lowerd', '.']

In [None]:
# Join the tokens back into a single string with the Treebank detokenizer
# and display the result.
d.detokenize(toks)

'OpenAI tokenization lowerd.'

## BPE

Data: [https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt](https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt)

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

In [None]:
# Untrained BPE model; "[UNK]" is the token used for pieces the model cannot
# represent.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)

In [None]:
# Training configuration for the BPE tokenizer.
trainer = BpeTrainer(
    vocab_size=1000,       # target vocabulary size
    min_frequency=1,       # minimum pair frequency passed to the trainer
    show_progress=True,    # show a progress bar while training
    # Special tokens are registered in this order.
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

In [None]:
# Training corpus, given as a path relative to the notebook's working directory.
file_path = "data.txt"

# Pre-tokenize with `Whitespace` before BPE is applied, then train on the corpus.
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train(trainer=trainer, files=[file_path])

In [None]:
# Wrap every encoded sequence as "[CLS] <tokens> [SEP]".
# The special-token ids are looked up from the trained vocabulary instead of
# being hard-coded, so the template stays correct even if the order of the
# trainer's `special_tokens` list changes (same approach as the WordPiece
# section of this notebook).
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

In [None]:
# Words used to inspect how the trained BPE tokenizer splits text.
test_cases = ["department", "OpenAI", "tokenization"]

In [None]:
# Encode each sample word and print the resulting pieces, including the
# [CLS]/[SEP] markers added by the post-processor.
for text in test_cases:
    encoded = tokenizer.encode(text)
    pieces = encoded.tokens
    print(f"'{text}' → {pieces}")

'department' → ['[CLS]', 'de', 'part', 'ment', '[SEP]']
'OpenAI' → ['[CLS]', 'O', 'pen', 'A', 'I', '[SEP]']
'tokenization' → ['[CLS]', 'to', 'ken', 'i', 'z', 'ation', '[SEP]']


## WordPiece

In [None]:
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

In [None]:
# Set up a fresh tokenizer and trainer for the WordPiece experiment.
# This rebinds `tokenizer` and `trainer`, replacing the BPE objects created
# earlier in the notebook.
trainer = WordPieceTrainer(
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    min_frequency=2,
    vocab_size=1000,
)

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

In [None]:
# Train the WordPiece tokenizer on the corpus file.
files = ["data.txt"]
tokenizer.train(files=files, trainer=trainer)

In [None]:
# Resolve the ids of the boundary tokens from the trained vocabulary.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

# Single sequences become "[CLS] A [SEP]"; pairs become "[CLS] A [SEP] B [SEP]".
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B [SEP]",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

In [None]:
# Words used to inspect how the trained WordPiece tokenizer splits text.
test_cases = ["department", "OpenAI", "through"]

In [None]:
# Encode each sample word and print the resulting pieces, including the
# [CLS]/[SEP] markers added by the post-processor.
for text in test_cases:
    encoded = tokenizer.encode(text)
    pieces = encoded.tokens
    print(f"'{text}' → {pieces}")

'department' → ['[CLS]', 'de', '##p', '##art', '##ment', '[SEP]']
'OpenAI' → ['[CLS]', '[UNK]', '[SEP]']
'through' → ['[CLS]', 'th', '##rough', '[SEP]']
