In [1]:
import os 
os.chdir('..')

%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import math
from collections import defaultdict
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


## Analyzing pretrained tokenizers used in baselines

In [2]:
from transformers import AutoTokenizer, GPT2Tokenizer, RobertaTokenizer

In [3]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
print(gpt2_tokenizer)

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})


In [4]:
print("Tokenizer inputs:", gpt2_tokenizer.model_input_names)
print("Tokenization method:", gpt2_tokenizer.tokenize.__name__)
print("Special tokens:", gpt2_tokenizer.special_tokens_map)

Tokenizer inputs: ['input_ids', 'attention_mask']
Tokenization method: tokenize
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


In [5]:
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
print(roberta_tokenizer)

RobertaTokenizer(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})


In [6]:
print("Tokenizer inputs:", roberta_tokenizer.model_input_names)
print("Tokenization method:", roberta_tokenizer.tokenize.__name__)
print("Special tokens:", roberta_tokenizer.special_tokens_map)

Tokenizer inputs: ['input_ids', 'attention_mask']
Tokenization method: tokenize
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}


In [7]:
opt_tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-125m")
opt_tokenizer.add_bos_token = False
opt_tokenizer.add_special_tokens({
            'bos_token': '<s>', 
            'eos_token': '</s>',
            'unk_token': '<unk>',
            'pad_token': '<pad>',
            'additional_special_tokens': [
                '<image>', '</c>', 
                '<PERSON>', # C-12M for person names
                ]})

print(opt_tokenizer)

GPT2Tokenizer(name_or_path='facebook/opt-125m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<image>', '</c>', '<PERSON>']})


In [8]:
print("Tokenizer inputs:", opt_tokenizer.model_input_names)
print("Tokenization method:", opt_tokenizer.tokenize.__name__)
print("Special tokens:", opt_tokenizer.special_tokens_map)

Tokenizer inputs: ['input_ids', 'attention_mask']
Tokenization method: tokenize
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<image>', '</c>', '<PERSON>']}


### trying an example

In [9]:
# text from https://huggingface.co/docs/transformers/tokenizer_summary
text = 'As we saw in the preprocessing tutorial, tokenizing a text is splitting it into words or subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). More specifically, we will look at the three main types of tokenizers used in 🤗 Transformers: Byte-Pair Encoding (BPE), WordPiece, and SentencePiece, and show examples of which tokenizer type is used by which model.'

In [10]:
inputs = gpt2_tokenizer([text], padding=False, truncation=False, return_tensors="pt")
print(inputs)
print(gpt2_tokenizer.tokenize(text))

{'input_ids': tensor([[ 1722,   356,  2497,   287,   262,   662, 36948, 11808,    11, 11241,
          2890,   257,  2420,   318, 26021,   340,   656,  2456,   393,   850,
         10879,    11,   543,   788,   389, 11513,   284,   220,  2340,   832,
           257,   804,    12,   929,  3084,    13, 35602,   889,  2456,   393,
           850, 10879,   284,   220,  2340,   318, 15836,    11,   523,   287,
           428, 10638,    11,   356,   481,  2962,   319, 26021,   257,  2420,
           656,  2456,   393,   850, 10879,   357,    72,    13,    68,    13,
         11241,  2890,   257,  2420,   737,  3125,  5734,    11,   356,   481,
           804,   379,   262,  1115,  1388,  3858,   286, 11241, 11341,   973,
           287, 12520,    97,   245, 39185,    25, 30589,    12,    47,   958,
         14711,  7656,   357,    33, 11401,   828,  9678,    47,  8535,    11,
           290, 11352,   594,    47,  8535,    11,   290,   905,  6096,   286,
           543, 11241,  7509,  2099,  

In [11]:
inputs = roberta_tokenizer([text], padding=False, truncation=False, return_tensors="pt")
print(inputs)
print(roberta_tokenizer.tokenize(text))

{'input_ids': tensor([[    0,  1620,    52,   794,    11,     5,  1198, 39221, 35950,     6,
         19233,  2787,    10,  2788,    16, 21128,    24,    88,  1617,    50,
          2849, 30938,     6,    61,   172,    32,  8417,     7,  1437,  7823,
           149,    10,   356,    12,   658,  2103,     4, 36608,  2577,  1617,
            50,  2849, 30938,     7,  1437,  7823,    16, 15196,     6,    98,
            11,    42,  4819,     6,    52,    40,  1056,    15, 21128,    10,
          2788,    88,  1617,    50,  2849, 30938,    36,   118,     4,   242,
             4, 19233,  2787,    10,  2788,   322,   901,  4010,     6,    52,
            40,   356,    23,     5,   130,  1049,  3505,     9, 19233, 11574,
           341,    11,  8103, 10470,  6800, 34379,    35, 46594,    12,   510,
          2456, 14813, 19519,    36,   387, 16035,   238, 15690,   510, 39426,
             6,     8, 12169,  4086,   510, 39426,     6,     8,   311,  7721,
             9,    61, 19233,  6315,  

In [12]:
inputs = opt_tokenizer([text], padding=False, truncation=False, return_tensors="pt")
print(inputs)
print(opt_tokenizer.tokenize(text))

{'input_ids': tensor([[ 1620,    52,   794,    11,     5,  1198, 39221, 35950,     6, 19233,
          2787,    10,  2788,    16, 21128,    24,    88,  1617,    50,  2849,
         30938,     6,    61,   172,    32,  8417,     7,  1437,  7823,   149,
            10,   356,    12,   658,  2103,     4, 36608,  2577,  1617,    50,
          2849, 30938,     7,  1437,  7823,    16, 15196,     6,    98,    11,
            42,  4819,     6,    52,    40,  1056,    15, 21128,    10,  2788,
            88,  1617,    50,  2849, 30938,    36,   118,     4,   242,     4,
         19233,  2787,    10,  2788,   322,   901,  4010,     6,    52,    40,
           356,    23,     5,   130,  1049,  3505,     9, 19233, 11574,   341,
            11,  8103, 10470,  6800, 34379,    35, 46594,    12,   510,  2456,
         14813, 19519,    36,   387, 16035,   238, 15690,   510, 39426,     6,
             8, 12169,  4086,   510, 39426,     6,     8,   311,  7721,     9,
            61, 19233,  6315,  1907,  

In [13]:
gpt2_tokenizer.tokenize(text) == roberta_tokenizer.tokenize(text) == opt_tokenizer.tokenize(text)

True

### Understanding the tokenizer pipeline

In [14]:
print(text)

As we saw in the preprocessing tutorial, tokenizing a text is splitting it into words or subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). More specifically, we will look at the three main types of tokenizers used in 🤗 Transformers: Byte-Pair Encoding (BPE), WordPiece, and SentencePiece, and show examples of which tokenizer type is used by which model.


In [15]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
normalizer = normalizers.Sequence([NFD(), StripAccents()])

In [16]:
normalizer.normalize_str(text)

'As we saw in the preprocessing tutorial, tokenizing a text is splitting it into words or subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). More specifically, we will look at the three main types of tokenizers used in 🤗 Transformers: Byte-Pair Encoding (BPE), WordPiece, and SentencePiece, and show examples of which tokenizer type is used by which model.'

In [17]:
normalizer.normalize_str("Héllò hôw are ü?")

'Hello how are u?'

In [18]:
from tokenizers.pre_tokenizers import Whitespace
pre_tokenizer = Whitespace()
pre_tokenizer.pre_tokenize_str(text)

[('As', (0, 2)),
 ('we', (3, 5)),
 ('saw', (6, 9)),
 ('in', (10, 12)),
 ('the', (13, 16)),
 ('preprocessing', (17, 30)),
 ('tutorial', (31, 39)),
 (',', (39, 40)),
 ('tokenizing', (41, 51)),
 ('a', (52, 53)),
 ('text', (54, 58)),
 ('is', (59, 61)),
 ('splitting', (62, 71)),
 ('it', (72, 74)),
 ('into', (75, 79)),
 ('words', (80, 85)),
 ('or', (86, 88)),
 ('subwords', (89, 97)),
 (',', (97, 98)),
 ('which', (99, 104)),
 ('then', (105, 109)),
 ('are', (110, 113)),
 ('converted', (114, 123)),
 ('to', (124, 126)),
 ('ids', (127, 130)),
 ('through', (131, 138)),
 ('a', (139, 140)),
 ('look', (141, 145)),
 ('-', (145, 146)),
 ('up', (146, 148)),
 ('table', (149, 154)),
 ('.', (154, 155)),
 ('Converting', (156, 166)),
 ('words', (167, 172)),
 ('or', (173, 175)),
 ('subwords', (176, 184)),
 ('to', (185, 187)),
 ('ids', (188, 191)),
 ('is', (192, 194)),
 ('straightforward', (195, 210)),
 (',', (210, 211)),
 ('so', (212, 214)),
 ('in', (215, 217)),
 ('this', (218, 222)),
 ('summary', (223, 230))

In [56]:
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Digits
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
pre_tokenizer.pre_tokenize_str(text)

[('As', (0, 2)),
 ('we', (3, 5)),
 ('saw', (6, 9)),
 ('in', (10, 12)),
 ('the', (13, 16)),
 ('preprocessing', (17, 30)),
 ('tutorial', (31, 39)),
 (',', (39, 40)),
 ('tokenizing', (41, 51)),
 ('a', (52, 53)),
 ('text', (54, 58)),
 ('is', (59, 61)),
 ('splitting', (62, 71)),
 ('it', (72, 74)),
 ('into', (75, 79)),
 ('words', (80, 85)),
 ('or', (86, 88)),
 ('subwords', (89, 97)),
 (',', (97, 98)),
 ('which', (99, 104)),
 ('then', (105, 109)),
 ('are', (110, 113)),
 ('converted', (114, 123)),
 ('to', (124, 126)),
 ('ids', (127, 130)),
 ('through', (131, 138)),
 ('a', (139, 140)),
 ('look', (141, 145)),
 ('-', (145, 146)),
 ('up', (146, 148)),
 ('table', (149, 154)),
 ('.', (154, 155)),
 ('Converting', (156, 166)),
 ('words', (167, 172)),
 ('or', (173, 175)),
 ('subwords', (176, 184)),
 ('to', (185, 187)),
 ('ids', (188, 191)),
 ('is', (192, 194)),
 ('straightforward', (195, 210)),
 (',', (210, 211)),
 ('so', (212, 214)),
 ('in', (215, 217)),
 ('this', (218, 222)),
 ('summary', (223, 230))

In [20]:
from tokenizers.processors import TemplateProcessing
gpt2_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

In [21]:
gpt2_tokenizer(text)

{'input_ids': [1722, 356, 2497, 287, 262, 662, 36948, 11808, 11, 11241, 2890, 257, 2420, 318, 26021, 340, 656, 2456, 393, 850, 10879, 11, 543, 788, 389, 11513, 284, 220, 2340, 832, 257, 804, 12, 929, 3084, 13, 35602, 889, 2456, 393, 850, 10879, 284, 220, 2340, 318, 15836, 11, 523, 287, 428, 10638, 11, 356, 481, 2962, 319, 26021, 257, 2420, 656, 2456, 393, 850, 10879, 357, 72, 13, 68, 13, 11241, 2890, 257, 2420, 737, 3125, 5734, 11, 356, 481, 804, 379, 262, 1115, 1388, 3858, 286, 11241, 11341, 973, 287, 12520, 97, 245, 39185, 25, 30589, 12, 47, 958, 14711, 7656, 357, 33, 11401, 828, 9678, 47, 8535, 11, 290, 11352, 594, 47, 8535, 11, 290, 905, 6096, 286, 543, 11241, 7509, 2099, 318, 973, 416, 543, 2746, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [23]:
gpt2_tokenizer.decode(gpt2_tokenizer(text)['input_ids'])

'As we saw in the preprocessing tutorial, tokenizing a text is splitting it into words or subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). More specifically, we will look at the three main types of tokenizers used in 🤗 Transformers: Byte-Pair Encoding (BPE), WordPiece, and SentencePiece, and show examples of which tokenizer type is used by which model.'

In [24]:
gpt2_tokenizer.decode(gpt2_tokenizer(text)['input_ids']) == text

True

## Training a tokenizer from scratch

In [25]:
paths = [str(x) for x in Path("../datasets/babylm_10M/").glob("*.train")]
paths

['../datasets/babylm_10M/bnc_spoken.train',
 '../datasets/babylm_10M/children_stories.train',
 '../datasets/babylm_10M/cbt.train',
 '../datasets/babylm_10M/switchboard.train',
 '../datasets/babylm_10M/wikipedia.train',
 '../datasets/babylm_10M/gutenberg.train',
 '../datasets/babylm_10M/aochildes.train',
 '../datasets/babylm_10M/qed.train',
 '../datasets/babylm_10M/simple_wikipedia.train',
 '../datasets/babylm_10M/open_subtitles.train']

In [26]:
dataset = load_dataset(path=os.path.join('./src/babylm_baseline_train/datasets', "babyLM_for_hf.py"),
            name='babyLM-10M',
            split='train')

Found cached dataset baby_lm_for_hf (/home/misra/.cache/huggingface/datasets/baby_lm_for_hf/babyLM-10M/1.0.0/281c1a7c3ebf0b682e9bdca60f4a2442b6aaf2d2a266fea843461e98f10a5f07)


Three main tokenizer types: 
- Byte-Pair Encoding (BPE)
- WordPiece
- SentencePiece

### Byte-Pair Encoding Tokenizer

In [55]:
from tokenizers import Tokenizer
from tokenizers import ByteLevelBPETokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents

In [28]:
tokenizer = ByteLevelBPETokenizer()
tokenizer

Tokenizer(vocabulary_size=0, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [29]:
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

In [30]:
tokenizer.normalizer

<tokenizers.normalizers.Sequence at 0x7fc94a0a78f0>

In [31]:
tokenizer.pre_tokenizer

<tokenizers.pre_tokenizers.ByteLevel at 0x7fc94a0a3470>

In [32]:
tokenizer.decoder

<tokenizers.decoders.ByteLevel at 0x7fc94a08a660>

In [33]:
tokenizer.post_processor

<tokenizers.processors.ByteLevel at 0x7fc972f94210>

In [34]:
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])






In [35]:
tokenizer.encode(text)

Encoding(num_tokens=126, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [36]:
tokenizer.encode(text).tokens

['as',
 'Ġwe',
 'Ġsaw',
 'Ġin',
 'Ġthe',
 'Ġpre',
 'process',
 'ing',
 'Ġtutorial',
 ',',
 'Ġtoken',
 'izing',
 'Ġa',
 'Ġtext',
 'Ġis',
 'Ġsplitting',
 'Ġit',
 'Ġinto',
 'Ġwords',
 'Ġor',
 'Ġsub',
 'words',
 ',',
 'Ġwhich',
 'Ġthen',
 'Ġare',
 'Ġconverted',
 'Ġto',
 'Ġids',
 'Ġthrough',
 'Ġa',
 'Ġlook',
 '-',
 'up',
 'Ġtable',
 '.',
 'Ġconverting',
 'Ġwords',
 'Ġor',
 'Ġsub',
 'words',
 'Ġto',
 'Ġids',
 'Ġis',
 'Ġstraightforward',
 ',',
 'Ġso',
 'Ġin',
 'Ġthis',
 'Ġsummary',
 ',',
 'Ġwe',
 'Ġwill',
 'Ġfocus',
 'Ġon',
 'Ġsplitting',
 'Ġa',
 'Ġtext',
 'Ġinto',
 'Ġwords',
 'Ġor',
 'Ġsub',
 'words',
 'Ġ(',
 'i',
 '.',
 'e',
 '.',
 'Ġtoken',
 'izing',
 'Ġa',
 'Ġtext',
 ').',
 'Ġmore',
 'Ġspecifically',
 ',',
 'Ġwe',
 'Ġwill',
 'Ġlook',
 'Ġat',
 'Ġthe',
 'Ġthree',
 'Ġmain',
 'Ġtypes',
 'Ġof',
 'Ġtoken',
 'izers',
 'Ġused',
 'Ġin',
 'Ġ',
 'ð',
 'Ł',
 '¤',
 'Ĺ',
 'Ġtransform',
 'ers',
 ':',
 'Ġbyte',
 '-',
 'pair',
 'Ġencoding',
 'Ġ(',
 'b',
 'pe',
 '),',
 'Ġword',
 'piece',
 ',',
 'Ġand',
 'Ġ

In [37]:
tokenizer.save("tokenizers/babylm_10M_BPE.json")

### WordPiece Tokenizer

In [50]:
wordpiece_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

In [51]:
wordpiece_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

In [52]:
wordpiece_tokenizer.pre_tokenizer = Whitespace()

In [53]:
wordpiece_tokenizer.decode

<function Tokenizer.decode(self, ids, skip_special_tokens=True)>

In [54]:
wordpiece_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $0 [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
)

In [56]:
trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [57]:
wordpiece_tokenizer.train(paths, trainer)






In [59]:
wordpiece_tokenizer.encode(text)

Encoding(num_tokens=126, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [60]:
wordpiece_tokenizer.encode(text).tokens

['[CLS]',
 'as',
 'we',
 'saw',
 'in',
 'the',
 'prep',
 '##rocess',
 '##ing',
 'tutorial',
 ',',
 'token',
 '##izing',
 'a',
 'text',
 'is',
 'splitting',
 'it',
 'into',
 'words',
 'or',
 'sub',
 '##words',
 ',',
 'which',
 'then',
 'are',
 'converted',
 'to',
 'ids',
 'through',
 'a',
 'look',
 '-',
 'up',
 'table',
 '.',
 'converting',
 'words',
 'or',
 'sub',
 '##words',
 'to',
 'ids',
 'is',
 'straightforward',
 ',',
 'so',
 'in',
 'this',
 'summary',
 ',',
 'we',
 'will',
 'focus',
 'on',
 'splitting',
 'a',
 'text',
 'into',
 'words',
 'or',
 'sub',
 '##words',
 '(',
 'i',
 '.',
 'e',
 '.',
 'token',
 '##izing',
 'a',
 'text',
 ').',
 'more',
 'specifically',
 ',',
 'we',
 'will',
 'look',
 'at',
 'the',
 'three',
 'main',
 'types',
 'of',
 'token',
 '##izers',
 'used',
 'in',
 '[UNK]',
 'transform',
 '##ers',
 ':',
 'by',
 '##te',
 '-',
 'pair',
 'enc',
 '##oding',
 '(',
 'bp',
 '##e',
 '),',
 'word',
 '##piece',
 ',',
 'and',
 'sentence',
 '##piece',
 ',',
 'and',
 'show',
 '

In [58]:
wordpiece_tokenizer.save("tokenizers/babylm_10M_wordpiece.json")

### SentencePiece Tokenizer

In [61]:
from tokenizers import SentencePieceBPETokenizer

In [63]:
sentence_tokenizer = SentencePieceBPETokenizer()

In [64]:
sentence_tokenizer

Tokenizer(vocabulary_size=0, model=SentencePieceBPE, unk_token=<unk>, replacement=▁, add_prefix_space=True, dropout=None)

In [65]:
sentence_tokenizer.normalizer

<tokenizers.normalizers.NFKC at 0x7fc946831630>

In [66]:
sentence_tokenizer.pre_tokenizer

<tokenizers.pre_tokenizers.Metaspace at 0x7fc946839e70>

In [67]:
sentence_tokenizer.decoder

<tokenizers.decoders.Metaspace at 0x7fc972f94780>

In [68]:
sentence_tokenizer.post_processor

In [70]:
sentence_tokenizer.train(files=paths, 
    vocab_size=30_000,
    min_frequency=5,
    show_progress=True,
    limit_alphabet=500,
)






In [72]:
sentence_tokenizer.encode(text)

Encoding(num_tokens=135, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [73]:
sentence_tokenizer.encode(text).tokens

['▁As',
 '▁we',
 '▁saw',
 '▁in',
 '▁the',
 '▁prep',
 'ro',
 'cess',
 'ing',
 '▁tut',
 'orial',
 ',',
 '▁token',
 'izing',
 '▁a',
 '▁text',
 '▁is',
 '▁spl',
 'itting',
 '▁it',
 '▁into',
 '▁words',
 '▁or',
 '▁sub',
 'word',
 's,',
 '▁which',
 '▁then',
 '▁are',
 '▁converted',
 '▁to',
 '▁id',
 's',
 '▁through',
 '▁a',
 '▁look',
 '-up',
 '▁table.',
 '▁Con',
 'ver',
 'ting',
 '▁words',
 '▁or',
 '▁sub',
 'word',
 's',
 '▁to',
 '▁id',
 's',
 '▁is',
 '▁straight',
 'for',
 'ward,',
 '▁so',
 '▁in',
 '▁this',
 '▁summ',
 'ary,',
 '▁we',
 '▁will',
 '▁focus',
 '▁on',
 '▁spl',
 'itting',
 '▁a',
 '▁text',
 '▁into',
 '▁words',
 '▁or',
 '▁sub',
 'word',
 's',
 '▁(i.e.',
 '▁token',
 'izing',
 '▁a',
 '▁text',
 ').',
 '▁More',
 '▁specific',
 'ally,',
 '▁we',
 '▁will',
 '▁look',
 '▁at',
 '▁the',
 '▁three',
 '▁main',
 '▁types',
 '▁of',
 '▁token',
 'iz',
 'ers',
 '▁used',
 '▁in',
 '▁',
 '<unk>',
 '▁Trans',
 'form',
 'ers',
 ':',
 '▁By',
 'te',
 '-P',
 'air',
 '▁En',
 'cod',
 'ing',
 '▁(B',
 'P',
 'E',
 '),',
 

In [71]:
sentence_tokenizer.save("tokenizers/babylm_10M_sentencepiece.json")