- Install the additional package requirements for this bonus notebook by running the following cell:

In [12]:
pip install -r requirements-extra.txt

Collecting transformers>=4.33.2 (from -r requirements-extra.txt (line 3))
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers>=4.33.2->-r requirements-extra.txt (line 3))
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers>=4.33.2->-r requirements-extra.txt (line 3))
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers>=4.33.2->-r requirements-extra.txt (line 3))
  Using cached safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.34.0->transformers>=4.33.2->-r requirements-extra.txt (line 3))
  Using cached hf_xet-1.1.9-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Downloading transformers-4.56.0-py3-none-

## Comparing Various Byte Pair Encoding (BPE) Implementations

### Using BPE from `tiktoken`

In [1]:
from importlib.metadata import version

print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.11.0


In [2]:
import tiktoken

tik_tokenizer = tiktoken.get_encoding("gpt2")

text = "Hello, world. Is this-- a test?"

In [3]:
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [4]:
strings = tik_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


In [5]:
print(tik_tokenizer.n_vocab)

50257


### Using the original BPE implementation used in GPT-2

In [6]:
from bpe_openai_gpt2 import get_encoder, download_vocab

In [7]:
download_vocab()

Fetching encoder.json: 1.04Mit [00:00, 5.87Mit/s]                                                   
Fetching vocab.bpe: 457kit [00:00, 4.44Mit/s]                                                       


In [8]:
orig_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")

In [9]:
integers = orig_tokenizer.encode(text)

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [10]:
strings = orig_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


### Using the BPE via Hugging Face transformers

In [13]:
import transformers
transformers.__version__

'4.56.0'

In [14]:
from transformers import GPT2Tokenizer

hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [15]:
# Encode the original input `text` (same as for the other tokenizers) rather than a decode result from an earlier cell
hf_tokenizer(text)["input_ids"]

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

In [16]:
from transformers import GPT2TokenizerFast

hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained("gpt2")

In [17]:
# Encode the original input `text` (same as for the other tokenizers) rather than a decode result from an earlier cell
hf_tokenizer_fast(text)["input_ids"]

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

## Using our own from-scratch BPE tokenizer

In [18]:
import os
import sys
import io
import nbformat
import types

def import_from_notebook(fullname="bpe-from-scratch", names=("BPETokenizerSimple",)):
    """Import selected function/class definitions from a sibling notebook.

    The notebook is looked up at ``../05_bpe-from-scratch/<fullname>.ipynb``
    relative to the current working directory. Every code cell whose source
    contains ``def <name>`` or ``class <name>`` for one of the requested
    names is executed inside a fresh module object, which is also registered
    in ``sys.modules`` under ``fullname``.

    Note that matching cells are executed in full, so any other statements
    in such a cell run as well.

    Args:
        fullname: Notebook file name without the ``.ipynb`` extension; also
            used as the name of the created module.
        names: Names of the functions/classes whose defining cells should be
            executed. A single string is treated as one name.

    Returns:
        The ``types.ModuleType`` instance holding the executed definitions.

    Raises:
        FileNotFoundError: If the notebook file does not exist.
    """
    def import_definitions_from_notebook(fullname, names):
        # A bare string would otherwise be iterated character by character
        if isinstance(names, str):
            names = [names]

        path = os.path.normpath(
            os.path.join(os.getcwd(), "..", "05_bpe-from-scratch", fullname + ".ipynb")
        )

        # Load the notebook
        if not os.path.exists(path):
            raise FileNotFoundError(f"Notebook file not found at: {path}")

        with open(path, "r", encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        # Create a module to store the imported functions and classes;
        # it is registered before executing any cell so that executed code
        # can already find its own module in sys.modules
        mod = types.ModuleType(fullname)
        sys.modules[fullname] = mod

        # Go through the notebook cells and only execute function or class definitions
        for cell in nb.cells:
            if cell.cell_type != "code":
                continue
            cell_code = cell.source
            # Execute a cell at most once, even if it defines several of the requested names
            if any(
                f"def {name}" in cell_code or f"class {name}" in cell_code
                for name in names
            ):
                exec(cell_code, mod.__dict__)
        return mod

    return import_definitions_from_notebook(fullname, names)

In [19]:
# Execute the BPETokenizerSimple definition from the from-scratch notebook
imported_module = import_from_notebook()
# Plain attribute access raises an AttributeError right here if the class is
# missing, instead of a "'NoneType' object is not callable" error further down
BPETokenizerSimple = imported_module.BPETokenizerSimple

# Initialize the from-scratch tokenizer from the files under gpt2_model/
tokenizer_gpt2 = BPETokenizerSimple()
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=os.path.join("gpt2_model", "encoder.json"),
    bpe_merges_path=os.path.join("gpt2_model", "vocab.bpe")
)

In [20]:
integers = tokenizer_gpt2.encode(text)

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


## A quick performance benchmark between different BPE implementations

In [22]:
with open("../01_main/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

### Original OpenAI GPT-2 tokenizer

In [23]:
%timeit orig_tokenizer.encode(raw_text)

6.08 ms ± 2.61 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Tiktoken OpenAI GPT-2 tokenizer

In [24]:
%timeit tik_tokenizer.encode(raw_text)

1.26 ms ± 826 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Hugging Face OpenAI GPT-2 tokenizer

In [25]:
%timeit hf_tokenizer(raw_text)["input_ids"]

Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors


14.5 ms ± 22.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"]

14.6 ms ± 32.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
%timeit hf_tokenizer_fast(raw_text)["input_ids"]

Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors


6.88 ms ± 2.57 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)["input_ids"]

6.89 ms ± 4.19 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### My own GPT-2 tokenizer (for educational purposes)

In [29]:
%timeit tokenizer_gpt2.encode(raw_text)

17.5 ms ± 93.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
