# BPE Training

Maintain the following states:
* `vocab`: maps an int index to a `bytes` object. **Note:** a vocab index is NOT a Unicode code point.
* `merges`: an `OrderedDict` that maps a pair of vocab indices to the index of the newly merged vocab entry.

In [1]:
import json
import time

from adapters import run_train_bpe
from common import FIXTURES_PATH, gpt2_bytes_to_unicode

  rope_theta (float): The RoPE $\Theta$ parameter.

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/chenwang/Desktop/github/cs336/assignment1-basics/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/chenwang/Desktop/github/cs336/assignment1-basics/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/chen

In [2]:
def test_train_bpe():
    """Train BPE on the fixture corpus and compare it with the reference tokenizer.

    Calls ``run_train_bpe`` on ``corpus.en`` with a vocabulary size of 500 and
    the ``<|endoftext|>`` special token, then asserts that:

    * the learned merges equal the reference merges, in order;
    * the learned vocab has the same set of keys and the same set of values
      as the reference vocab (the key -> value pairing itself is not checked).

    Raises:
        AssertionError: if the merges or the vocab differ from the reference.
    """
    vocab, merges = run_train_bpe(
        input_path=FIXTURES_PATH / "corpus.en",
        vocab_size=500,
        special_tokens=["<|endoftext|>"],
    )

    # Reference artifacts produced by the reference tokenizer.
    vocab_file = FIXTURES_PATH / "train-bpe-reference-vocab.json"
    merges_file = FIXTURES_PATH / "train-bpe-reference-merges.txt"

    # Invert the table returned by gpt2_bytes_to_unicode so each character of a
    # reference token can be looked up and fed to bytes().
    # (presumably the table maps byte values to printable characters -- confirm in common.py)
    char_to_byte = {}
    for byte_value, char in gpt2_bytes_to_unicode().items():
        char_to_byte[char] = byte_value

    def to_bytes(token):
        """Decode one reference token, character by character, into raw bytes."""
        return bytes([char_to_byte[char] for char in token])

    # --- merges: one space-separated pair per line ---
    with open(merges_file, encoding="utf-8") as handle:
        token_pairs = [tuple(line.rstrip().split(" ")) for line in handle]
    expected_merges = []
    for left, right in token_pairs:
        expected_merges.append((to_bytes(left), to_bytes(right)))
    assert merges == expected_merges

    # --- vocab: JSON object of token -> index ---
    with open(vocab_file, encoding="utf-8") as handle:
        token_to_index = json.load(handle)
    expected_vocab = {}
    for token, index in token_to_index.items():
        expected_vocab[index] = to_bytes(token)

    # The two vocabs may have been built in a different order, so compare the
    # key sets and the value sets rather than the dictionaries themselves.
    assert set(vocab.keys()) == set(expected_vocab.keys())
    assert set(vocab.values()) == set(expected_vocab.values())


In [3]:
# Run the reference comparison defined above. The recorded output of this cell
# ends in NotImplementedError, i.e. the check did not pass at the time it was run.
test_train_bpe()


        /Users/chenwang/Desktop/github/cs336/assignment1-basics/tests/fixtures/corpus.en
        500
        ['<|endoftext|>']
        {}
        


  rope_theta (float): The RoPE $\Theta$ parameter.


NotImplementedError: 

In [None]:
# Locate the training corpus relative to the shared fixtures directory instead of
# hardcoding a machine-specific absolute path, so the notebook runs on any checkout.
# Kept as a plain string so later cells see the same type as before.
input_path = str(FIXTURES_PATH / 'corpus.en')

In [42]:
# Load the whole training corpus into memory as text. The encoding is pinned to
# UTF-8 so decoding does not depend on the platform's locale default.
with open(input_path, 'r', encoding='utf-8') as f:
    text = f.read()
# Preview only the first 100 characters.
print(text[:100])

iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould


In [58]:
import regex as re

# Pre-tokenization pattern. Alternatives, in order: common English contraction
# suffixes ('s, 'd, 'm, 't, 'll, 've, 're); a run of letters, a run of digits, or
# a run of other non-space symbols, each optionally preceded by one space;
# whitespace not followed by a non-space character; any remaining whitespace.
# The \p{L} / \p{N} classes require `re` to be the third-party `regex` module
# imported at the top of this cell.
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
pretokens = re.findall(PAT, text)
# Show a bounded preview: printing every pre-token of the corpus floods the output.
print(pretokens[:20])
print(text[:100])

iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould


In [43]:
# Turn every character of the corpus into its Unicode code point and preview
# the first 100 values.
codepoints = [ord(char) for char in text]
print(codepoints[:100])

[105, 114, 111, 110, 32, 99, 101, 109, 101, 110, 116, 32, 105, 115, 32, 97, 32, 114, 101, 97, 100, 121, 32, 102, 111, 114, 32, 117, 115, 101, 32, 112, 97, 115, 116, 101, 32, 119, 104, 105, 99, 104, 32, 105, 115, 32, 108, 97, 105, 100, 32, 97, 115, 32, 97, 32, 102, 105, 108, 108, 101, 116, 32, 98, 121, 32, 112, 117, 116, 116, 121, 32, 107, 110, 105, 102, 101, 32, 111, 114, 32, 102, 105, 110, 103, 101, 114, 32, 105, 110, 32, 116, 104, 101, 32, 109, 111, 117, 108, 100]


In [51]:
from collections import Counter, OrderedDict
from itertools import pairwise

# Count every adjacent pair of code points in the corpus and show the five most
# frequent ones.
counter = Counter(pairwise(codepoints))
print('\n'.join(
    f'{chr(p1)}{chr(p2)}: {cnt}'
    for (p1, p2), cnt in counter.most_common(5)))

vocab_size = 500
special_tokens = ['<|endoftext|>']

# Base vocabulary: one entry per possible byte value, followed by the special tokens.
vocabs = {i: bytes([i]) for i in range(256)}
for st in special_tokens:
    vocabs[len(vocabs)] = st.encode('utf-8')

# NOTE: the pair counts are computed once and never updated after a merge, so this
# loop picks the most frequent adjacent pairs of the original text in descending
# order; it does not re-count pairs that involve newly merged tokens.
merges = []
while len(vocabs) < vocab_size and counter:
    (cp1, cp2), _ = counter.most_common(1)[0]
    del counter[(cp1, cp2)]
    # cp1/cp2 are Unicode code points, not byte values: bytes([cp1, cp2]) raises
    # ValueError above 255 and disagrees with the UTF-8 form for 128..255.
    # Encode each side once and reuse it so the vocab entry is exactly the
    # concatenation of the recorded merge pair.
    left = chr(cp1).encode('utf-8')
    right = chr(cp2).encode('utf-8')
    idx = len(vocabs)
    vocabs[idx] = left + right
    merges.append((left, right))

print(merges[:10])


e : 4137
 t: 2940
s : 2885
th: 2764
 a: 2214
[(b'e', b' '), (b' ', b't'), (b's', b' '), (b't', b'h'), (b' ', b'a'), (b'h', b'e'), (b'd', b' '), (b't', b' '), (b'i', b'n'), (b'n', b' ')]


In [59]:
# Integer byte values of the UTF-8 encoding of the corpus (iterating over a
# bytes object yields ints).
codepoints2 = list(text.encode('utf-8'))