What Unicode character does chr(0) return? The chr(0) function returns the NULL character (U+0000).

In [3]:
chr(0)

'\x00'

How does this character’s string representation (__repr__()) differ from its printed representa-
tion? The string representation (__repr__()) of the NULL character shows it as '\x00', while its printed representation shows nothing at all.

In [4]:
chr(0).__repr__() # There are 6 characters in chr(0).__repr__()

"'\\x00'"

What happens when this character occurs in text? When the NULL character occurs in text, it often acts as a terminator or a placeholder and is typically not rendered visually, meaning it will appear as an invisible character or truncate the string in some environments.

In [5]:
chr(0)
print(chr(0))
"this is a test" + chr(0) + "string"
print("this is a test" + chr(0) + "string")

 
this is a test string


In [6]:
test_string = "hello! こんにちは!"
utf8_encoded = test_string.encode("utf-8")
print(utf8_encoded)
print(type(utf8_encoded))
list(utf8_encoded)
print(len(test_string))
print(len(utf8_encoded))
print(utf8_encoded.decode("utf-8"))

b'hello! \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf!'
<class 'bytes'>
13
23
hello! こんにちは!


In [7]:
test_string = "hello! こんにちは!"
utf16_encoded = test_string.encode("utf-16")
print(utf16_encoded)
print(type(utf16_encoded))
list(utf16_encoded)
print(len(test_string))
print(len(utf16_encoded))
print(utf16_encoded.decode("utf-16"))

b'\xff\xfeh\x00e\x00l\x00l\x00o\x00!\x00 \x00S0\x930k0a0o0!\x00'
<class 'bytes'>
13
28
hello! こんにちは!


In [8]:
test_string = "hello! こんにちは!"
utf32_encoded = test_string.encode("utf-32")
print(utf32_encoded)
print(type(utf32_encoded))
list(utf32_encoded)
print(len(test_string))
print(len(utf32_encoded))
print(utf32_encoded.decode("utf-32"))

b'\xff\xfe\x00\x00h\x00\x00\x00e\x00\x00\x00l\x00\x00\x00l\x00\x00\x00o\x00\x00\x00!\x00\x00\x00 \x00\x00\x00S0\x00\x00\x930\x00\x00k0\x00\x00a0\x00\x00o0\x00\x00!\x00\x00\x00'
<class 'bytes'>
13
56
hello! こんにちは!


What are some reasons to prefer training our tokenizer on UTF-8 encoded bytes, rather than
UTF-16 or UTF-32? It may be helpful to compare the output of these encodings for various
input strings. 

What are some reasons to prefer training our tokenizer on UTF-8 encoded bytes, rather than
UTF-16 or UTF-32? It may be helpful to compare the output of these encodings for various
input strings.

Consider the following (incorrect) function, which is intended to decode a UTF-8 byte string into
a Unicode string. Why is this function incorrect? Provide an example of an input byte string
that yields incorrect results.

In [9]:
def decode_utf8_bytes_to_str_wrong(bytestring: bytes):
  return "".join([bytes([b]).decode("utf-8") for b in bytestring])

decode_utf8_bytes_to_str_wrong("hello".encode("utf-8"))

'hello'

In [10]:
An example input byte string for which decode_utf8_bytes_to_str_wrong pro-
duces incorrect output, with a one-sentence explanation of why the function is incorrect.

SyntaxError: invalid syntax (3311871793.py, line 1)

In [None]:
decode_utf8_bytes_to_str_wrong("は".encode("utf-8"))

Give a two byte sequence that does not decode to any Unicode character(s).

In [None]:
bytes([0, 128]).decode("utf-8")

In [14]:
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

In [15]:
# requires `regex` package
import regex as re
re.findall(PAT, "some text that i'll pre-tokenize")

['some', ' text', ' that', ' i', "'ll", ' pre', '-', 'tokenize']

In [None]:
import re, collections

def get_stats(vocab):
  pairs = collections.defaultdict(int)
  for word, freq in vocab.items():
    symbols = word.split()
    for i in range(len(symbols)-1):
      pairs[symbols[i],symbols[i+1]] += freq
  return pairs

def merge_vocab(pair, v_in):
  v_out = {}
  bigram = re.escape(' '.join(pair))
  p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
  for word in v_in:
    w_out = p.sub(''.join(pair), word)
    v_out[w_out] = v_in[word]
  return v_out

vocab = {'l o w </w>' : 5, 'l o w e r </w>' : 2,
'n e w e s t </w>':6, 'w i d e s t </w>':3}

num_merges = 10
for i in range(num_merges):
  pairs = get_stats(vocab)
  best = max(pairs, key=pairs.get)
  vocab = merge_vocab(best, vocab)
  print(best)

In [78]:
from collections import defaultdict

tokens: dict[tuple[bytes], int] = { ('l', 'o', 'w') :5, ('l', 'o', 'w', 'e', 'r') :2, ('w', 'i', 'd' , 'e' , 's' , 't'): 3, ('n', 'e', 'w' , 'e' , 's' , 't'): 6  }
vocab = ['<|endoftext|>'] + [chr(b) for b in range(256)]

def merge(tokens: dict[tuple[bytes], int], vocab: list[tuple[bytes]], merges: list[tuple[bytes, bytes]]):
  pair_dict: dict[tuple[tuple[bytes]], int] = defaultdict(int)
  for token, cnt in tokens.items():
    for i in range(1, len(token)):
      new_pair = (token[i-1], token[i])
      pair_dict[new_pair] = pair_dict[new_pair] + cnt

  max_cnt = 0
  max_pair = None
  for pair, cnt in pair_dict.items():
    if cnt > max_cnt or (cnt == max_cnt and pair > max_pair):
      max_cnt = cnt
      max_pair = pair

  print("!!!", max_pair)
  max_pair_concat = max_pair[0] + max_pair[1]  
  merges.append((max_pair[0], max_pair[1]))
  vocab[len(vocab)] = max_pair
  print("@@@", max_pair_concat)
  print("###", merges[-1])

  new_tokens: dict[tuple[tuple[bytes]], int]  = {}
  for token, cnt in tokens.items():
    new_token = []
    for i in range(len(token)):
        new_token.append(token[i])
        if len(new_token) >= 2 and (new_token[-2], new_token[-1]) == max_pair:
            new_token[-2:] = (max_pair_concat,)
    new_tokens[tuple(new_token)] = cnt

  return new_tokens


In [79]:
def get_token_count_pair(input: list[str]):
    token = defaultdict(int)
    for i in input:
        x = tuple([bytes(tuple([b])) for b in i])
        token[x] += 1
    return token

input = [ t.encode("utf-8") for t in ['u', ' don', "'t", ' have', 'u']]
merges = []
tokens = get_token_count_pair(input)
vocab = {}
tokens = merge(tokens, vocab, merges)
tokens


!!! (b'v', b'e')
@@@ b've'
### (b'v', b'e')


{(b'u',): 2,
 (b' ', b'd', b'o', b'n'): 1,
 (b"'", b't'): 1,
 (b' ', b'h', b'a', b've'): 1}

In [18]:
def split_by_sepcial_tokens(text: str, special_tokens: list[str]):
  escape_special_tokens = [re.escape(s) for s in special_tokens]
  return re.split("|".join(escape_special_tokens), text)

text = "[Doc 1]<|endoftext|>[Doc 2]<|endofline|>[Doc 1]<|endoftext|>[Doc 2]"
special_tokens = ["<|endoftext|>", "<|endofline|>"]
print(split_by_sepcial_tokens(text, special_tokens))

['[Doc 1]', '[Doc 2]', '[Doc 1]', '[Doc 2]']


In [19]:
def pre_tokenize_and_encode(docs: list[str]):
    PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
    result = []
    for doc in docs:
        splits = re.findall(PAT, doc)
        result += [s.encode("utf-8") for s in splits]
    return result


In [70]:
from pretokenization_example import find_chunk_boundaries

def train_bpe(input_path: str, vocab_size: int, special_tokens: list[str]):
  vocab: dict[int, bytes] = {}
  merges: list[tuple[bytes, bytes]] = []

  num_processes = 10
  with open(input_path, "rb") as f:
    boundaries = find_chunk_boundaries(
        f, num_processes, "<|endoftext|>".encode("utf-8")
    )
    f.seek(0)
    # read the first segments for now
    text = f.read(boundaries[1]).decode("utf-8")
    text_split_by_sepcial_tokens = split_by_sepcial_tokens(text, special_tokens)
    encoded_tokens = pre_tokenize_and_encode(text_split_by_sepcial_tokens)

    vocab[0] = '<|endoftext|>'
    for i in range(256):
        vocab[i+1] = chr(i)
    token_cnt_pairs = get_token_count_pair(encoded_tokens)

    num_steps = vocab_size - len(vocab)
    for i in range(num_steps):
      token_cnt_pairs = merge(token_cnt_pairs, vocab, merges)
  return vocab, merges

In [72]:
vocab, merges = train_bpe('../data/TinyStoriesV2-GPT4-valid.txt', 500, ["<|endoftext|>"])
print (merges)

[(b' ', b't'), (b'h', b'e'), (b' ', b'a'), (b' ', b's'), (b' ', b'w'), (b'n', b'd'), (b' t', b'he'), (b'e', b'd'), (b' ', b'b'), (b' t', b'o'), (b' a', b'nd'), (b' ', b'h'), (b' ', b'f'), (b'i', b'n'), (b' w', b'a'), (b' ', b'T'), (b'r', b'e'), (b'i', b't'), (b'o', b'u'), (b' ', b'l'), (b' ', b'd'), (b' ', b'c'), (b' ', b'p'), (b'a', b'y'), (b' wa', b's'), (b'e', b'r'), (b' ', b'm'), (b'o', b'm'), (b' ', b'he'), (b' T', b'he'), (b'i', b's'), (b' ', b'n'), (b'o', b'n'), (b' s', b'a'), (b'i', b'd'), (b'l', b'l'), (b'a', b'r'), (b'i', b'm'), (b' h', b'a'), (b'a', b't'), (b' ', b'g'), (b' ', b'S'), (b'o', b't'), (b'in', b'g'), (b'e', b'n'), (b'a', b'n'), (b'l', b'e'), (b'o', b'r'), (b'i', b'r'), (b'a', b'm'), (b' ', b'H'), (b'e', b't'), (b' t', b'h'), (b' ', b'it'), (b'i', b'g'), (b'i', b'l'), (b' The', b'y'), (b' H', b'e'), (b' ', b'in'), (b' ', b'"'), (b' p', b'l'), (b'v', b'er'), (b'o', b'w'), (b'r', b'i'), (b'u', b't'), (b' ', b'u'), (b' sa', b'id'), (b' d', b'ay'), (b'p', b'p'), (b'it