# Unicode Code Points

In [2]:
ord('h')

104

In [3]:
s = 'Hello world!'
[ord(c) for c in s]

[72, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 33]

In [7]:
list(s.encode('utf-8'))

[72, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 33]

# Tokenizer / LLM diagram

In [60]:
text = 'A Programmer’s Introduction to Unicode March 3, 2017 · Coding · 25 Comments Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception. A few months ago, I got interested in Unicode and decided to spend some time learning more about it in detail. In this article, I’ll give an introduction to it from a programmer’s point of view. I’m going to focus on the character set and what’s involved in working with strings and files of Unicode text. However, in this article I’m not going to talk about fonts, text layout/shaping/rendering, or localization in detail—those are separate issues, beyond my scope (and knowledge) here.'

tokens = text.encode('utf-8')
tokens = list(map(int, tokens))

In [61]:
def get_statistics(ids: list) -> dict:
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids: list, pair: tuple, index: int) -> list:
    # In the list of ids, replace each occurrence of the pair with the index
    new_ids = []
    i = 0
    while i < len(ids):
        if ids[i] == pair[0] and i + 1 < len(ids) and ids[i + 1] == pair[1]:
            new_ids.append(index)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids

In [62]:
vocab_size = 276
num_merges = vocab_size - 256
ids = list(tokens)

merges = {}
for i in range(num_merges):
    stats = get_statistics(ids)
    top_pair = max(stats, key=stats.get)
    index = 256 + i
    print('Mering pair: {} into a new token {}'.format(top_pair, index))
    ids = merge(ids, top_pair, index)
    merges[top_pair] = index

Mering pair: (101, 32) into a new token 256
Mering pair: (105, 110) into a new token 257
Mering pair: (115, 32) into a new token 258
Mering pair: (226, 128) into a new token 259
Mering pair: (32, 116) into a new token 260
Mering pair: (240, 159) into a new token 261
Mering pair: (97, 110) into a new token 262
Mering pair: (97, 114) into a new token 263
Mering pair: (257, 103) into a new token 264
Mering pair: (116, 32) into a new token 265
Mering pair: (101, 114) into a new token 266
Mering pair: (100, 32) into a new token 267
Mering pair: (44, 32) into a new token 268
Mering pair: (111, 100) into a new token 269
Mering pair: (116, 105) into a new token 270
Mering pair: (111, 110) into a new token 271
Mering pair: (111, 114) into a new token 272
Mering pair: (259, 153) into a new token 273
Mering pair: (260, 104) into a new token 274
Mering pair: (85, 110) into a new token 275


In [63]:
len(ids)

911

In [64]:
merges

{(101, 32): 256,
 (105, 110): 257,
 (115, 32): 258,
 (226, 128): 259,
 (32, 116): 260,
 (240, 159): 261,
 (97, 110): 262,
 (97, 114): 263,
 (257, 103): 264,
 (116, 32): 265,
 (101, 114): 266,
 (100, 32): 267,
 (44, 32): 268,
 (111, 100): 269,
 (116, 105): 270,
 (111, 110): 271,
 (111, 114): 272,
 (259, 153): 273,
 (260, 104): 274,
 (85, 110): 275}

In [65]:
print('Token length: {}'.format(len(tokens)))
print('Ids length: {}'.format(len(ids)))
print('Compress ratio: {:.2f}X'.format(len(tokens) / len(ids)))

Token length: 1196
Ids length: 911
Compress ratio: 1.31X


# Decoding (int -> string)

In [79]:
vocab = {idx: bytes([idx]) for idx in range(256)}

for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

def decode(ids: list) -> str:
    tokens = b''.join(vocab[idx] for idx in ids)
    text = tokens.decode(encoding='utf-8', errors='replace')
    return text

In [94]:
print(decode([110]))

n


# Encoding (string -> int)

In [100]:
def encode(text: str) -> list[int]:
    tokens = list(text.encode(encoding='utf-8'))
    while len(tokens) > 1:
        stats = get_statistics(tokens)
        pair = min(stats, key=lambda x: merges.get(x, float('inf')))
        if pair not in merges:
            break
        idx = merges[pair]
        tokens = merge(tokens, pair, idx)
    return tokens

In [101]:
print(encode('h'))

[104]


In [103]:
print(decode(encode('Hello world')))

Hello world


In [104]:
text2 = decode(encode(text))
print(text2 == text)

True


# Forced splits using regex patterns (GPT series)

In [1]:
import regex as re

gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+'""")

print(re.findall(gpt2pat, 'hello how are you!!???'))
print(re.findall(gpt2pat, 'I\'m fine, THANK\'s        you121   '))

['hello', ' how', ' are', ' you', '!!???']
['I', "'m", ' fine', ',', ' THANK', "'s", '       ', ' you', '121', '   ']


In [34]:
example = """
for i in range(1, 101):
    if i % 3 == 0 and i % 5 == 0:
        print('FizzBuzz')
    elif i % 3 == 0:
        print('Fizz')
    elif i % 5 == 0:
        print('Buzz')
    else:
        print(i)
"""

print(re.findall(gpt2pat, example))

['for', ' i', ' in', ' range', '(', '1', ',', ' 101', '):', '\n   ', ' if', ' i', ' %', ' 3', ' ==', ' 0', ' and', ' i', ' %', ' 5', ' ==', ' 0', ':', '\n       ', ' print', "('", 'FizzBuzz', "')", '\n   ', ' elif', ' i', ' %', ' 3', ' ==', ' 0', ':', '\n       ', ' print', "('", 'Fizz', "')", '\n   ', ' elif', ' i', ' %', ' 5', ' ==', ' 0', ':', '\n       ', ' print', "('", 'Buzz', "')", '\n   ', ' else', ':', '\n       ', ' print', '(', 'i', ')', '\n']


# Different between GPT-2 and GPT-4 

## tiktoken library

In [1]:
import tiktoken

# GPT-2 (does not merge spaces)
enc = tiktoken.get_encoding('gpt2')
print(enc.encode('   hello world!!!'))

# GPT-4 (merges spaces)
enc = tiktoken.get_encoding('cl100k_base')
print(enc.encode('   hello world!!!'))

[220, 220, 23748, 995, 10185]
[256, 24748, 1917, 12340]


# Encoder of GPT-2

In [2]:
!wget https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/vocab.bpe
!wget https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/encoder.json

--2024-08-24 19:02:22--  https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/vocab.bpe
Resolving openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)... 57.150.97.129
Connecting to openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)|57.150.97.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 456318 (446K) [application/octet-stream]
Saving to: ‘vocab.bpe’


2024-08-24 19:02:26 (350 KB/s) - ‘vocab.bpe’ saved [456318/456318]

--2024-08-24 19:02:26--  https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/encoder.json
Resolving openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)... 57.150.97.129
Connecting to openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)|57.150.97.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1042301 (1018K) [application/json]
Saving to: ‘encoder.json’


2024-08-24 19:02:29 (588 KB/s) - ‘encoder.json’ saved [10423

In [6]:
import os, json

with open('encoder.json', 'r') as f:
    encoder = json.load(f) # ~ "vocab"

with open('vocab.bpe', 'r', encoding='utf-8') as f:
    bpe_data = f.read()
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] # ~ "merges"

In [7]:
print(len(encoder)) # 256 raw byte tokens + 50000 merge tokens + 1 special token ("<|endoftext|>")
print(len(bpe_merges))

50257
50000


## Special tokens

In [13]:
encoder['<|endoftext|>']

50256

## sentencepiece

In [3]:
import sentencepiece as spm
import os

In [2]:
with open('random.txt', 'w', encoding='utf-8') as f:
    f.write('SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end system that does not depend on language-specific pre/postprocessing.')

In [29]:
options = {
    'input': 'random.txt', # input file
    'input_format': 'text', # input format
    'model_prefix': 'tok4000', # output model prefix
    'model_type': 'bpe', # model type (BPE algorithm)
    'vocab_size': '400', # vocabulary size
    'normalization_rule_name': 'identity',
    'remove_extra_whitespaces': False,
    'input_sentence_size': 200000000, # max number of training sentences
    'max_sentence_length': 4192, # max number of bytes per sentence
    'seed_sentencepiece_size': 1000000,
    'shuffle_input_sentence': True,
    'character_coverage': 0.99995, # rare characters coverage
    'byte_fallback': True, # decompose unknown characters into utf-8 bytes pieces
    'split_digits': True, # merge rules for digits
    'split_by_unicode_script': True,
    'split_by_whitespace': True,
    'split_by_number': True,
    'max_sentencepiece_length': 16,
    'add_dummy_prefix': True,
    'allow_whitespace_only_pieces': True,
    'unk_id': 0, # special tokens ([UNK] - required), rest are optional
    'bos_id': 1,
    'eos_id': 2,
    'pad_id': -1,
    'num_threads': os.cpu_count(), # use all avaiable system resources
}

# Train SentencePiece model
spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: random.txt
  input_format: text
  model_prefix: tok4000
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 8
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  d

In [30]:
sp = spm.SentencePieceProcessor()
sp.load('tok4000.model')
vocab = [[sp.id_to_piece(idx), idx] for idx in range(sp.get_piece_size())]
vocab

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5

In [31]:
ids = sp.encode('hello world123', out_type=int)
print(ids)

[362, 378, 361, 372, 358, 313, 269, 372, 370, 52, 53, 54]


In [32]:
print([sp.id_to_piece(idx) for idx in ids])

['▁', 'h', 'e', 'l', 'lo', '▁w', 'or', 'l', 'd', '<0x31>', '<0x32>', '<0x33>']
