## Tokenizing Lab

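This notebook runs several small experiments with the [**tiktoken**](https://github.com/openai/tiktoken) library, comparing how the older `cl100k_base` encoding and the newer `o200k_base` encoding split the same text into tokens.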

In [12]:
import tiktoken

enc_legacy = tiktoken.get_encoding("cl100k_base")     # cl100k_base = GPT-4 / GPT-3.5
enc_modern = tiktoken.get_encoding("o200k_base")      # o200k_base  = GPT-4o / GPT-4o-mini

def run_tokenization(text, tokenizer):
    """Encode ``text`` and print the token ids, then each token's text.

    Args:
        text: The string to tokenize.
        tokenizer: An object providing ``encode(text)`` and
            ``decode_single_token_bytes(token)``.

    Returns:
        None. Two lines are printed: the list of token ids, followed by a
        list holding the decoded string for each individual token.
    """
    token_ids = tokenizer.encode(text)
    print(token_ids)

    pieces = []
    for token_id in token_ids:
        raw_bytes = tokenizer.decode_single_token_bytes(token_id)
        # A single token's bytes may not be valid UTF-8 on their own;
        # 'replace' substitutes U+FFFD for such bytes instead of raising.
        pieces.append(raw_bytes.decode('utf-8', errors='replace'))
    print(pieces)

In [13]:
# 1. The Multi-word test
# Run the same sentence through both encodings, legacy first.
text = "tiktoken is great!"
for enc in (enc_legacy, enc_modern):
    run_tokenization(text, enc)

[83, 1609, 5963, 374, 2294, 0]
['t', 'ik', 'token', ' is', ' great', '!']
[83, 8251, 2488, 382, 2212, 0]
['t', 'ikt', 'oken', ' is', ' great', '!']


In [14]:
# 2. The Case Sensitive test
# Only the first letter's case differs between the two inputs; compare
# the token ids each encoding assigns to them (legacy first, then modern).
for text in ("hello", "Hello"):
    for enc in (enc_legacy, enc_modern):
        run_tokenization(text, enc)

[15339]
['hello']
[24912]
['hello']
[9906]
['Hello']
[13225]
['Hello']


In [15]:
# 3. The Multilingual test (Huge difference here!)
# Tokens are decoded one at a time, so a token that ends in the middle of a
# multi-byte UTF-8 character is displayed with U+FFFD replacement characters.
text = "שלום"
for enc in (enc_legacy, enc_modern):
    run_tokenization(text, enc)


[59511, 50391, 37769, 251]
['ש', 'ל', 'ו�', '�']
[106154]
['שלום']
