In [1]:
!pip install tiktoken -q

In [2]:
import tiktoken
import numpy as np

# Name of the tiktoken encoding to load; `tokenizer` is reused by later cells.
encoding_name = "cl100k_base"
tokenizer = tiktoken.get_encoding(encoding_name)

# Show the encoding's `n_vocab` value with thousands separators.
print(f"Vocabulary size: {tokenizer.n_vocab:,} tokens")

Vocabulary size: 100,277 tokens


In [3]:
# Encode a short string and show the token ids it maps to.
text = "Hello world"
tokens = tokenizer.encode(text)

print(f"Text: '{text}'")
print(f"Tokens: {tokens}")
print(f"Number of tokens: {len(tokens)}")

# Decode each id on its own to reveal the text fragment it stands for.
# The separator is a real right arrow (U+2192), not its mis-decoded UTF-8 bytes.
print("\nToken breakdown:")
for token in tokens:
    print(f"  {token} → '{tokenizer.decode([token])}'")

Text: 'Hello world'
Tokens: [9906, 1917]
Number of tokens: 2

Token breakdown:
  9906 → 'Hello'
  1917 → ' world'


In [5]:
# Sample inputs covering plain words, a contraction, a rare long word,
# an emoji, an accented character, and leading/trailing spaces.
# The emoji and accent samples must be the real Unicode characters so that
# the multi-byte cases are actually exercised.
examples = [
    "Hello world",
    "don't",
    "artificial intelligence",
    "I love AI",
    "supercalifragilisticexpialidocious",
    "🚀",
    "café",
    "    spaces    ",
]

print("How different texts get tokenized:\n")
for text in examples:
    tokens = tokenizer.encode(text)
    print(f"'{text}'")
    print(f"  → {len(tokens)} tokens: {tokens}")

    # Decode ids one at a time to show the fragment behind each token.
    # A multi-byte character may be split across several tokens; such partial
    # fragments are displayed as the U+FFFD replacement character.
    pieces = [tokenizer.decode([t]) for t in tokens]
    print(f"  → pieces: {pieces}")
    print()

How different texts get tokenized:

'Hello world'
  â†’ 2 tokens: [9906, 1917]
  â†’ pieces: ['Hello', ' world']

'don't'
  â†’ 2 tokens: [15357, 956]
  â†’ pieces: ['don', "'t"]

'artificial intelligence'
  â†’ 3 tokens: [472, 16895, 11478]
  â†’ pieces: ['art', 'ificial', ' intelligence']

'I love AI'
  â†’ 3 tokens: [40, 3021, 15592]
  â†’ pieces: ['I', ' love', ' AI']

'supercalifragilisticexpialidocious'
  â†’ 11 tokens: [13066, 3035, 278, 333, 4193, 321, 4633, 4683, 532, 307, 78287]
  â†’ pieces: ['sup', 'erc', 'al', 'if', 'rag', 'il', 'istic', 'exp', 'ial', 'id', 'ocious']

'ðŸš€'
  â†’ 3 tokens: [9468, 248, 222]
  â†’ pieces: ['ï¿½', 'ï¿½', 'ï¿½']

'cafÃ©'
  â†’ 2 tokens: [936, 59958]
  â†’ pieces: ['ca', 'fÃ©']

'    spaces    '
  â†’ 3 tokens: [262, 12908, 257]
  â†’ pieces: ['   ', ' spaces', '    ']



In [6]:
# Tokenize a custom sentence and summarise the result.
my_text = "my name is chaitanya"
tokens = tokenizer.encode(my_text)

# Decode every id separately so the individual fragments are visible.
decoded_pieces = [tokenizer.decode([token_id]) for token_id in tokens]

# Assemble the report first, then emit it with a single print call.
summary_lines = [
    f"Text: '{my_text}'",
    f"Tokens: {tokens}",
    f"Number of tokens: {len(tokens)}",
    f"Pieces: {decoded_pieces}",
]
print("\n".join(summary_lines))

Text: 'my name is chaitanya'
Tokens: [2465, 836, 374, 523, 1339, 25041]
Number of tokens: 6
Pieces: ['my', ' name', ' is', ' ch', 'ait', 'anya']


In [7]:
# Representative snippets of different content types, keyed by a display label.
test_cases = {
    "English prose": "The quick brown fox jumps over the lazy dog.",
    "Python code": "def hello():\n    print('Hello, world!')",
    "JSON": '{"name": "Alice", "age": 30, "city": "NYC"}',
    "Numbers": "1234567890 9876543210 1111111111",
    "URL": "https://www.example.com/path/to/page?query=value",
}

print("Token efficiency comparison:\n")
for name, text in test_cases.items():
    tokens = tokenizer.encode(text)
    chars = len(text)
    # Average number of characters covered by one token.
    # Guard the division so an empty sample added later cannot raise ZeroDivisionError.
    ratio = chars / len(tokens) if tokens else 0.0
    print(f"{name}:")
    print(f"  {chars} chars → {len(tokens)} tokens ({ratio:.1f} chars/token)")
    print()

Token efficiency comparison:

English prose:
  44 chars → 10 tokens (4.4 chars/token)

Python code:
  39 chars → 11 tokens (3.5 chars/token)

JSON:
  43 chars → 19 tokens (2.3 chars/token)

Numbers:
  32 chars → 14 tokens (2.3 chars/token)

URL:
  48 chars → 11 tokens (4.4 chars/token)



In [10]:
# Show how a single word is split into sub-word tokens.
word = "strawberry"
tokens = tokenizer.encode(word)

print(f"Word: '{word}'")
print(f"Tokens: {tokens}")
# Decode each id separately to expose the fragment boundaries.
print(f"Pieces: {[tokenizer.decode([t]) for t in tokens]}")
print()
print("The model sees these pieces, not individual letters!")
# The dash is a real em dash (U+2014), not its mis-decoded UTF-8 bytes.
print("Counting 'r's requires looking INSIDE tokens — that's hard.")

Word: 'strawberry'
Tokens: [496, 675, 15717]
Pieces: ['str', 'aw', 'berry']

The model sees these pieces, not individual letters!
Counting 'r's requires looking INSIDE tokens — that's hard.


In [11]:
# Show the fragments that make up a common word after tokenization.
word = "hello"
tokens = tokenizer.encode(word)
# The separator is a real right arrow (U+2192), not its mis-decoded UTF-8 bytes.
print(f"'{word}' → tokens: {[tokenizer.decode([t]) for t in tokens]}")
print()
print("If 'hello' is ONE token, the model can't easily reverse it.")
print("It would need to decompose something it sees as atomic.")

'hello' → tokens: ['hello']

If 'hello' is ONE token, the model can't easily reverse it.
It would need to decompose something it sees as atomic.
