In [1]:
import tiktoken
import random

Encodings specify how text is converted into tokens. Different models use different encodings.

`tiktoken` supports three encodings used by OpenAI models:
    
| Encoding name           | OpenAI models                                       |
|-------------------------|-----------------------------------------------------|
| `cl100k_base`           | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`  |
| `p50k_base`             | Codex models, `text-davinci-002`, `text-davinci-003`|
| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci`                         |

In [2]:
encode_names = tiktoken.list_encoding_names()

print(encode_names)

['gpt2', 'r50k_base', 'p50k_base', 'p50k_edit', 'cl100k_base']


In [3]:
for encode_name in encode_names:
    encoding = tiktoken.get_encoding(encode_name)
    print(f'Encoding name: {encoding.name}, vocab size: {encoding.n_vocab}\n')

Encoding name: gpt2, vocab size: 50257

Encoding name: r50k_base, vocab size: 50257

Encoding name: p50k_base, vocab size: 50281

Encoding name: p50k_edit, vocab size: 50284

Encoding name: cl100k_base, vocab size: 100277



In [4]:
# or use tiktoken.encoding_for_model() to automatically load the correct encoding for a given model name
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [5]:
encoding = tiktoken.get_encoding('gpt2')

In [6]:
encoding.max_token_value + 1

50257

In [7]:
encoding.special_tokens_set

{'<|endoftext|>'}

In [8]:
encoding.eot_token

50256

Encode some text

In [9]:
encoded = encoding.encode("Wow! The tiktoken is great!")
print(encoded)

[22017, 0, 383, 256, 1134, 30001, 318, 1049, 0]


Convert the encoded back into text

In [10]:
encoding.decode(encoded)

'Wow! The tiktoken is great!'

In [11]:
print(encoding.encode("Great!"))
print(encoding.encode("great!"))

[13681, 0]
[18223, 0]


In [12]:
print(encoding.encode("This is\n a great\n\nnews!"))

[1212, 318, 198, 257, 1049, 198, 198, 10827, 0]


In [13]:
print(encoding.encode("\n\nP"))

[198, 198, 47]


In [14]:
for tk in [198, 628]:
    print(repr(encoding.decode([tk])))
    print(encoding.decode_single_token_bytes(tk) == b'\n')

'\n'
True
'\n\n'
False


In [15]:
enc_base = tiktoken.get_encoding("p50k_base")
max_token_value = enc_base.max_token_value + 1
print(max_token_value)

50281


In [16]:
# add custom special tokens
special_tokens_list = ['PAD', 'UNK', 'CLS', 'SEP', 'MASK', 'SOT', 'EOT']
special_tokens = {}

for i, tk in enumerate(special_tokens_list):
    special_tokens[f"<|{tk}|>"] = max_token_value+(1+i)

print(special_tokens)

{'<|PAD|>': 50282, '<|UNK|>': 50283, '<|CLS|>': 50284, '<|SEP|>': 50285, '<|MASK|>': 50286, '<|SOT|>': 50287, '<|EOT|>': 50288}


In [17]:
# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="p50k_custom",
    pat_str=enc_base._pat_str,
    mergeable_ranks=enc_base._mergeable_ranks,
    special_tokens={
        **enc_base._special_tokens,
        **special_tokens,
    }
)


print(enc.special_tokens_set)
print(enc.max_token_value + 1)

{'<|MASK|>', '<|endoftext|>', '<|EOT|>', '<|PAD|>', '<|CLS|>', '<|SOT|>', '<|UNK|>', '<|SEP|>'}
50289


In [18]:
def compare_encodings(example_string: str) -> None:
    """Prints a comparison of three string encodings."""
    # print the example string
    print(f'\nExample string: "{example_string}"')
    # for each encoding, print the # of tokens, the token integers, and the token bytes
    for encoding_name in ["gpt2", "p50k_base", "cl100k_base"]:
        encoding = tiktoken.get_encoding(encoding_name)
        token_integers = encoding.encode(example_string)
        num_tokens = len(token_integers)
        token_bytes = [encoding.decode_single_token_bytes(token) for token in token_integers]
        print()
        print(f"{encoding_name}: {num_tokens} tokens")
        print(f"token integers: {token_integers}")
        print(f"token bytes: {token_bytes}")
        

In [19]:
compare_encodings("This is\na great\n\nnews!")


Example string: "This is
a great

news!"

gpt2: 9 tokens
token integers: [1212, 318, 198, 64, 1049, 198, 198, 10827, 0]
token bytes: [b'This', b' is', b'\n', b'a', b' great', b'\n', b'\n', b'news', b'!']

p50k_base: 9 tokens
token integers: [1212, 318, 198, 64, 1049, 198, 198, 10827, 0]
token bytes: [b'This', b' is', b'\n', b'a', b' great', b'\n', b'\n', b'news', b'!']

cl100k_base: 8 tokens
token integers: [2028, 374, 198, 64, 2294, 271, 10189, 0]
token bytes: [b'This', b' is', b'\n', b'a', b' great', b'\n\n', b'news', b'!']


In [20]:
compare_encodings("So far everything is doing great!")


Example string: "So far everything is doing great!"

gpt2: 7 tokens
token integers: [2396, 1290, 2279, 318, 1804, 1049, 0]
token bytes: [b'So', b' far', b' everything', b' is', b' doing', b' great', b'!']

p50k_base: 7 tokens
token integers: [2396, 1290, 2279, 318, 1804, 1049, 0]
token bytes: [b'So', b' far', b' everything', b' is', b' doing', b' great', b'!']

cl100k_base: 7 tokens
token integers: [4516, 3117, 4395, 374, 3815, 2294, 0]
token bytes: [b'So', b' far', b' everything', b' is', b' doing', b' great', b'!']


In [21]:
compare_encodings("你好吗？")


Example string: "你好吗？"

gpt2: 9 tokens
token integers: [19526, 254, 25001, 121, 28938, 245, 171, 120, 253]
token bytes: [b'\xe4\xbd', b'\xa0', b'\xe5\xa5', b'\xbd', b'\xe5\x90', b'\x97', b'\xef', b'\xbc', b'\x9f']

p50k_base: 9 tokens
token integers: [19526, 254, 25001, 121, 28938, 245, 171, 120, 253]
token bytes: [b'\xe4\xbd', b'\xa0', b'\xe5\xa5', b'\xbd', b'\xe5\x90', b'\x97', b'\xef', b'\xbc', b'\x9f']

cl100k_base: 5 tokens
token integers: [57668, 53901, 7305, 245, 11571]
token bytes: [b'\xe4\xbd\xa0', b'\xe5\xa5\xbd', b'\xe5\x90', b'\x97', b'\xef\xbc\x9f']


In [22]:
compare_encodings("2 + 2 = 4")


Example string: "2 + 2 = 4"

gpt2: 5 tokens
token integers: [17, 1343, 362, 796, 604]
token bytes: [b'2', b' +', b' 2', b' =', b' 4']

p50k_base: 5 tokens
token integers: [17, 1343, 362, 796, 604]
token bytes: [b'2', b' +', b' 2', b' =', b' 4']

cl100k_base: 7 tokens
token integers: [17, 489, 220, 17, 284, 220, 19]
token bytes: [b'2', b' +', b' ', b'2', b' =', b' ', b'4']
