In [1]:
import html

import tiktoken
from IPython.display import Markdown
# Shared tokenizer for the notebook: the tiktoken encoding registered under the
# name "cl100k_base". The helper functions below use it as their default
# `tokenizer` argument.
MY_TOKENIZER = tiktoken.get_encoding("cl100k_base")

In [2]:
def tokenization(text, tokenizer=MY_TOKENIZER):
    """Break ``text`` into the decoded text of each of its tokens.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text)`` and ``decode(tokens)``.
            Defaults to the notebook-wide ``MY_TOKENIZER``.

    Returns:
        A list holding one decoded piece per encoded token, in encoding order.
    """
    token_ids = tokenizer.encode(text)

    # Each token is decoded on its own (as a one-element list) so the pieces
    # line up one-to-one with the tokens.
    # NOTE(review): a token that carries only part of a multi-byte character
    # presumably cannot be decoded cleanly in isolation -- confirm how the
    # tokenizer's decode() handles that case.
    pieces = []
    for token_id in token_ids:
        pieces.append(tokenizer.decode([token_id]))
    return pieces

In [4]:
def _token_to_html(token):
    """Return the HTML fragment used to draw a single token."""
    # A token made up solely of newline characters is drawn as visible line
    # breaks: two <br> per newline, which matches the single-newline case.
    if token and not token.strip('\n'):
        return '<br><br>' * len(token)
    # html.escape converts "&" as well as "<" and ">", so a token such as
    # "&amp;" is shown literally instead of being read as an HTML entity.
    # quote=False leaves quotes alone; they are harmless in element content.
    escaped = html.escape(token, quote=False)
    return f'<span style="border: 1px solid orange; padding: 2px; margin: 2px;">{escaped}</span>'


def display_tokenization(text):
    """Render the tokens of ``text`` as outlined boxes plus a token count.

    The text is split with ``tokenization`` (using its default tokenizer).
    Every token becomes an orange-bordered ``<span>``; newline-only tokens
    become line breaks. The fragments are joined with spaces, followed by a
    "Number of tokens" footer, and shown through ``display(Markdown(...))``.

    Args:
        text: The string to tokenize and display.

    Returns:
        None. The result is displayed, not returned.
    """
    tokens = tokenization(text)
    boxes = ' '.join(_token_to_html(token) for token in tokens)
    display(Markdown(f'{boxes}<br><br><b>Number of tokens:</b> {len(tokens)}'))

In [5]:
# Demo: show the token boxes for a plain English sentence, then for a Spanish
# sentence containing accented letters and inverted punctuation.
display_tokenization("The quick brown fox jumps over the lazy dog")
display_tokenization("Hola, ¿cómo estás?")

<span style="border: 1px solid orange; padding: 2px; margin: 2px;">The</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> quick</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> brown</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> fox</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> jumps</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> over</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> the</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> lazy</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> dog</span><br><br><b>Number of tokens:</b> 9

<span style="border: 1px solid orange; padding: 2px; margin: 2px;">Hola</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;">,</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> ¿</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;">c</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;">ómo</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;"> est</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;">ás</span> <span style="border: 1px solid orange; padding: 2px; margin: 2px;">?</span><br><br><b>Number of tokens:</b> 8

In [6]:
# Full conversation length calculation.
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def chat_length(text, tokenizer=MY_TOKENIZER):
    """Count the tokens of a chat conversation, including fixed overheads.

    Args:
        text: Either a single string, which is wrapped as one ``"user"``
            message, or an iterable of message dicts. Every value of every
            message dict is passed to ``tokenizer.encode``.
        tokenizer: Object exposing ``encode(text)``. Defaults to the
            notebook-wide ``MY_TOKENIZER``.

    Returns:
        The total as an int: 3 per message, plus the encoded length of every
        value, plus 1 for each ``'name'`` key, plus a final 3.
    """
    # Fixed overheads; the values follow the cookbook recipe linked above.
    per_message_overhead = 3
    name_surcharge = 1
    closing_overhead = 3

    if isinstance(text, str):
        messages = [{"role": "user", "content": text}]
    else:
        messages = text

    total = 0
    for message in messages:
        total += per_message_overhead
        for field, value in message.items():
            total += len(tokenizer.encode(value))
            if field == 'name':
                total += name_surcharge

    return total + closing_overhead

In [7]:
# Example exchange: one system message followed by one user message.
my_conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Why should I switch to a plant-based diet?"},
]

# Show the total token count reported by chat_length for the whole exchange.
display(f"{chat_length(my_conversation)} tokens")

'27 tokens'