In [2]:
from openai import OpenAI
# Shared API client for the cells below. No credentials are passed explicitly here;
# presumably they are picked up from the environment — confirm before sharing.
client = OpenAI()

In [3]:
!pip install --upgrade tiktoken




In [4]:
import tiktoken

In [6]:
# Load an encoding directly by its name.
ecoding = tiktoken.get_encoding("cl100k_base")

In [7]:
# Inspect the attributes and methods exposed by the encoding object.
dir(ecoding)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_core_bpe',
 '_encode_bytes',
 '_encode_only_native_bpe',
 '_encode_single_piece',
 '_mergeable_ranks',
 '_pat_str',
 '_special_tokens',
 'decode',
 'decode_batch',
 'decode_bytes',
 'decode_bytes_batch',
 'decode_single_token_bytes',
 'decode_tokens_bytes',
 'decode_with_offsets',
 'encode',
 'encode_batch',
 'encode_ordinary',
 'encode_ordinary_batch',
 'encode_single_token',
 'encode_with_unstable',
 'eot_token',
 'max_token_value',
 'n_vocab',
 'name',
 'special_tokens_set',
 'token_byte_values']

In [8]:
# The encoding object reports the name it was loaded under.
ecoding.name

'cl100k_base'

In [9]:
# Look up the encoding by model name instead of by encoding name.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [10]:
# Check which encoding the model name resolved to.
enc.name

'cl100k_base'

In [11]:
# Encode a Chinese string ("deep learning network") into a list of token ids.
enc.encode("深度学习网络")

[85315, 109, 27479, 48864, 18259, 254, 72456]

In [12]:
# Decode the token ids from the previous cell back into text (round trip).
enc.decode([85315, 109, 27479, 48864, 18259, 254, 72456])

'深度学习网络'

In [15]:
def num_tokens_from_string(encode_str: str, encoding_name="cl100k_base") -> int:
    """Count the tokens that `encode_str` produces under a named tiktoken encoding.

    Args:
        encode_str: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to load (default "cl100k_base").

    Returns:
        The length of the token list returned by the encoding's `encode`.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(encode_str)
    return len(token_ids)

In [16]:
# Quick check of the helper, using its default encoding ("cl100k_base").
num_tokens_from_string("long long ago")

3

In [18]:
# Token ids for a short English sentence.
enc.encode("to be or not to be")

[998, 387, 477, 539, 311, 387]

In [19]:
# Show the raw bytes behind each token id from the previous cell.
[enc.decode_single_token_bytes(t) for t in [998, 387, 477, 539, 311, 387]]

[b'to', b' be', b' or', b' not', b' to', b' be']

In [26]:
def compare_encodings(encode_str: str) -> None:
    """Print how `encode_str` is tokenized by three tiktoken encodings.

    For "gpt2", "p50k_base" and "cl100k_base" (in that order) this prints the
    token count, the token ids, and the raw bytes of every token.

    Args:
        encode_str: Text to tokenize with each encoding.
    """
    print(f"\nExample str:'{encode_str}'")
    for enc_name in ("gpt2", "p50k_base", "cl100k_base"):
        codec = tiktoken.get_encoding(enc_name)
        ids = codec.encode(encode_str)
        pieces = list(map(codec.decode_single_token_bytes, ids))
        report = [
            "",  # blank separator line before each encoding's report
            f"{enc_name}: {len(ids)} tokens",
            f"token integers: {ids}",
            f"token bytes: {pieces}",
        ]
        print("\n".join(report))

In [27]:
# A long English word: compare how each encoding splits it.
compare_encodings("antidisestablishmentarianism")


Example str:'antidisestablishmentarianism'

gpt2: 5 tokens
token integers: [415, 29207, 44390, 3699, 1042]
token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']

p50k_base: 5 tokens
token integers: [415, 29207, 44390, 3699, 1042]
token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']

cl100k_base: 6 tokens
token integers: [519, 85342, 34500, 479, 8997, 2191]
token bytes: [b'ant', b'idis', b'establish', b'ment', b'arian', b'ism']


In [28]:
# A Chinese phrase ("Beijing welcomes you"): compare how each encoding splits it.
compare_encodings("北京欢迎你")


Example str:'北京欢迎你'

gpt2: 11 tokens
token integers: [44293, 245, 12859, 105, 162, 105, 95, 32573, 236, 19526, 254]
token bytes: [b'\xe5\x8c', b'\x97', b'\xe4\xba', b'\xac', b'\xe6', b'\xac', b'\xa2', b'\xe8\xbf', b'\x8e', b'\xe4\xbd', b'\xa0']

p50k_base: 11 tokens
token integers: [44293, 245, 12859, 105, 162, 105, 95, 32573, 236, 19526, 254]
token bytes: [b'\xe5\x8c', b'\x97', b'\xe4\xba', b'\xac', b'\xe6', b'\xac', b'\xa2', b'\xe8\xbf', b'\x8e', b'\xe4\xbd', b'\xa0']

cl100k_base: 6 tokens
token integers: [70090, 25340, 95, 10287, 236, 57668]
token bytes: [b'\xe5\x8c\x97\xe4\xba\xac', b'\xe6\xac', b'\xa2', b'\xe8\xbf', b'\x8e', b'\xe4\xbd\xa0']


In [34]:
# Ask the API for the available models; `.data` holds the individual Model entries.
models = client.models.list()
models.data

[Model(id='dall-e-3', created=1698785189, object='model', owned_by='system'),
 Model(id='whisper-1', created=1677532384, object='model', owned_by='openai-internal'),
 Model(id='davinci-002', created=1692634301, object='model', owned_by='system'),
 Model(id='gpt-3.5-turbo-1106', created=1698959748, object='model', owned_by='system'),
 Model(id='dall-e-2', created=1698798177, object='model', owned_by='system'),
 Model(id='gpt-3.5-turbo-16k', created=1683758102, object='model', owned_by='openai-internal'),
 Model(id='tts-1-hd-1106', created=1699053533, object='model', owned_by='system'),
 Model(id='tts-1-hd', created=1699046015, object='model', owned_by='system'),
 Model(id='gpt-4', created=1687882411, object='model', owned_by='openai'),
 Model(id='gpt-4-0613', created=1686588896, object='model', owned_by='openai'),
 Model(id='gpt-4-1106-preview', created=1698957206, object='model', owned_by='system'),
 Model(id='gpt-4-vision-preview', created=1698894917, object='model', owned_by='system'

In [60]:
def num_tokens_from_messages(messages: list, model="gpt-3.5-turbo") -> int:
    """Estimate how many prompt tokens a list of chat messages will consume.

    Args:
        messages: Chat messages as a list of dicts, e.g.
            ``{"role": "system", "content": "..."}``. Every value of every dict is
            passed to the encoding's ``encode``, so values are expected to be strings.
        model: Model name used to pick the tokenizer and the per-message overhead.
            Unpinned "gpt-3.5-turbo*" / "gpt-4*" names are counted as
            "gpt-3.5-turbo-0125" / "gpt-4-turbo-2024-04-09" respectively.

    Returns:
        The estimated prompt token count: per-message overhead, the tokens of every
        value, one extra token per "name" key, plus 3 tokens for the reply primer.

    Raises:
        NotImplementedError: If ``model`` is neither one of the pinned models nor a
            name containing "gpt-3.5-turbo" or "gpt-4".
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except (KeyError, ValueError):
        # tiktoken does not know this model: warn and fall back to cl100k_base.
        # (The same variable must be bound on both paths; the loop below reads it.)
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    if model in {
        "gpt-3.5-turbo-0125",
        "gpt-4-turbo-2024-04-09",
    }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif "gpt-3.5-turbo" in model:
        # Unpinned gpt-3.5-turbo aliases may change over time; count as the pinned 0125 snapshot.
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0125")
    elif "gpt-4" in model:
        # Unpinned gpt-4 aliases may change over time; count as the pinned gpt-4-turbo snapshot.
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-turbo-2024-04-09.")
        return num_tokens_from_messages(messages, model="gpt-4-turbo-2024-04-09")
    else:
        # No counting rules are implemented for any other model.
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    num_tokens = 0
    # Sum the tokens of each message: fixed overhead plus every encoded value.
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with the assistant role
    return num_tokens
    
        
        

In [66]:
# Cross-check the local estimate against the token usage reported by the API.
# NOTE: this cell performs one real chat-completion request per model.
example_messages = [
    {
        "role": "system",
        "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English.",
    }
]
for model in [
    "gpt-3.5-turbo",
    "gpt-4-turbo-2024-04-09",
    "gpt-4",
    ]:
    print(model)
    # Local estimate computed with tiktoken.
    print(f"{num_tokens_from_messages(example_messages,model)} prompt tokens counted by num_tokens_from_messages().")
    

    # Same messages sent to the API; only the reported prompt token usage is read below.
    complt = client.chat.completions.create(
        model=model,
        messages=example_messages,
        temperature=0,
        max_tokens=1,# we're only counting input tokens here, so let's not waste tokens on the output
    )
    print(f'{complt.usage.prompt_tokens} prompt tokens counted by the OpenAI API.')
    print("")

 

gpt-3.5-turbo
25 prompt tokens counted by num_tokens_from_messages().
25 prompt tokens counted by the OpenAI API.

gpt-4-turbo-2024-04-09
25 prompt tokens counted by num_tokens_from_messages().
25 prompt tokens counted by the OpenAI API.

gpt-4
25 prompt tokens counted by num_tokens_from_messages().
25 prompt tokens counted by the OpenAI API.

