Skip to content

Commit

Permalink
Sync codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
hauntsaninja committed Sep 13, 2023
1 parent 52fceb8 commit 39f29ce
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 11 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

This is the changelog for the open source version of tiktoken.

## [v0.5.1]
- Add `encoding_name_for_model`, undo some renames to variables that are implementation details

## [v0.5.0]
- Add `tiktoken._educational` submodule to better document how byte pair encoding works
- Ensure `encoding_for_model` knows about several new models
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tiktoken"
version = "0.5.0"
version = "0.5.1"
edition = "2021"
rust-version = "1.57.0"

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "tiktoken"
version = "0.5.0"
version = "0.5.1"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = {file = "LICENSE"}
Expand Down
2 changes: 2 additions & 0 deletions tiktoken/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# This is the public API of tiktoken
from .core import Encoding as Encoding
from .model import encoding_for_model as encoding_for_model
from .model import encoding_name_for_model as encoding_name_for_model
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names
29 changes: 20 additions & 9 deletions tiktoken/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .registry import get_encoding

# TODO: these will likely be replaced by an API endpoint
_MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
Expand All @@ -16,7 +16,7 @@
"ft:babbage-002": "cl100k_base",
}

_MODEL_TO_ENCODING: dict[str, str] = {
MODEL_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4": "cl100k_base",
"gpt-3.5-turbo": "cl100k_base",
Expand Down Expand Up @@ -64,23 +64,34 @@
}


def encoding_for_model(model_name: str) -> Encoding:
"""Returns the encoding used by a model."""
def encoding_name_for_model(model_name: str) -> str:
    """Returns the name of the encoding used by a model.
    Raises a KeyError if the model name is not recognised.
    """
    # Exact model names take priority over prefix matches.
    try:
        return MODEL_TO_ENCODING[model_name]
    except KeyError:
        pass

    # Check if the model matches a known prefix.
    # Prefix matching avoids needing library updates for every model version release
    # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
    for known_prefix, prefixed_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
        if model_name.startswith(known_prefix):
            return prefixed_encoding_name

    raise KeyError(
        f"Could not automatically map {model_name} to a tokeniser. "
        "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
    ) from None


def encoding_for_model(model_name: str) -> Encoding:
    """Returns the encoding used by a model.
    Raises a KeyError if the model name is not recognised.
    """
    # Resolve the model to an encoding name first, then load that encoding.
    encoding_name = encoding_name_for_model(model_name)
    return get_encoding(encoding_name)

0 comments on commit 39f29ce

Please sign in to comment.