Skip to content

Commit

Permalink
Sync codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
hauntsaninja committed May 13, 2024
1 parent 1b9faf2 commit 9d01e56
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

This is the changelog for the open source version of tiktoken.

## [v0.7.0]
- Support for `gpt-4o`
- Performance improvements

## [v0.6.0]
- Optimise regular expressions for a 20% performance improvement, thanks to @paplorinc!
- Add `text-embedding-3-*` models to `encoding_for_model`
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tiktoken"
version = "0.6.0"
version = "0.7.0"
edition = "2021"
rust-version = "1.57.0"

Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "tiktoken"
version = "0.6.0"
version = "0.7.0"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = {file = "LICENSE"}
Expand Down Expand Up @@ -42,4 +42,3 @@ test-command = "pytest {project}/tests --import-mode=append"
[[tool.cibuildwheel.overrides]]
select = "*linux_aarch64"
test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'"""

2 changes: 2 additions & 0 deletions tiktoken/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# TODO: these will likely be replaced by an API endpoint
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
Expand All @@ -18,6 +19,7 @@

MODEL_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4o": "o200k_base",
"gpt-4": "cl100k_base",
"gpt-3.5-turbo": "cl100k_base",
"gpt-3.5": "cl100k_base", # Common shorthand
Expand Down
30 changes: 30 additions & 0 deletions tiktoken_ext/openai_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,40 @@ def cl100k_base():
}


def o200k_base():
    """Build the definition of the ``o200k_base`` encoding.

    Returns:
        A dict with four entries: ``name`` (the encoding name), ``pat_str``
        (the regex used to split text into pieces), ``mergeable_ranks`` (the
        value returned by ``load_tiktoken_bpe`` for the published
        ``o200k_base.tiktoken`` file) and ``special_tokens`` (special token
        string -> token id).
    """
    # The expected hash is handed to the loader together with the URL.
    bpe_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )

    # This regex could be made more efficient
    # The alternatives are joined with "|" in exactly this order.
    split_alternatives = (
        # Word ending in lowercase-ish letters, with an optional leading
        # non-letter/non-digit character and an optional contraction suffix.
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        # Same shape, but requiring at least one uppercase-ish letter first.
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        # Runs of one to three numeric characters.
        r"""\p{N}{1,3}""",
        # Punctuation/symbol runs, optionally preceded by one space and
        # followed by newlines or slashes.
        r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
        # Whitespace ending in one or more newline characters.
        r"""\s*[\r\n]+""",
        # Whitespace not followed by a non-whitespace character.
        r"""\s+(?!\S)""",
        # Any remaining whitespace.
        r"""\s+""",
    )

    return {
        "name": "o200k_base",
        "pat_str": "|".join(split_alternatives),
        "mergeable_ranks": bpe_ranks,
        "special_tokens": {
            ENDOFTEXT: 199999,
            ENDOFPROMPT: 200018,
        },
    }


# Registry mapping each public encoding name to the function that builds its
# definition. Each key matches the name of the function it points to; for
# example, "o200k_base" maps to o200k_base(), which returns a dict with
# "name", "pat_str", "mergeable_ranks" and "special_tokens".
# NOTE(review): presumably tiktoken's plugin registry looks encodings up in
# this dict by name -- confirm against the consumer of ENCODING_CONSTRUCTORS.
ENCODING_CONSTRUCTORS = {
"gpt2": gpt2,
"r50k_base": r50k_base,
"p50k_base": p50k_base,
"p50k_edit": p50k_edit,
"cl100k_base": cl100k_base,
"o200k_base": o200k_base,
}

0 comments on commit 9d01e56

Please sign in to comment.