Skip to content

Commit

Permalink
feat: support o200k_base
Browse files Browse the repository at this point in the history
  • Loading branch information
WqyJh committed May 15, 2024
1 parent 475cdcd commit 175c72f
Show file tree
Hide file tree
Showing 11 changed files with 127 additions and 32 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,10 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
# Available Models
| Model name | OpenAI models |
| ---------------------------- | ------------- |
| gpt-4o-* | o200k_base |
| gpt-4-* | cl100k_base |
| gpt-3.5-turbo-* | cl100k_base |
| gpt-4o | o200k_base |
| gpt-4 | cl100k_base |
| gpt-3.5-turbo | cl100k_base |
| text-davinci-003 | p50k_base |
Expand Down Expand Up @@ -252,5 +254,14 @@ Or maybe my benchmark method is not appropriate.

If you have better benchmark method or if you want add your benchmark result, please feel free to submit a PR.

The new `o200k_base` encoding appears to be slower than `cl100k_base`; tiktoken-go is slightly slower than tiktoken in the following benchmark.

| name | encoding | time/op | os | cpu | text | times |
| ----------- | ------- | ------- | ---------- | -------- | -------------------------------- | ------ |
| tiktoken-go | o200k_base | 108522 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken | o200k_base | 70198 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken-go | cl100k_base | 94502 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken | cl100k_base | 54642 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |

# License
[MIT](./LICENSE)
9 changes: 9 additions & 0 deletions README_zh-hans.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,5 +235,14 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string

如果你有更好的测试方法,或者说你想添加在你机器上的测试结果,欢迎提PR。

新的 `o200k_base` 编码看起来比 `cl100k_base` 慢。在以下硬件上,tiktoken-go 比 tiktoken 略慢。

| name | encoding | time/op | os | cpu | text | times |
| ----------- | ------- | ------- | ---------- | -------- | -------------------------------- | ------ |
| tiktoken-go | o200k_base | 108522 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken | o200k_base | 70198 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken-go | cl100k_base | 94502 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken | cl100k_base | 54642 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |

# License
[MIT](./LICENSE)
2 changes: 1 addition & 1 deletion benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func BenchmarkEncoding(b *testing.B) {
panic(err)
}

tkm, err := EncodingForModel("gpt-4")
tkm, err := EncodingForModel("gpt-4o")
if err != nil {
panic(err)
}
Expand Down
28 changes: 28 additions & 0 deletions doc/test_result.md

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package tiktoken

import (
"errors"
"strings"
"sync"
)

Expand All @@ -12,6 +13,7 @@ const FIM_SUFFIX string = "<|fim_suffix|>"
const ENDOFPROMPT string = "<|endofprompt|>"

const (
MODEL_O200K_BASE string = "o200k_base"
MODEL_CL100K_BASE string = "cl100k_base"
MODEL_P50K_BASE string = "p50k_base"
MODEL_P50K_EDIT string = "p50k_edit"
Expand All @@ -20,6 +22,7 @@ const (

var MODEL_TO_ENCODING = map[string]string{
// chat
"gpt-4o": MODEL_O200K_BASE,
"gpt-4": MODEL_CL100K_BASE,
"gpt-3.5-turbo": MODEL_CL100K_BASE,
// text
Expand Down Expand Up @@ -62,6 +65,7 @@ var MODEL_TO_ENCODING = map[string]string{

var MODEL_PREFIX_TO_ENCODING = map[string]string{
// chat
"gpt-4o-": MODEL_O200K_BASE, // e.g., gpt-4o-2024-05-13, etc.
"gpt-4-": MODEL_CL100K_BASE, // e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": MODEL_CL100K_BASE, // e.g, gpt-3.5-turbo-0301, -0401, etc.
}
Expand Down Expand Up @@ -98,6 +102,8 @@ func getEncoding(encodingName string) (*Encoding, error) {

func initEncoding(encodingName string) (*Encoding, error) {
switch encodingName {
case MODEL_O200K_BASE:
return o200k_base()
case MODEL_CL100K_BASE:
return cl100k_base()
case MODEL_P50K_BASE:
Expand All @@ -111,6 +117,32 @@ func initEncoding(encodingName string) (*Encoding, error) {
}
}

// o200k_base builds the Encoding used by gpt-4o models.
// The BPE ranks are loaded by bpeLoader from OpenAI's public blob storage
// (or whatever cache the loader implements).
func o200k_base() (*Encoding, error) {
ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken")
if err != nil {
return nil, err
}
// Special-token ids mirror upstream tiktoken's o200k_base definition.
special_tokens := map[string]int{
ENDOFTEXT: 199999,
ENDOFPROMPT: 200018,
}
// Pre-tokenizer split pattern, ported verbatim from tiktoken's o200k_base.
// The alternatives cover: cased word pieces with optional contractions,
// 1-3 digit runs, punctuation runs, newline runs, and trailing whitespace.
// NOTE(review): the pattern uses `(?i:...)` and the look-ahead `\s+(?!\S)`,
// which the stdlib regexp package does not support — presumably compiled
// elsewhere with a look-ahead-capable engine; verify against the tokenizer.
pats := []string{
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
`\p{N}{1,3}`,
` ?[^\s\p{L}\p{N}]+[\r\n/]*`,
`\s*[\r\n]+`,
`\s+(?!\S)`,
`\s+`,
}
return &Encoding{
Name: MODEL_O200K_BASE,
PatStr: strings.Join(pats, "|"),
MergeableRanks: ranks,
SpecialTokens: special_tokens,
}, nil
}

func cl100k_base() (*Encoding, error) {
ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken")
if err != nil {
Expand Down
18 changes: 8 additions & 10 deletions test/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import tiktoken as tk
import requests
import time

def benchmark_test(text_list,enc):
Expand All @@ -8,16 +7,15 @@ def benchmark_test(text_list,enc):
:return: None
"""
start = time.perf_counter_ns()
for index in range(100000):
text = text_list[index]
n = 100000
for i in range(n):
text = text_list[i % len(text_list)]
num_tokens = len(enc.encode(text))
end = time.perf_counter_ns()
print('benchmark test: {} ns/op'.format((end - start)/100000))
print('benchmark test: {} ns/op'.format((end - start)/n))

if __name__ == '__main__':
r = requests.get('https://unicode.org/udhr/assemblies/full_all.txt')
text_list = r.text.splitlines()
cursor = 0
enc=tk.get_encoding('cl100k_base')
benchmark_test(text_list,enc)

with open('/tmp/udhr.txt','r') as f:
text_list = f.readlines()
enc=tk.get_encoding('o200k_base')
benchmark_test(text_list,enc)
20 changes: 6 additions & 14 deletions test/benchmark_test.go
Original file line number Diff line number Diff line change
@@ -1,32 +1,24 @@
package main

import (
"io"
"log"
"net/http"
"os"
"strings"
"testing"

"github.com/pkoukk/tiktoken-go"
)

func BenchmarkEncodingInFullLanguage(b *testing.B) {
// Universal Declaration of Human Rights in all languages
url := "https://unicode.org/udhr/assemblies/full_all.txt"
response, err := http.Get(url)
if err != nil {
log.Fatal(err)
}
defer response.Body.Close()
// go test -benchmem -run=^$ -bench ^BenchmarkEncodingInFullLanguage$ -benchtime=100000x github.com/pkoukk/tiktoken-go/test

responseData, err := io.ReadAll(response.Body)
func BenchmarkEncodingInFullLanguage(b *testing.B) {
data, err := os.ReadFile("/tmp/udhr.txt")
if err != nil {
log.Fatal(err)
}

responseString := string(responseData)
lines := strings.Split(responseString, "\n")
tkm, err := tiktoken.EncodingForModel("gpt-4")
lines := strings.Split(string(data), "\n")
tkm, err := tiktoken.EncodingForModel("gpt-4o")
lineCount := len(lines)
if err != nil {
log.Fatal(err)
Expand Down
27 changes: 27 additions & 0 deletions test/get_udhr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import tarfile
import urllib.request

# Universal Declaration of Human Rights corpus used by the benchmarks.
url = "http://research.ics.aalto.fi/cog/data/udhr/udhr_txt_20100325.tar.gz"
file_name = "/tmp/udhr_txt_20100325.tar.gz"

def download_file(url, file_name):
    """Download url and save it to file_name."""
    urllib.request.urlretrieve(url, file_name)

def merge_files(source_dir, output_file):
    """Concatenate every regular file in source_dir into output_file.

    Files are visited in sorted name order so the merged corpus is
    deterministic across runs and machines (os.listdir order is arbitrary).
    Each file is followed by a newline separator.
    """
    with open(output_file, 'wb') as outfile:
        for filename in sorted(os.listdir(source_dir)):
            path = os.path.join(source_dir, filename)
            if os.path.isfile(path):
                with open(path, 'rb') as infile:
                    outfile.write(infile.read())
                outfile.write(b'\n')

def untar(dest, file_name):
    """Extract the gzipped tarball file_name into directory dest."""
    with tarfile.open(file_name, "r:gz") as tar:
        tar.extractall(path=dest)

# Skip the download when the archive is already cached locally; previously
# the download call was commented out entirely, which broke fresh setups.
if not os.path.exists(file_name):
    download_file(url, file_name)

untar('/tmp', file_name)

merge_files('/tmp/udhr/txt', '/tmp/udhr.txt')
4 changes: 2 additions & 2 deletions test/test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
hallo world!,你好世界!,こんにちは世界!,안녕하세요 세계!,Привет мир!,¡Hola mundo!,Hallo Welt!,Bonjour le monde!,Ciao mondo!,Hej världen!,Hallo wereld!,Hallo verden!,Hallo wereld!,Hallo verden!
gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001
cl100k_base,p50k_base,r50k_base
gpt-4o,gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001
o200k_base,cl100k_base,p50k_base,r50k_base
4 changes: 2 additions & 2 deletions test/token_num.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func getTokenByEncoding(text string, encoding string) (num_tokens int) {
// testTokenByModel prints the token count of every (text, model) pair.
func testTokenByModel(textList []string, modelList []string) {
	for _, text := range textList {
		for _, model := range modelList {
			fmt.Printf("text: %s, model: %s, token: %d\n", text, model, getTokenByModel(text, model))
		}
	}
}
Expand All @@ -83,7 +83,7 @@ func testTokenByModel(textList []string, modelList []string) {
// testTokenByEncoding prints the token count of every (text, encoding) pair.
func testTokenByEncoding(textList []string, encodingList []string) {
	for _, text := range textList {
		for _, encoding := range encodingList {
			fmt.Printf("text: %s, encoding: %s, token: %d\n", text, encoding, getTokenByEncoding(text, encoding))
		}
	}
}
4 changes: 1 addition & 3 deletions test/token_num.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,5 @@ def test_token_by_encoding(text_list, encoding_list):
if __name__ == '__main__':
text_list, model_list, encoding_list = read_data_from_file('test/test.txt')
test_token_by_model(text_list, model_list)
print("=====================================")
print("=========================================")
test_token_by_encoding(text_list, encoding_list)


0 comments on commit 175c72f

Please sign in to comment.