Skip to content

Commit

Permalink
feat: support o200k_base
Browse files Browse the repository at this point in the history
  • Loading branch information
WqyJh committed May 15, 2024
1 parent 475cdcd commit 175c72f
Show file tree
Hide file tree
Showing 11 changed files with 127 additions and 32 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,10 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
# Available Models
| Model name | OpenAI models |
| ---------------------------- | ------------- |
| gpt-4o-* | o200k_base |
| gpt-4-* | cl100k_base |
| gpt-3.5-turbo-* | cl100k_base |
| gpt-4o | o200k_base |
| gpt-4 | cl100k_base |
| gpt-3.5-turbo | cl100k_base |
| text-davinci-003 | p50k_base |
Expand Down Expand Up @@ -252,5 +254,14 @@ Or maybe my benchmark method is not appropriate.

If you have better benchmark method or if you want add your benchmark result, please feel free to submit a PR.

The new `o200k_base` encoding appears to be slower than `cl100k_base`; tiktoken-go is slightly slower than tiktoken in the following benchmark.

| name | encoding | time/op | os | cpu | text | times |
| ----------- | ------- | ------- | ---------- | -------- | -------------------------------- | ------ |
| tiktoken-go | o200k_base | 108522 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken | o200k_base | 70198 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken-go | cl100k_base | 94502 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken | cl100k_base | 54642 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |

# License
[MIT](./LICENSE)
9 changes: 9 additions & 0 deletions README_zh-hans.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,5 +235,14 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string

如果你有更好的测试方法,或者说你想添加在你机器上的测试结果,欢迎提PR。

新的 `o200k_base` 编码看起来比 `cl100k_base` 慢。在以下硬件上,tiktoken-go 比 tiktoken 略慢。

| name | encoding | time/op | os | cpu | text | times |
| ----------- | ------- | ------- | ---------- | -------- | -------------------------------- | ------ |
| tiktoken-go | o200k_base | 108522 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken | o200k_base | 70198 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken-go | cl100k_base | 94502 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
| tiktoken | cl100k_base | 54642 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |

# License
[MIT](./LICENSE)
2 changes: 1 addition & 1 deletion benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func BenchmarkEncoding(b *testing.B) {
panic(err)
}

tkm, err := EncodingForModel("gpt-4")
tkm, err := EncodingForModel("gpt-4o")
if err != nil {
panic(err)
}
Expand Down
28 changes: 28 additions & 0 deletions doc/test_result.md

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package tiktoken

import (
"errors"
"strings"
"sync"
)

Expand All @@ -12,6 +13,7 @@ const FIM_SUFFIX string = "<|fim_suffix|>"
const ENDOFPROMPT string = "<|endofprompt|>"

const (
MODEL_O200K_BASE string = "o200k_base"
MODEL_CL100K_BASE string = "cl100k_base"
MODEL_P50K_BASE string = "p50k_base"
MODEL_P50K_EDIT string = "p50k_edit"
Expand All @@ -20,6 +22,7 @@ const (

var MODEL_TO_ENCODING = map[string]string{
// chat
"gpt-4o": MODEL_O200K_BASE,
"gpt-4": MODEL_CL100K_BASE,
"gpt-3.5-turbo": MODEL_CL100K_BASE,
// text
Expand Down Expand Up @@ -62,6 +65,7 @@ var MODEL_TO_ENCODING = map[string]string{

var MODEL_PREFIX_TO_ENCODING = map[string]string{
// chat
"gpt-4o-": MODEL_O200K_BASE, // e.g., gpt-4o-2024-05-13, etc.
"gpt-4-": MODEL_CL100K_BASE, // e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": MODEL_CL100K_BASE, // e.g, gpt-3.5-turbo-0301, -0401, etc.
}
Expand Down Expand Up @@ -98,6 +102,8 @@ func getEncoding(encodingName string) (*Encoding, error) {

func initEncoding(encodingName string) (*Encoding, error) {
switch encodingName {
case MODEL_O200K_BASE:
return o200k_base()
case MODEL_CL100K_BASE:
return cl100k_base()
case MODEL_P50K_BASE:
Expand All @@ -111,6 +117,32 @@ func initEncoding(encodingName string) (*Encoding, error) {
}
}

// o200k_base builds the Encoding used by gpt-4o models.
// The BPE ranks are loaded by bpeLoader from OpenAI's public blob storage
// (or whatever cache the loader implements).
func o200k_base() (*Encoding, error) {
ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken")
if err != nil {
return nil, err
}
// Special-token ids mirror upstream tiktoken's o200k_base definition.
special_tokens := map[string]int{
ENDOFTEXT: 199999,
ENDOFPROMPT: 200018,
}
// Pre-tokenizer split pattern, ported verbatim from tiktoken's o200k_base.
// The alternatives cover: cased word pieces with optional contractions,
// 1-3 digit runs, punctuation runs, newline runs, and trailing whitespace.
// NOTE(review): the pattern uses `(?i:...)` and the look-ahead `\s+(?!\S)`,
// which the stdlib regexp package does not support — presumably compiled
// elsewhere with a look-ahead-capable engine; verify against the tokenizer.
pats := []string{
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
`\p{N}{1,3}`,
` ?[^\s\p{L}\p{N}]+[\r\n/]*`,
`\s*[\r\n]+`,
`\s+(?!\S)`,
`\s+`,
}
return &Encoding{
Name: MODEL_O200K_BASE,
PatStr: strings.Join(pats, "|"),
MergeableRanks: ranks,
SpecialTokens: special_tokens,
}, nil
}

func cl100k_base() (*Encoding, error) {
ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken")
if err != nil {
Expand Down
18 changes: 8 additions & 10 deletions test/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import tiktoken as tk
import requests
import time

def benchmark_test(text_list,enc):
Expand All @@ -8,16 +7,15 @@ def benchmark_test(text_list,enc):
:return: None
"""
start = time.perf_counter_ns()
for index in range(100000):
text = text_list[index]
n = 100000
for i in range(n):
text = text_list[i % len(text_list)]
num_tokens = len(enc.encode(text))
end = time.perf_counter_ns()
print('benchmark test: {} ns/op'.format((end - start)/100000))
print('benchmark test: {} ns/op'.format((end - start)/n))

if __name__ == '__main__':
r = requests.get('https://unicode.org/udhr/assemblies/full_all.txt')
text_list = r.text.splitlines()
cursor = 0
enc=tk.get_encoding('cl100k_base')
benchmark_test(text_list,enc)

with open('/tmp/udhr.txt','r') as f:
text_list = f.readlines()
enc=tk.get_encoding('o200k_base')
benchmark_test(text_list,enc)
20 changes: 6 additions & 14 deletions test/benchmark_test.go
Original file line number Diff line number Diff line change
@@ -1,32 +1,24 @@
package main

import (
"io"
"log"
"net/http"
"os"
"strings"
"testing"

"github.com/pkoukk/tiktoken-go"
)

func BenchmarkEncodingInFullLanguage(b *testing.B) {
// Universal Declaration of Human Rights in all languages
url := "https://unicode.org/udhr/assemblies/full_all.txt"
response, err := http.Get(url)
if err != nil {
log.Fatal(err)
}
defer response.Body.Close()
// go test -benchmem -run=^$ -bench ^BenchmarkEncodingInFullLanguage$ -benchtime=100000x github.com/pkoukk/tiktoken-go/test

responseData, err := io.ReadAll(response.Body)
func BenchmarkEncodingInFullLanguage(b *testing.B) {
data, err := os.ReadFile("/tmp/udhr.txt")
if err != nil {
log.Fatal(err)
}

responseString := string(responseData)
lines := strings.Split(responseString, "\n")
tkm, err := tiktoken.EncodingForModel("gpt-4")
lines := strings.Split(string(data), "\n")
tkm, err := tiktoken.EncodingForModel("gpt-4o")
lineCount := len(lines)
if err != nil {
log.Fatal(err)
Expand Down
27 changes: 27 additions & 0 deletions test/get_udhr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import tarfile
import urllib.request

# Universal Declaration of Human Rights corpus used by the benchmarks.
url = "http://research.ics.aalto.fi/cog/data/udhr/udhr_txt_20100325.tar.gz"
file_name = "/tmp/udhr_txt_20100325.tar.gz"

def download_file(url, file_name):
    """Download url and save it to file_name."""
    urllib.request.urlretrieve(url, file_name)

def merge_files(source_dir, output_file):
    """Concatenate every regular file in source_dir into output_file.

    Files are visited in sorted name order so the merged corpus is
    deterministic across runs and machines (os.listdir order is arbitrary).
    Each file is followed by a newline separator.
    """
    with open(output_file, 'wb') as outfile:
        for filename in sorted(os.listdir(source_dir)):
            path = os.path.join(source_dir, filename)
            if os.path.isfile(path):
                with open(path, 'rb') as infile:
                    outfile.write(infile.read())
                outfile.write(b'\n')

def untar(dest, file_name):
    """Extract the gzipped tarball file_name into directory dest."""
    with tarfile.open(file_name, "r:gz") as tar:
        tar.extractall(path=dest)

# Skip the download when the archive is already cached locally; previously
# the download call was commented out entirely, which broke fresh setups.
if not os.path.exists(file_name):
    download_file(url, file_name)

untar('/tmp', file_name)

merge_files('/tmp/udhr/txt', '/tmp/udhr.txt')
4 changes: 2 additions & 2 deletions test/test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
hallo world!,你好世界!,こんにちは世界!,안녕하세요 세계!,Привет мир!,¡Hola mundo!,Hallo Welt!,Bonjour le monde!,Ciao mondo!,Hej världen!,Hallo wereld!,Hallo verden!,Hallo wereld!,Hallo verden!
gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001
cl100k_base,p50k_base,r50k_base
gpt-4o,gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001
o200k_base,cl100k_base,p50k_base,r50k_base
4 changes: 2 additions & 2 deletions test/token_num.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func getTokenByEncoding(text string, encoding string) (num_tokens int) {
// testTokenByModel prints the token count of every (text, model) pair.
func testTokenByModel(textList []string, modelList []string) {
	for _, text := range textList {
		for _, model := range modelList {
			fmt.Printf("text: %s, model: %s, token: %d\n", text, model, getTokenByModel(text, model))
		}
	}
}
Expand All @@ -83,7 +83,7 @@ func testTokenByModel(textList []string, modelList []string) {
// testTokenByEncoding prints the token count of every (text, encoding) pair.
func testTokenByEncoding(textList []string, encodingList []string) {
	for _, text := range textList {
		for _, encoding := range encodingList {
			fmt.Printf("text: %s, encoding: %s, token: %d\n", text, encoding, getTokenByEncoding(text, encoding))
		}
	}
}
4 changes: 1 addition & 3 deletions test/token_num.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,5 @@ def test_token_by_encoding(text_list, encoding_list):
if __name__ == '__main__':
text_list, model_list, encoding_list = read_data_from_file('test/test.txt')
test_token_by_model(text_list, model_list)
print("=====================================")
print("=========================================")
test_token_by_encoding(text_list, encoding_list)


0 comments on commit 175c72f

Please sign in to comment.