From 4dfe04d974502778b017f05186ab7fcaf4f11f88 Mon Sep 17 00:00:00 2001 From: Nelson Chu Date: Mon, 15 May 2023 14:55:08 +0800 Subject: [PATCH] Added ability to use other source location for bpe file --- tiktoken_ext/openai_public.py | 44 ++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py index 16a6ec50..83980674 100644 --- a/tiktoken_ext/openai_public.py +++ b/tiktoken_ext/openai_public.py @@ -1,4 +1,5 @@ from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe +import os ENDOFTEXT = "<|endoftext|>" FIM_PREFIX = "<|fim_prefix|>" @@ -8,9 +9,17 @@ def gpt2(): + vocab_bpe_file = os.environ.get( + "TIKTOKEN_BPE_FILE_GPT2_VOCAB", + "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", + ) + encoder_json_file = os.environ.get( + "TIKTOKEN_BPE_FILE_GPT2_ENCODER", + "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json", + ) mergeable_ranks = data_gym_to_mergeable_bpe_ranks( - vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", - encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json", + vocab_bpe_file=vocab_bpe_file, + encoder_json_file=encoder_json_file, ) return { "name": "gpt2", @@ -22,9 +31,11 @@ def gpt2(): def r50k_base(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" + tiktoken_bpe_file = os.environ.get( + "TIKTOKEN_BPE_FILE_R50K_BASE", + "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken", ) + mergeable_ranks = load_tiktoken_bpe(tiktoken_bpe_file) return { "name": "r50k_base", "explicit_n_vocab": 50257, @@ -35,9 +46,11 @@ def r50k_base(): def p50k_base(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" + tiktoken_bpe_file = os.environ.get( 
"TIKTOKEN_BPE_FILE_P50K_BASE", + "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", ) + mergeable_ranks = load_tiktoken_bpe(tiktoken_bpe_file) return { "name": "p50k_base", "explicit_n_vocab": 50281, @@ -48,10 +61,17 @@ def p50k_base(): def p50k_edit(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" + tiktoken_bpe_file = os.environ.get( + "TIKTOKEN_BPE_FILE_P50K_BASE", + "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", ) - special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283} + mergeable_ranks = load_tiktoken_bpe(tiktoken_bpe_file) + special_tokens = { + ENDOFTEXT: 50256, + FIM_PREFIX: 50281, + FIM_MIDDLE: 50282, + FIM_SUFFIX: 50283, + } return { "name": "p50k_edit", "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", @@ -61,9 +81,11 @@ def p50k_edit(): def cl100k_base(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" + tiktoken_bpe_file = os.environ.get( + "TIKTOKEN_BPE_FILE_CL100K_BASE", + "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", ) + mergeable_ranks = load_tiktoken_bpe(tiktoken_bpe_file) special_tokens = { ENDOFTEXT: 100257, FIM_PREFIX: 100258,