diff --git a/Cargo.toml b/Cargo.toml index 5ece1e8..f192aca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,13 +14,13 @@ python = [ ] [dependencies] -pyo3 = { version = "0.26", default-features = false, features = [ +pyo3 = { version = "0.26.0", default-features = false, features = [ "extension-module", "macros", ], optional = true } # tiktoken dependencies -fancy-regex = "0.16" +fancy-regex = "0.13.0" regex = "1.10.3" rustc-hash = "2" bstr = "1.5.0" diff --git a/pyproject.toml b/pyproject.toml index 76fcd11..17c7d28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,6 @@ skip = [ "*-manylinux_i686", "*-musllinux_i686", "*-win32", - "*-musllinux_aarch64", ] macos.archs = ["x86_64", "arm64"] # When cross-compiling on Intel, it is not possible to test arm64 wheels. diff --git a/src/py.rs b/src/py.rs index ee8c52d..60b6245 100644 --- a/src/py.rs +++ b/src/py.rs @@ -28,7 +28,7 @@ impl CoreBPE { #[pyo3(name = "encode_ordinary")] fn py_encode_ordinary(&self, py: Python, text: &str) -> Vec { - py.allow_threads(|| self.encode_ordinary(text)) + py.detach(|| self.encode_ordinary(text)) } #[pyo3(name = "encode")] @@ -38,7 +38,7 @@ impl CoreBPE { text: &str, allowed_special: HashSet, ) -> PyResult> { - py.allow_threads(|| { + py.detach(|| { let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_ref()).collect(); match self.encode(text, &allowed_special) { @@ -54,7 +54,7 @@ impl CoreBPE { text: &str, allowed_special: HashSet, ) -> PyResult> { - let tokens_res = py.allow_threads(|| { + let tokens_res = py.detach(|| { let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_ref()).collect(); self.encode(text, &allowed_special) @@ -70,7 +70,7 @@ impl CoreBPE { } fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec { - py.allow_threads(|| { + py.detach(|| { match std::str::from_utf8(bytes) { // Straightforward case Ok(text) => self.encode_ordinary(text), @@ -121,7 +121,7 @@ impl CoreBPE { text: &str, allowed_special: HashSet, ) -> PyResult<(Vec, Py)> { - let (tokens, completions): (Vec, HashSet>) = py.allow_threads(|| { + let (tokens, completions): (Vec, HashSet>) = py.detach(|| { let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_ref()).collect(); self._encode_unstable_native(text, &allowed_special) @@ -155,7 +155,7 @@ impl CoreBPE { #[pyo3(name = "decode_bytes")] fn py_decode_bytes(&self, py: Python, tokens: Vec) -> Result, PyErr> { - match py.allow_threads(|| self.decode_bytes(&tokens)) { + match py.detach(|| self.decode_bytes(&tokens)) { Ok(bytes) => Ok(PyBytes::new(py, &bytes).into()), Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}", e))), } diff --git a/tiktoken/core.py b/tiktoken/core.py index 42c426c..225fffb 100644 --- a/tiktoken/core.py +++ b/tiktoken/core.py @@ -4,11 +4,11 @@ from concurrent.futures import ThreadPoolExecutor from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, Sequence -import regex - from tiktoken import _tiktoken if TYPE_CHECKING: + import re + import numpy as np import numpy.typing as npt @@ -391,6 +391,9 @@ def _encode_single_piece(self, text_or_bytes: str | bytes) -> list[int]: def _encode_only_native_bpe(self, text: str) -> list[int]: """Encodes a string into tokens, but do regex splitting in Python.""" + # We need specifically `regex` in order to compile pat_str due to e.g. \p + import regex + _unused_pat = regex.compile(self._pat_str) ret = [] for piece in regex.findall(_unused_pat, text): @@ -423,9 +426,13 @@ def __setstate__(self, value: object) -> None: @functools.lru_cache(maxsize=128) -def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]": - inner = "|".join(regex.escape(token) for token in tokens) - return regex.compile(f"({inner})") +def _special_token_regex(tokens: frozenset[str]) -> re.Pattern[str]: + try: + import regex as re + except ImportError: + import re + inner = "|".join(re.escape(token) for token in tokens) + return re.compile(f"({inner})") def raise_disallowed_special_token(token: str) -> NoReturn: diff --git a/tiktoken/load.py b/tiktoken/load.py index 9a5cc87..dc2eba6 100644 --- a/tiktoken/load.py +++ b/tiktoken/load.py @@ -6,22 +6,26 @@ def read_file(blobpath: str) -> bytes: - if not blobpath.startswith("http://") and not blobpath.startswith("https://"): - try: - import blobfile - except ImportError as e: - raise ImportError( - "blobfile is not installed. Please install it by running `pip install blobfile`." - ) from e - with blobfile.BlobFile(blobpath, "rb") as f: + if "://" not in blobpath: + with open(blobpath, "rb", buffering=0) as f: return f.read() - # avoiding blobfile for public files helps avoid auth issues, like MFA prompts. - import requests + if blobpath.startswith(("http://", "https://")): + # avoiding blobfile for public files helps avoid auth issues, like MFA prompts. + import requests + + resp = requests.get(blobpath) + resp.raise_for_status() + return resp.content - resp = requests.get(blobpath) - resp.raise_for_status() - return resp.content + try: + import blobfile + except ImportError as e: + raise ImportError( + "blobfile is not installed. Please install it by running `pip install blobfile`." + ) from e + with blobfile.BlobFile(blobpath, "rb") as f: + return f.read() def check_hash(data: bytes, expected_hash: str) -> bool: @@ -49,7 +53,7 @@ def read_file_cached(blobpath: str, expected_hash: str | None = None) -> bytes: cache_path = os.path.join(cache_dir, cache_key) if os.path.exists(cache_path): - with open(cache_path, "rb") as f: + with open(cache_path, "rb", buffering=0) as f: data = f.read() if expected_hash is None or check_hash(data, expected_hash): return data