# Patch summary (explanatory preamble; everything up to the first file header
# below is commentary and not part of any hunk).
#
# MANIFEST.in (new file)
#   Lists what goes into the source distribution: CMakeLists.txt, Utils.cmake
#   and the cmake/ tree, C++ sources and headers under src/ and include/, the
#   third-party/ and examples/ trees, test sources, top-level docs and project
#   files. Excludes *.pyc, *.pyo, .git*, .DS_Store and prunes build/, dist/
#   and *.egg-info.
#   NOTE(review): `global-exclude __pycache__` is likely a no-op because
#   global-exclude matches file names and __pycache__ is a directory; the
#   *.pyc / *.pyo rules appear to cover the intent. Confirm.
#
# setup.py
#   * Imports shutil and setuptools' build_py (aliased build_py_orig).
#   * Adds BuildPy, a build_py subclass whose run() first calls the parent
#     run(), then returns early if ./include does not exist, and otherwise
#     copies every regular file found under include/ (recursively, via
#     shutil.copy2) to <build_lib>/pytorch_tokenizers/include/, creating parent
#     directories as needed.
#     NOTE(review): Path("include") is resolved against the current working
#     directory; presumably the build is always started from the repository
#     root. Confirm.
#     NOTE(review): BuildPy copies all files under include/, whereas
#     package_data only lists *.h and *.hpp. Confirm which is intended.
#   * setup(): version 0.1.0 -> 1.0.1; include_package_data=True; package_data
#     globs for headers under pytorch_tokenizers/include; cmdclass now also
#     registers build_py -> BuildPy; install_requires gains sentencepiece,
#     mistral-common, tokenizers and tiktoken; the Python 3.8 / 3.9 trove
#     classifiers are removed (python_requires is already ">=3.10").
#
# src/re2_regex.cpp
#   In the failure branch of Re2Regex::compile the TK_LOG level changes from
#   Error to Info and the message is reworded to say a fallback regex may make
#   the failure acceptable. The branch still returns Error::RegexFailure.
#
# NOTE(review): this copy of the patch had its line breaks and leading
# whitespace collapsed; they were restored here so that every hunk's line
# counts match its header. Indentation of context lines is reconstructed.
# Verify against the original commit before applying.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..eda677d
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,39 @@
+# Include CMake files
+include CMakeLists.txt
+include Utils.cmake
+recursive-include cmake *
+
+# Include C++ source files
+recursive-include src *.cpp *.h *.hpp
+recursive-include include *.h *.hpp
+
+# Include third-party dependencies
+recursive-include third-party *
+
+# Include test files
+recursive-include test *.py *.cpp *.h
+
+# Include examples
+recursive-include examples *
+
+# Include documentation
+include README.md
+include LICENSE
+include CONTRIBUTING.md
+include CODE_OF_CONDUCT
+
+# Include other necessary files
+include pyproject.toml
+include pytest.ini
+include TARGETS
+include targets.bzl
+
+# Exclude build artifacts and unnecessary files
+global-exclude *.pyc
+global-exclude *.pyo
+global-exclude __pycache__
+global-exclude .git*
+global-exclude .DS_Store
+prune build
+prune dist
+prune *.egg-info
diff --git a/setup.py b/setup.py
index 63ced6c..6b7f4fc 100644
--- a/setup.py
+++ b/setup.py
@@ -10,10 +10,12 @@
 import re
 import subprocess
 import sys
+import shutil
 from pathlib import Path
 
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
+from setuptools.command.build_py import build_py as build_py_orig
 
 # Read the README file
 with open("README.md", "r") as f:
@@ -126,19 +128,52 @@ def build_extension(self, ext):  # noqa C901
         )
 
 
+class BuildPy(build_py_orig):
+    """Ensure header files are copied into the package during build."""
+
+    def run(self):
+        super().run()
+        headers_src = Path("include")
+        if not headers_src.exists():
+            return
+
+        headers_dst = Path(self.build_lib) / "pytorch_tokenizers" / "include"
+        for file_path in headers_src.rglob("*"):
+            if file_path.is_file():
+                destination = headers_dst / file_path.relative_to(headers_src)
+                destination.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy2(file_path, destination)
+
+
 setup(
     name="pytorch-tokenizers",
-    version="0.1.0",
+    version="1.0.1",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/meta-pytorch/tokenizers",
     packages=find_packages(),
+    include_package_data=True,
+    package_data={
+        "pytorch_tokenizers": [
+            "include/*.h",
+            "include/**/*.h",
+            "include/*.hpp",
+            "include/**/*.hpp",
+        ]
+    },
     ext_modules=[CMakeExtension("pytorch_tokenizers.pytorch_tokenizers_cpp")],
-    cmdclass={"build_ext": CMakeBuild},
+    cmdclass={
+        "build_ext": CMakeBuild,
+        "build_py": BuildPy,
+    },
     zip_safe=False,
     python_requires=">=3.10",
     install_requires=[
         "pybind11>=2.6.0",
+        "sentencepiece",
+        "mistral-common",
+        "tokenizers",
+        "tiktoken",
     ],
     setup_requires=[
         "pybind11>=2.6.0",
@@ -150,8 +185,6 @@ def build_extension(self, ext):  # noqa C901
         "License :: OSI Approved :: BSD License",
         "Operating System :: OS Independent",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
         "Programming Language :: Python :: 3.12",
diff --git a/src/re2_regex.cpp b/src/re2_regex.cpp
index c62c5f7..c1269c5 100644
--- a/src/re2_regex.cpp
+++ b/src/re2_regex.cpp
@@ -19,9 +19,10 @@ Error Re2Regex::compile(const std::string& pattern) {
   if (regex_->ok()) {
     return Error::Ok;
   } else {
+    // It should log using Error level but it's too confusing.
     TK_LOG(
-        Error,
-        "Failed to compile regex: %s, error: %s",
+        Info,
+        "Re2 failed to compile regex: %s, error: %s\nThis may be ok if a fallback regex is used.",
         pattern.c_str(),
         regex_->error().c_str());
     return Error::RegexFailure;