Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Include CMake files
include CMakeLists.txt
include Utils.cmake
recursive-include cmake *

# Include C++ source files
recursive-include src *.cpp *.h *.hpp
recursive-include include *.h *.hpp

# Include third-party dependencies
recursive-include third-party *

# Include test files
recursive-include test *.py *.cpp *.h

# Include examples
recursive-include examples *

# Include documentation
include README.md
include LICENSE
include CONTRIBUTING.md
include CODE_OF_CONDUCT

# Include other necessary files
include pyproject.toml
include pytest.ini
include TARGETS
include targets.bzl

# Exclude build artifacts and unnecessary files
global-exclude *.pyc
global-exclude *.pyo
global-exclude __pycache__
global-exclude .git*
global-exclude .DS_Store
prune build
prune dist
prune *.egg-info
41 changes: 37 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
import re
import subprocess
import sys
import shutil
from pathlib import Path

from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
from setuptools.command.build_py import build_py as build_py_orig

# Read the README file
with open("README.md", "r") as f:
Expand Down Expand Up @@ -126,19 +128,52 @@ def build_extension(self, ext): # noqa C901
)


class BuildPy(build_py_orig):
    """``build_py`` step that also mirrors the ``include`` tree into the package.

    After the regular ``build_py`` work finishes, every regular file found
    under the ``include`` directory (resolved relative to the current working
    directory) is copied to ``<build_lib>/pytorch_tokenizers/include``, keeping
    the same relative sub-directory layout.
    """

    def run(self):
        """Run the standard build, then copy the files under ``include``."""
        # Let the stock build_py implementation do its work first.
        super().run()

        source_root = Path("include")
        if source_root.exists():
            target_root = Path(self.build_lib) / "pytorch_tokenizers" / "include"
            for entry in source_root.rglob("*"):
                # rglob also yields directories; only files are copied.
                if not entry.is_file():
                    continue
                copied_to = target_root / entry.relative_to(source_root)
                # Create intermediate directories on demand so nested files land
                # at the same relative location as in the source tree.
                copied_to.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(entry, copied_to)


setup(
name="pytorch-tokenizers",
version="0.1.0",
version="1.0.1",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/meta-pytorch/tokenizers",
packages=find_packages(),
include_package_data=True,
package_data={
"pytorch_tokenizers": [
"include/*.h",
"include/**/*.h",
"include/*.hpp",
"include/**/*.hpp",
]
},
ext_modules=[CMakeExtension("pytorch_tokenizers.pytorch_tokenizers_cpp")],
cmdclass={"build_ext": CMakeBuild},
cmdclass={
"build_ext": CMakeBuild,
"build_py": BuildPy,
},
zip_safe=False,
python_requires=">=3.10",
install_requires=[
"pybind11>=2.6.0",
"sentencepiece",
"mistral-common",
"tokenizers",
"tiktoken",
],
setup_requires=[
"pybind11>=2.6.0",
Expand All @@ -150,8 +185,6 @@ def build_extension(self, ext): # noqa C901
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
Expand Down
5 changes: 3 additions & 2 deletions src/re2_regex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ Error Re2Regex::compile(const std::string& pattern) {
if (regex_->ok()) {
return Error::Ok;
} else {
// Strictly speaking this is an error, but logging it at Error level is
// confusing when a fallback regex may still handle the pattern, so use Info.
TK_LOG(
Error,
"Failed to compile regex: %s, error: %s",
Info,
"Re2 failed to compile regex: %s, error: %s\nThis may be ok if a fallback regex is used.",
pattern.c_str(),
regex_->error().c_str());
return Error::RegexFailure;
Expand Down