Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,59 @@ jobs:

# Run tests
pytest

# Windows CI job: builds and runs the C++ unit tests with ClangCL, then installs
# the Python package into a fresh conda environment and runs the Python tests.
unittest-windows:
  uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
  with:
    runner: windows.4xlarge
    submodules: 'recursive'
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    script: |
      conda init powershell
      powershell -Command "& {
        Set-PSDebug -Trace 1
        \$ErrorActionPreference = 'Stop'
        \$PSNativeCommandUseErrorActionPreference = \$true

        # Create a symbolic link to work around path length limitations. This gives a much shorter
        # path, as the default checkout directory is deeply nested.
        \$workingDir = \$PWD.Path
        cd \$Env:GITHUB_WORKSPACE
        New-Item -ItemType SymbolicLink -Path tk -Value \$workingDir
        cd tk

        # Run C++ unit tests
        cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test -T ClangCL
        cmake --build build/test -j9 --config Debug
        if (\$LASTEXITCODE -ne 0) {
          Write-Host "Build was not successful. Exit code: \$LASTEXITCODE."
          exit \$LASTEXITCODE
        }

        Push-Location build/test
        ctest
        if (\$LASTEXITCODE -ne 0) {
          Write-Host "Unit tests were not successful. Exit code: \$LASTEXITCODE."
          exit \$LASTEXITCODE
        }
        Pop-Location

        conda create --yes --quiet -n tokenizers python=3.12
        conda activate tokenizers

        # Install tokenizers
        pip install . -v
        if (\$LASTEXITCODE -ne 0) {
          Write-Host "Python installation was unsuccessful. Exit code: \$LASTEXITCODE."
          exit \$LASTEXITCODE
        }

        # Install test dependencies. The version specifier is single-quoted because an
        # unquoted > is parsed by PowerShell as output redirection, which would drop the
        # version constraint and write pip output to a file named =4.53.1.
        pip install pytest blobfile 'transformers>=4.53.1'
        if (\$LASTEXITCODE -ne 0) {
          Write-Host "Python test dependency installation was unsuccessful. Exit code: \$LASTEXITCODE."
          exit \$LASTEXITCODE
        }

        # Run python tests
        pytest
        if (\$LASTEXITCODE -ne 0) {
          Write-Host "Python tests were not successful. Exit code: \$LASTEXITCODE."
          exit \$LASTEXITCODE
        }
      }"
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,7 @@ pip-out/
*~
.~lock.*
*.idea

*.so
*.dylib
*.pyd
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ include(CMakePackageConfigHelpers)
include(Utils.cmake)

# Ignore weak attribute warning
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
endif()

set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
Expand All @@ -49,7 +51,7 @@ endif()

add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
${CMAKE_CURRENT_BINARY_DIR}/sentencepiece-build
${CMAKE_CURRENT_BINARY_DIR}/sp-build
EXCLUDE_FROM_ALL
)

Expand Down
20 changes: 20 additions & 0 deletions include/pytorch/tokenizers/compiler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

/**
* @file
* Compiler and platform-specific declarations.
*/

#if defined(_WIN32)
// The Windows SDK does not declare POSIX ssize_t. <BaseTsd.h> supplies SSIZE_T,
// so expose it under the POSIX name for code shared with other platforms.
#include <BaseTsd.h>
using ssize_t = SSIZE_T;
#endif // defined(_WIN32)
1 change: 1 addition & 0 deletions include/pytorch/tokenizers/tiktoken.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

// Local
#include <pytorch/tokenizers/bpe_tokenizer_base.h>
#include <pytorch/tokenizers/compiler.h>
#include <pytorch/tokenizers/regex.h>
#include <pytorch/tokenizers/result.h>
#include <pytorch/tokenizers/tokenizer.h>
Expand Down
9 changes: 5 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,6 @@ class CMakeBuild(build_ext):
def build_extension(self, ext): # noqa C901
extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))

# Ensure the extension goes into the pytorch_tokenizers package directory
extdir = os.path.join(extdir, "pytorch_tokenizers")

# Required for auto-detection & inclusion of auxiliary "native" libs
if not extdir.endswith(os.path.sep):
extdir += os.path.sep
Expand All @@ -55,6 +52,10 @@ def build_extension(self, ext): # noqa C901
]
build_args = ["--target", "pytorch_tokenizers_cpp"]

# Use Clang for Windows builds.
if sys.platform == "win32":
cmake_args += ["-T ClangCL"]

# Adding CMake arguments set as environment variable
# (needed e.g. to build for ARM OSX on conda-forge)
if "CMAKE_ARGS" in os.environ:
Expand Down Expand Up @@ -132,7 +133,7 @@ def build_extension(self, ext): # noqa C901
long_description_content_type="text/markdown",
url="https://github.com/meta-pytorch/tokenizers",
packages=find_packages(),
ext_modules=[CMakeExtension("pytorch_tokenizers_cpp")],
ext_modules=[CMakeExtension("pytorch_tokenizers.pytorch_tokenizers_cpp")],
cmdclass={"build_ext": CMakeBuild},
zip_safe=False,
python_requires=">=3.10",
Expand Down
4 changes: 2 additions & 2 deletions src/hf_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ Error HFTokenizer::load(const std::string& path) {
std::string model_config_json = "";
if (fs::is_directory(path)) {
const fs::path root(path);
model_json = root / "tokenizer.json";
model_json = (root / "tokenizer.json").string();
if (!fs::exists(model_json)) {
TK_LOG(Info, "no tokenizer.json found in %s", path.c_str());
return Error::LoadFailure;
}
const auto model_config_json_path = root / "tokenizer_config.json";
if (fs::exists(model_config_json_path)) {
model_config_json = model_config_json_path;
model_config_json = model_config_json_path.string();
}
}

Expand Down
8 changes: 8 additions & 0 deletions test/test_tiktoken.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,11 @@ TEST_F(TiktokenTest, ConstructionWithInvalidBOSIndex) {
std::vector<std::string>{"<|end_of_text|>"}),
1,
0),
#if !GTEST_OS_WINDOWS
::testing::KilledBySignal(SIGABRT),
#else
[](int exit_code) { return exit_code != 0; },
#endif
"");
#endif
}
Expand All @@ -139,7 +143,11 @@ TEST_F(TiktokenTest, ConstructionWithInvalidEOSIndex) {
std::vector<std::string>{"<|begin_of_text|>"}),
0,
1),
#if !GTEST_OS_WINDOWS
::testing::KilledBySignal(SIGABRT),
#else
[](int exit_code) { return exit_code != 0; },
#endif
"");
#endif
}
Expand Down