1 change: 1 addition & 0 deletions .cmakelintrc
@@ -0,0 +1 @@
filter=-convention/filename,-linelength,-package/consistency,-readability/logic,+readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs
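For context: in a .cmakelintrc filter list, entries follow the cpplint convention as far as I can tell, with a leading - suppressing a check category and a leading + enabling one, evaluated left to right.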
6 changes: 6 additions & 0 deletions .gitmodules
@@ -1,3 +1,9 @@
[submodule "third-party/sentencepiece"]
path = third-party/sentencepiece
url = https://github.com/google/sentencepiece.git
[submodule "third-party/re2"]
path = third-party/re2
url = https://github.com/google/re2.git
[submodule "third-party/abseil-cpp"]
path = third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
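With the two new submodules registered, a fresh checkout needs git submodule update --init --recursive to fetch re2 and abseil-cpp alongside the existing sentencepiece checkout.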
21 changes: 14 additions & 7 deletions CMakeLists.txt
@@ -28,33 +28,40 @@ set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(third-party/abseil-cpp)
add_subdirectory(third-party/re2)
add_subdirectory(third-party/sentencepiece)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

add_library(tokenizers STATIC src/sentencepiece.cpp)
add_library(tokenizers STATIC src/sentencepiece.cpp src/tiktoken.cpp)

# Using abseil from sentencepiece/third_party
target_include_directories(tokenizers PUBLIC third-party/sentencepiece/src
third-party/sentencepiece include)
target_include_directories(
tokenizers PUBLIC third-party/sentencepiece/src third-party/sentencepiece
include third-party/re2)

target_link_libraries(tokenizers PUBLIC sentencepiece-static)
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)

# Build test
if(TOKENIZERS_BUILD_TEST)
include(FetchContent)
# CMAKE
FetchContent_Declare(
googletest
# Specify the commit you depend on and update it regularly.
URL https://github.com/google/googletest/archive/5376968f6948923e2411081fd9372e71a59d8e77.zip
)
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
set(gtest_force_shared_crt
ON
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/test/resources)
add_executable(sentencepiece_test test/test_sentencepiece.cpp)
target_include_directories(
sentencepiece_test PUBLIC third-party/sentencepiece/src
third-party/sentencepiece include GTEST_INCLUDE_PATH)
sentencepiece_test
PUBLIC third-party/sentencepiece/src third-party/sentencepiece include
GTEST_INCLUDE_PATH)
target_link_libraries(sentencepiece_test PUBLIC tokenizers gtest_main)

# tiktoken tests
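One note on the set(ENV{RESOURCES_PATH} ...) line above: it sets the variable in CMake's own configure-time environment, and the test binary presumably reads it back with std::getenv at run time. A minimal sketch of that lookup, assuming this is how test_sentencepiece.cpp resolves its fixtures (the helper name is illustrative):

#include <cstdlib>
#include <string>

// Resolve the directory holding test fixtures; falls back to the
// in-tree default when the environment variable is not set.
static std::string resources_path() {
  const char *env = std::getenv("RESOURCES_PATH");
  return env ? std::string(env) : std::string("test/resources");
}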
8 changes: 7 additions & 1 deletion include/error.h
@@ -41,14 +41,20 @@ enum class Error : error_code_t {
/// Token out of range.
OutOfRange = 0x03,

/// Artifact load failure.
/// Tokenizer artifact load failure.
LoadFailure = 0x04,

/// Encode failure.
EncodeFailure = 0x05,

/// Base64 decode failure.
Base64DecodeFailure = 0x06,

/// Failed to parse tokenizer artifact.
ParseFailure = 0x07,

/// Decode failure.
DecodeFailure = 0x08,
};

} // namespace tokenizers
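To make the two new codes concrete, here is a minimal sketch of a Result-returning decode path that could surface them; the function and member names are hypothetical, and returning a bare Error from a Result-returning function matches how TK_UNWRAP (in result.h below) propagates failures:

// Hypothetical decode path; decoder_ is an illustrative token-to-string map.
Result<std::string> decode(uint64_t token) const {
  auto it = decoder_.find(token);
  if (it == decoder_.end()) {
    return Error::DecodeFailure;
  }
  return it->second;
}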
53 changes: 53 additions & 0 deletions include/result.h
@@ -177,3 +177,56 @@ template <typename T> T *Result<T>::operator->() {
}

} // namespace tokenizers

/**
* Unwrap a Result to obtain its value. If the Result contains an error,
 * propagate the error via trivial function return.
*
* Note: A function using TK_UNWRAP should itself return a Result or Error.
*
* @param[in] result__ Expression yielding the result to unwrap.
* @param[in] ... Optional format string for the log error message and its
* arguments.
*/
#define TK_UNWRAP(result__, ...) TK_INTERNAL_UNWRAP(result__, ##__VA_ARGS__)

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP(...) \
TK_INTERNAL_UNWRAP_SELECT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) \
(__VA_ARGS__)

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_SELECT(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, \
...) \
TK_INTERNAL_UNWRAP_##N

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_1(result__) \
({ \
auto et_result__ = (result__); \
if (!et_result__.ok()) { \
return et_result__.error(); \
} \
std::move(*et_result__); \
})

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_2(result__, message__, ...) \
({ \
auto et_result__ = (result__); \
if (!et_result__.ok()) { \
TK_LOG(Error, message__, ##__VA_ARGS__); \
return et_result__.error(); \
} \
std::move(*et_result__); \
})

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_3 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_4 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_5 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_6 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_7 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_8 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_9 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_10 TK_INTERNAL_UNWRAP_2
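A usage sketch for TK_UNWRAP, with a hypothetical read_file helper assumed to return Result<std::string>. Note that the macro expands to a GNU statement expression (({ ... })), so it requires GCC or Clang, and the enclosing function must itself return a Result or Error for the early return to type-check:

// Hypothetical caller; on failure the error is logged and propagated.
Result<std::string> load_tokenizer_data(const std::string &path) {
  std::string contents =
      TK_UNWRAP(read_file(path), "Failed to read %s", path.c_str());
  return contents;
}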
65 changes: 43 additions & 22 deletions include/tiktoken.h
@@ -9,6 +9,7 @@
// Tiktoken header
// Used by OpenAI, adapted from https://github.com/sewenew/tokenizer
#include "re2/re2.h"
#include "result.h"
#include "tokenizer.h"
#include <cstdint>

@@ -20,9 +21,24 @@ using Re2UPtr = std::unique_ptr<re2::RE2>;

namespace tokenizers {

static constexpr int32_t kSpecialTokensSize = 256;
static constexpr size_t kBOSTokenIndex = 0;
static constexpr size_t kEOSTokenIndex = 1;

class Tiktoken : public Tokenizer {
public:
explicit Tiktoken();
explicit Tiktoken(std::unique_ptr<std::vector<std::string>> special_tokens,
size_t bos_token_index, size_t eos_token_index)
: _special_tokens(std::move(special_tokens)),
_bos_token_index(bos_token_index), _eos_token_index(eos_token_index) {
assert(_bos_token_index < _special_tokens->size());
assert(_eos_token_index < _special_tokens->size());
};

explicit Tiktoken()
: _special_tokens(_get_default_special_tokens()),
_bos_token_index(kBOSTokenIndex), _eos_token_index(kEOSTokenIndex){};

~Tiktoken() override;

Error load(const std::string &tokenizer_path) override;
@@ -34,37 +50,42 @@ class Tiktoken : public Tokenizer {
uint64_t token) const override;

private:
static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) {
Encoder special_tokens;
special_tokens.emplace("<|begin_of_text|>", num_base_tokens++);
special_tokens.emplace("<|end_of_text|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++);
special_tokens.emplace("<|start_header_id|>", num_base_tokens++);
special_tokens.emplace("<|end_header_id|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++);
special_tokens.emplace("<|eot_id|>", num_base_tokens++);
for (auto i = 5; i < 251; ++i) {
special_tokens.emplace("<|reserved_special_token_" + std::to_string(i) +
"|>",
num_base_tokens++);
static inline std::unique_ptr<std::vector<std::string>>
_get_default_special_tokens() {
auto special_tokens =
std::make_unique<std::vector<std::string>>(std::vector<std::string>{
"<|begin_of_text|>", "<|end_of_text|>",
"<|reserved_special_token_0|>", "<|reserved_special_token_1|>",
"<|finetune_right_pad_id|>", "<|step_id|>", "<|start_header_id|>",
"<|end_header_id|>", "<|eom_id|>", "<|eot_id|>", "<|python_tag|>"});
// pad the rest of the special tokens with reserved tokens
ssize_t reserved_special_token_num = 2;
while (special_tokens->size() < kSpecialTokensSize) {
special_tokens->emplace_back(
"<|reserved_special_token_" +
std::to_string(reserved_special_token_num++) + "|>");
}
return special_tokens;
}

template <typename T>
std::pair<std::optional<std::string>, re2::StringPiece>
_split_with_allowed_special_token(re2::StringPiece &input,
const T &allowed_special);
const T &allowed_special) const;

void _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
uint64_t &last_piece_token_len);
Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
uint64_t &last_piece_token_len) const;

template <typename T>
std::pair<std::vector<uint64_t>, uint64_t>
_encode_with_special_token(const std::string &text, const T &allowed_special);
Result<std::pair<std::vector<uint64_t>, uint64_t>>
_encode_with_special_token(const std::string &text,
const T &allowed_special) const;

Encoder _build_special_token_encoder(ssize_t num_base_tokens) const;

std::unique_ptr<std::vector<std::string>> _special_tokens;
size_t _bos_token_index;
size_t _eos_token_index;

// Removed negative lookahead \s+(?!\S) since it's not supported by RE2.
const std::string _pattern =
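To illustrate the new constructor pair: callers can now supply their own special-token table and BOS/EOS positions instead of the built-in defaults. A sketch with illustrative token strings; the asserts in the constructor only require both indices to be in range:

// Custom table: three special tokens, BOS at index 0, EOS at index 1.
auto special = std::make_unique<std::vector<std::string>>(
    std::vector<std::string>{"<|bos|>", "<|eos|>", "<|pad|>"});
Tiktoken custom(std::move(special), /*bos_token_index=*/0,
                /*eos_token_index=*/1);

// The default constructor keeps the old behavior, now backed by the
// 256-entry padded table from _get_default_special_tokens().
Tiktoken fallback;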