1 change: 1 addition & 0 deletions .cmakelintrc
@@ -0,0 +1 @@
filter=-convention/filename,-linelength,-package/consistency,-readability/logic,+readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs
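For context: in a .cmakelintrc filter list, entries follow the cpplint convention as far as I can tell, with a leading - suppressing a check category and a leading + enabling one, evaluated left to right.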
6 changes: 6 additions & 0 deletions .gitmodules
@@ -1,3 +1,9 @@
[submodule "third-party/sentencepiece"]
path = third-party/sentencepiece
url = https://github.com/google/sentencepiece.git
[submodule "third-party/re2"]
path = third-party/re2
url = https://github.com/google/re2.git
[submodule "third-party/abseil-cpp"]
path = third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
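With the two new submodules registered, a fresh checkout needs git submodule update --init --recursive to fetch re2 and abseil-cpp alongside the existing sentencepiece checkout.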
21 changes: 14 additions & 7 deletions CMakeLists.txt
@@ -28,33 +28,40 @@ set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(third-party/abseil-cpp)
add_subdirectory(third-party/re2)
add_subdirectory(third-party/sentencepiece)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

add_library(tokenizers STATIC src/sentencepiece.cpp)
add_library(tokenizers STATIC src/sentencepiece.cpp src/tiktoken.cpp)

# Using abseil from sentencepiece/third_party
target_include_directories(tokenizers PUBLIC third-party/sentencepiece/src
third-party/sentencepiece include)
target_include_directories(
tokenizers PUBLIC third-party/sentencepiece/src third-party/sentencepiece
include third-party/re2)

target_link_libraries(tokenizers PUBLIC sentencepiece-static)
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)

# Build test
if(TOKENIZERS_BUILD_TEST)
include(FetchContent)
# CMAKE
FetchContent_Declare(
googletest
# Specify the commit you depend on and update it regularly.
URL https://github.com/google/googletest/archive/5376968f6948923e2411081fd9372e71a59d8e77.zip
)
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
set(gtest_force_shared_crt
ON
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/test/resources)
add_executable(sentencepiece_test test/test_sentencepiece.cpp)
target_include_directories(
sentencepiece_test PUBLIC third-party/sentencepiece/src
third-party/sentencepiece include GTEST_INCLUDE_PATH)
sentencepiece_test
PUBLIC third-party/sentencepiece/src third-party/sentencepiece include
GTEST_INCLUDE_PATH)
target_link_libraries(sentencepiece_test PUBLIC tokenizers gtest_main)

# tiktoken tests
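One note on the set(ENV{RESOURCES_PATH} ...) line above: it sets the variable in CMake's own configure-time environment, and the test binary presumably reads it back with std::getenv at run time. A minimal sketch of that lookup, assuming this is how test_sentencepiece.cpp resolves its fixtures (the helper name is illustrative):

#include <cstdlib>
#include <string>

// Resolve the directory holding test fixtures; falls back to the
// in-tree default when the environment variable is not set.
static std::string resources_path() {
  const char *env = std::getenv("RESOURCES_PATH");
  return env ? std::string(env) : std::string("test/resources");
}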
8 changes: 7 additions & 1 deletion include/error.h
@@ -41,14 +41,20 @@ enum class Error : error_code_t {
/// Token out of range.
OutOfRange = 0x03,

/// Artifact load failure.
/// Tokenizer artifact load failure.
LoadFailure = 0x04,

/// Encode failure.
EncodeFailure = 0x05,

/// Base64 decode failure.
Base64DecodeFailure = 0x06,

/// Failed to parse tokenizer artifact.
ParseFailure = 0x07,

/// Decode failure.
DecodeFailure = 0x08,
};

} // namespace tokenizers
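To make the two new codes concrete, here is a minimal sketch of a Result-returning decode path that could surface them; the function and member names are hypothetical, and returning a bare Error from a Result-returning function matches how TK_UNWRAP (in result.h below) propagates failures:

// Hypothetical decode path; decoder_ is an illustrative token-to-string map.
Result<std::string> decode(uint64_t token) const {
  auto it = decoder_.find(token);
  if (it == decoder_.end()) {
    return Error::DecodeFailure;
  }
  return it->second;
}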
53 changes: 53 additions & 0 deletions include/result.h
@@ -177,3 +177,56 @@ template <typename T> T *Result<T>::operator->() {
}

} // namespace tokenizers

/**
* Unwrap a Result to obtain its value. If the Result contains an error,
 * propagate the error via trivial function return.
*
* Note: A function using TK_UNWRAP should itself return a Result or Error.
*
* @param[in] result__ Expression yielding the result to unwrap.
* @param[in] ... Optional format string for the log error message and its
* arguments.
*/
#define TK_UNWRAP(result__, ...) TK_INTERNAL_UNWRAP(result__, ##__VA_ARGS__)

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP(...) \
TK_INTERNAL_UNWRAP_SELECT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) \
(__VA_ARGS__)

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_SELECT(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, \
...) \
TK_INTERNAL_UNWRAP_##N

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_1(result__) \
({ \
auto et_result__ = (result__); \
if (!et_result__.ok()) { \
return et_result__.error(); \
} \
std::move(*et_result__); \
})

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_2(result__, message__, ...) \
({ \
auto et_result__ = (result__); \
if (!et_result__.ok()) { \
TK_LOG(Error, message__, ##__VA_ARGS__); \
return et_result__.error(); \
} \
std::move(*et_result__); \
})

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_3 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_4 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_5 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_6 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_7 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_8 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_9 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_10 TK_INTERNAL_UNWRAP_2
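A usage sketch for TK_UNWRAP, with a hypothetical read_file helper assumed to return Result<std::string>. Note that the macro expands to a GNU statement expression (({ ... })), so it requires GCC or Clang, and the enclosing function must itself return a Result or Error for the early return to type-check:

// Hypothetical caller; on failure the error is logged and propagated.
Result<std::string> load_tokenizer_data(const std::string &path) {
  std::string contents =
      TK_UNWRAP(read_file(path), "Failed to read %s", path.c_str());
  return contents;
}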
65 changes: 43 additions & 22 deletions include/tiktoken.h
@@ -9,6 +9,7 @@
// Tiktoken header
// Used by OpenAI, adapted from https://github.com/sewenew/tokenizer
#include "re2/re2.h"
#include "result.h"
#include "tokenizer.h"
#include <cstdint>

@@ -20,9 +21,24 @@ using Re2UPtr = std::unique_ptr<re2::RE2>;

namespace tokenizers {

static constexpr int32_t kSpecialTokensSize = 256;
static constexpr size_t kBOSTokenIndex = 0;
static constexpr size_t kEOSTokenIndex = 1;

class Tiktoken : public Tokenizer {
public:
explicit Tiktoken();
explicit Tiktoken(std::unique_ptr<std::vector<std::string>> special_tokens,
size_t bos_token_index, size_t eos_token_index)
: _special_tokens(std::move(special_tokens)),
_bos_token_index(bos_token_index), _eos_token_index(eos_token_index) {
assert(_bos_token_index < _special_tokens->size());
assert(_eos_token_index < _special_tokens->size());
};

explicit Tiktoken()
: _special_tokens(_get_default_special_tokens()),
_bos_token_index(kBOSTokenIndex), _eos_token_index(kEOSTokenIndex){};

~Tiktoken() override;

Error load(const std::string &tokenizer_path) override;
@@ -34,37 +50,42 @@ class Tiktoken : public Tokenizer {
uint64_t token) const override;

private:
static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) {
Encoder special_tokens;
special_tokens.emplace("<|begin_of_text|>", num_base_tokens++);
special_tokens.emplace("<|end_of_text|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++);
special_tokens.emplace("<|start_header_id|>", num_base_tokens++);
special_tokens.emplace("<|end_header_id|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++);
special_tokens.emplace("<|eot_id|>", num_base_tokens++);
for (auto i = 5; i < 251; ++i) {
special_tokens.emplace("<|reserved_special_token_" + std::to_string(i) +
"|>",
num_base_tokens++);
static inline std::unique_ptr<std::vector<std::string>>
_get_default_special_tokens() {
auto special_tokens =
std::make_unique<std::vector<std::string>>(std::vector<std::string>{
"<|begin_of_text|>", "<|end_of_text|>",
"<|reserved_special_token_0|>", "<|reserved_special_token_1|>",
"<|finetune_right_pad_id|>", "<|step_id|>", "<|start_header_id|>",
"<|end_header_id|>", "<|eom_id|>", "<|eot_id|>", "<|python_tag|>"});
// pad the rest of the special tokens with reserved tokens
ssize_t reserved_special_token_num = 2;
while (special_tokens->size() < kSpecialTokensSize) {
special_tokens->emplace_back(
"<|reserved_special_token_" +
std::to_string(reserved_special_token_num++) + "|>");
}
return special_tokens;
}

template <typename T>
std::pair<std::optional<std::string>, re2::StringPiece>
_split_with_allowed_special_token(re2::StringPiece &input,
const T &allowed_special);
const T &allowed_special) const;

void _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
uint64_t &last_piece_token_len);
Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
uint64_t &last_piece_token_len) const;

template <typename T>
std::pair<std::vector<uint64_t>, uint64_t>
_encode_with_special_token(const std::string &text, const T &allowed_special);
Result<std::pair<std::vector<uint64_t>, uint64_t>>
_encode_with_special_token(const std::string &text,
const T &allowed_special) const;

Encoder _build_special_token_encoder(ssize_t num_base_tokens) const;

std::unique_ptr<std::vector<std::string>> _special_tokens;
size_t _bos_token_index;
size_t _eos_token_index;

// Removed negative lookahead \s+(?!\S) since it's not supported by RE2.
const std::string _pattern =
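To illustrate the new constructor pair: callers can now supply their own special-token table and BOS/EOS positions instead of the built-in defaults. A sketch with illustrative token strings; the asserts in the constructor only require both indices to be in range:

// Custom table: three special tokens, BOS at index 0, EOS at index 1.
auto special = std::make_unique<std::vector<std::string>>(
    std::vector<std::string>{"<|bos|>", "<|eos|>", "<|pad|>"});
Tiktoken custom(std::move(special), /*bos_token_index=*/0,
                /*eos_token_index=*/1);

// The default constructor keeps the old behavior, now backed by the
// 256-entry padded table from _get_default_special_tokens().
Tiktoken fallback;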