1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -65,6 +65,7 @@ set(tokenizers_source_files
${CMAKE_CURRENT_SOURCE_DIR}/src/re2_regex.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/tekken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp
)
7 changes: 7 additions & 0 deletions README.md
@@ -14,6 +14,13 @@ Compatible with https://github.com/huggingface/tokenizers/.
## Llama2.c tokenizer
Adapted from https://github.com/karpathy/llama2.c.

## Tekken tokenizer
Mistral's Tekken tokenizer (v7), with full support for special tokens, multilingual text, and instruction-tuned conversations (see the usage sketch below):
- **Special token recognition**: `[INST]`, `[/INST]`, `[AVAILABLE_TOOLS]`, etc. are encoded as single tokens rather than split by BPE, which keeps instruction-formatted prompts compact
- **Multilingual support**: complete Unicode handling, including emoji and complex scripts
- **Production-ready**: lossless encode/decode round-trips, backed by comprehensive test coverage
- **Python bindings**: full compatibility with the mistral-common ecosystem
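
A minimal C++ usage sketch, assuming a local `tekken.json` (the path is illustrative; error handling uses the library's `Result`/`Error` API):

```cpp
#include <cstdint>
#include <iostream>

#include <pytorch/tokenizers/tekken.h>

int main() {
  tokenizers::Tekken tokenizer;
  if (tokenizer.load("tekken.json") != tokenizers::Error::Ok) {
    std::cerr << "failed to load tekken.json\n";
    return 1;
  }

  // bos=1 prepends the BOS token; eos=0 appends no EOS token.
  auto ids = tokenizer.encode("[INST] Hello! [/INST]", 1, 0);
  if (ids.error() != tokenizers::Error::Ok) {
    return 1;
  }
  for (uint64_t id : ids.get()) {
    std::cout << id << ' ';
  }
  std::cout << '\n';
  return 0;
}
```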

## License

tokenizers is released under the [BSD 3 license](LICENSE). (Additional
101 changes: 101 additions & 0 deletions include/pytorch/tokenizers/tekken.h
@@ -0,0 +1,101 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*
* @lint-ignore-every LICENSELINT
*/

#pragma once

#include <memory>
#include <optional>
#include <string>
#include <vector>

// Third Party
#include <nlohmann/json.hpp>

// Local
#include <pytorch/tokenizers/bpe_tokenizer_base.h>
#include <pytorch/tokenizers/error.h>
#include <pytorch/tokenizers/regex.h>
#include <pytorch/tokenizers/result.h>

namespace tokenizers {

class Tekken : public detail::BPETokenizerBase {
public:
struct TekkenConfig {
std::string pattern;
size_t num_vocab_tokens;
size_t default_vocab_size;
size_t default_num_special_tokens;
std::string version;
};

struct TokenInfo {
uint64_t rank;
std::string token_bytes; // Base64 encoded
std::optional<std::string> token_str;
};

struct SpecialTokenInfo {
uint64_t rank;
std::string token_str;
bool is_control;
};
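
  // These structs mirror tekken.json's sections: "config" -> TekkenConfig,
  // "vocab" entries -> TokenInfo, "special_tokens" entries ->
  // SpecialTokenInfo. Illustrative vocab entry (field names inferred from
  // this header, not verified against a real file):
  //   {"rank": 0, "token_bytes": "PHVuaz4=", "token_str": "<unk>"}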

explicit Tekken();

// Load from tekken.json file
Error load(const std::string& tokenizer_path) override;

// Support loading with explicit special tokens
Error load_with_special_tokens(
const std::string& tokenizer_path,
const std::vector<SpecialTokenInfo>& special_tokens);

// Get the version string
const std::string& get_version() const {
return _version;
}

protected:
// Virtual methods from BPETokenizerBase
Error _encode(
const std::string& input,
std::vector<uint64_t>& ret,
uint64_t& last_piece_token_len) const override;

void _decode(const std::string& input, std::string& ret) const override;

private:
// Parse the JSON configuration
Result<TekkenConfig> _parse_config(const nlohmann::json& j) const;

// Build token map from JSON vocab
Result<detail::TokenMap> _load_vocab_from_json(
const nlohmann::json& vocab_json,
size_t max_vocab) const;

// Initialize special tokens (fills up to num_special_tokens slots)
std::vector<SpecialTokenInfo> _initialize_special_tokens(
const std::vector<SpecialTokenInfo>& defined_tokens,
size_t num_special_tokens) const;

// Default Tekken pattern
static std::string _get_default_tekken_pattern();

// Default special tokens for Mistral models
static std::vector<SpecialTokenInfo> _get_default_special_tokens();

size_t _num_special_tokens = 1000; // Tekken reserves 1000 slots
std::string _version;
std::string _pattern;
std::unique_ptr<IRegex> _regex;
};

} // namespace tokenizers
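
A hedged sketch of `load_with_special_tokens`, based only on the declarations above; the token strings and ranks here are illustrative, not Mistral's actual defaults:

```cpp
#include <vector>

#include <pytorch/tokenizers/tekken.h>

int main() {
  using tokenizers::Tekken;

  // Illustrative custom special tokens. Special tokens occupy the low
  // reserved ranks (1000 slots by default), so each rank must be < 1000.
  std::vector<Tekken::SpecialTokenInfo> specials = {
      {/*rank=*/0, /*token_str=*/"<unk>", /*is_control=*/true},
      {/*rank=*/1, /*token_str=*/"<s>", /*is_control=*/true},
      {/*rank=*/2, /*token_str=*/"</s>", /*is_control=*/true},
  };

  Tekken tokenizer;
  if (tokenizer.load_with_special_tokens("tekken.json", specials) !=
      tokenizers::Error::Ok) {
    return 1;  // loading failed
  }
  return 0;
}
```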
51 changes: 51 additions & 0 deletions src/python_bindings.cpp
@@ -17,6 +17,7 @@
#include <pytorch/tokenizers/llama2c_tokenizer.h>
#include <pytorch/tokenizers/result.h>
#include <pytorch/tokenizers/sentencepiece.h>
#include <pytorch/tokenizers/tekken.h>
#include <pytorch/tokenizers/tiktoken.h>
#include <pytorch/tokenizers/tokenizer.h>

@@ -253,4 +254,54 @@ PYBIND11_MODULE(pytorch_tokenizers_cpp, m) {
return unwrap_result(self.decode(token, token));
},
py::arg("token"));

// Bind Tekken tokenizer
py::class_<Tekken, Tokenizer>(m, "Tekken")
.def(py::init<>())
.def(
"load",
[](Tekken& self, const std::string& tokenizer_path) {
Error error = self.load(tokenizer_path);
if (error != Error::Ok) {
throw std::runtime_error("Failed to load Tekken tokenizer");
}
},
py::arg("tokenizer_path"))
.def(
"encode",
[](const Tekken& self,
const std::string& input,
int8_t bos,
int8_t eos) {
return unwrap_result(self.encode(input, bos, eos));
},
py::arg("input"),
py::arg("bos") = 0,
py::arg("eos") = 0)
.def(
"decode",
[](const Tekken& self, uint64_t token) {
return unwrap_result(self.decode(token, token));
},
py::arg("token"))
.def(
"decode_batch",
[](const Tekken& self, const std::vector<uint64_t>& tokens) {
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
// decode() takes the previous token as context; use 0 as a
// placeholder before the first token in the sequence.
uint64_t prev_token = (i == 0) ? 0 : tokens[i - 1];
auto decoded = self.decode(prev_token, tokens[i]);
if (decoded.error() != Error::Ok) {
throw std::runtime_error("Failed to decode token");
}
result += decoded.get();
}
return result;
},
py::arg("tokens"))
.def("vocab_size", &Tekken::vocab_size)
.def("bos_tok", &Tekken::bos_tok)
.def("eos_tok", &Tekken::eos_tok)
.def("is_loaded", &Tekken::is_loaded)
.def("get_version", &Tekken::get_version);
}
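
Each lambda above surfaces a failed `Result` as a Python exception, either via `unwrap_result` or the explicit check in `decode_batch`. For C++ callers the same pattern looks roughly like this (the helper is hypothetical, not part of the library):

```cpp
#include <stdexcept>
#include <utility>

#include <pytorch/tokenizers/error.h>
#include <pytorch/tokenizers/result.h>

// Hypothetical helper mirroring what the bindings do: convert an error
// Result into an exception, otherwise hand back the contained value.
template <typename T>
T value_or_throw(tokenizers::Result<T> result, const char* context) {
  if (result.error() != tokenizers::Error::Ok) {
    throw std::runtime_error(context);
  }
  return std::move(result.get());
}
```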