From 9b90337228638fbe9bc103cbe419023fea7c670e Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Fri, 15 Nov 2024 11:08:21 -0500 Subject: [PATCH] [Rust] Bump huggingface tokenizer to 0.20.0 This PR bumps the huggingface tokenizer dependency to version 0.20.0 to address the tokenizer issue in some of the latest models with recently trained tokenizers. --- rust/Cargo.toml | 2 +- rust/src/lib.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 9e1b44e..0758441 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -8,6 +8,6 @@ crate-type = ["staticlib"] [dependencies] -tokenizers = { version = "0.19.1", default-features = false, features = ["onig"] } +tokenizers = { version = "0.20.0", default-features = false, features = ["onig"] } serde = { version = "1.0", features = [ "derive" ] } serde_json = "1.0" diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 98ce523..96b0ea8 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -77,8 +77,8 @@ impl TokenizerWrapper { ); let mut tokenizer = Tokenizer::new(BPE::new(vocab, merges)); tokenizer - .with_pre_tokenizer(byte_level) - .with_decoder(byte_level); + .with_pre_tokenizer(Some(byte_level)) + .with_decoder(Some(byte_level)); TokenizerWrapper { tokenizer: tokenizer, decode_str: String::new(),