diff --git a/src/huggingface_tokenizer.cc b/src/huggingface_tokenizer.cc
index 2136005..6cbe0d8 100644
--- a/src/huggingface_tokenizer.cc
+++ b/src/huggingface_tokenizer.cc
@@ -15,7 +15,11 @@ namespace tokenizers {
  */
 class HFTokenizer : public Tokenizer {
  public:
-  explicit HFTokenizer(TokenizerHandle handle) : handle_(handle) {}
+  explicit HFTokenizer(TokenizerHandle handle) : handle_(handle) {
+    #ifdef COMPILE_WASM_RUNTIME  // passed as -DCOMPILE_WASM_RUNTIME by web/build.sh
+    setenv("TOKENIZERS_PARALLELISM", "false", true);  // nonzero overwrite flag: replace any existing value
+    #endif  // COMPILE_WASM_RUNTIME
+  }
 
   HFTokenizer(const HFTokenizer&) = delete;
   HFTokenizer(HFTokenizer&& other) { std::swap(other.handle_, handle_); }
diff --git a/web/build.sh b/web/build.sh
index e4e048c..ecedadf 100755
--- a/web/build.sh
+++ b/web/build.sh
@@ -5,7 +5,7 @@ rustup target add wasm32-unknown-emscripten
 
 mkdir -p build
 cd build
-emcmake cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-O3"
+emcmake cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-O3 -DCOMPILE_WASM_RUNTIME"  # macro checked in src/huggingface_tokenizer.cc
 emmake make tokenizers_cpp tokenizers_c sentencepiece-static -j8
 cd ..
 
diff --git a/web/tests/src/index.ts b/web/tests/src/index.ts
index caaa37c..bb7b5bc 100644
--- a/web/tests/src/index.ts
+++ b/web/tests/src/index.ts
@@ -48,8 +48,21 @@ async function testLlamaTokenizer() {
   }
 }
 
+// Regression check: without COMPILE_WASM_RUNTIME defined in the build, encoding with this tokenizer triggers parallel processing, which leads to an error.
+async function testBertTokenizer() {
+  console.log("Bert Tokenizer");
+  const modelBuffer = await (await
+    fetch("https://huggingface.co/Snowflake/snowflake-arctic-embed-l/raw/main/tokenizer.json")  // downloaded at test time; response status is not checked
+  ).arrayBuffer();
+  const tok = await Tokenizer.fromJSON(modelBuffer);
+  const text = "What is the capital of Canada?";
+  const ids = tok.encode(text);  // the call that the comment above refers to
+  console.log(ids);  // ids are only logged; no expected values are asserted
+}
+
 async function main() {
   await testJSONTokenizer()
+  await testBertTokenizer();  // awaited between the JSON and Llama tests
   await testLlamaTokenizer()
 }
 