From a80f6603432dc8a47c2b5fa76d11992e7ee844c1 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 19 Dec 2023 12:11:19 -0500 Subject: [PATCH] src: implement FastByteLengthUtf8 with simdutf::utf8_length_from_latin1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR-URL: https://github.com/nodejs/node/pull/50840 Reviewed-By: Yagiz Nizipli Reviewed-By: Joyee Cheung Reviewed-By: James M Snell Reviewed-By: Vinícius Lourenço Claro Cardoso --- benchmark/buffers/buffer-bytelength-string.js | 5 ++++- src/node_buffer.cc | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmark/buffers/buffer-bytelength-string.js b/benchmark/buffers/buffer-bytelength-string.js index fc0c005e7f9e6a..143da0215a613b 100644 --- a/benchmark/buffers/buffer-bytelength-string.js +++ b/benchmark/buffers/buffer-bytelength-string.js @@ -2,7 +2,8 @@ const common = require('../common'); const bench = common.createBenchmark(main, { - type: ['one_byte', 'two_bytes', 'three_bytes', 'four_bytes'], + type: ['one_byte', 'two_bytes', 'three_bytes', + 'four_bytes', 'latin1'], encoding: ['utf8', 'base64'], repeat: [1, 2, 16, 256], // x16 n: [4e6], @@ -14,6 +15,8 @@ const chars = { two_bytes: 'ΰαβγδεζηθικλμνξο', three_bytes: '挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', four_bytes: '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢', + latin1: 'Un homme sage est supérieur à toutes ' + + 'les insultes qui peuvent lui être adressées, et la meilleure réponse est la patience et la modération.', }; function getInput(type, repeat, encoding) { diff --git a/src/node_buffer.cc b/src/node_buffer.cc index ff041274f90d24..300060f9d24290 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -743,13 +743,17 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) { uint32_t FastByteLengthUtf8(Local<Value> receiver, const v8::FastOneByteString& source) { - uint32_t result = 0; + // For short inputs, the function call overhead to simdutf is maybe + // not worth it, reserve simdutf for long strings. 
+ if (source.length > 128) { + return simdutf::utf8_length_from_latin1(source.data, source.length); + } uint32_t length = source.length; + uint32_t result = length; const uint8_t* data = reinterpret_cast<const uint8_t*>(source.data); for (uint32_t i = 0; i < length; ++i) { result += (data[i] >> 7); } - result += length; return result; }