diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 4e9a9b66a63a7..f4e2a62d14b31 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -81,9 +81,8 @@ function(_get_compile_options_from_config output_var) list(APPEND config_options "-DLIBC_QSORT_IMPL=${LIBC_CONF_QSORT_IMPL}") endif() - if(LIBC_CONF_STRING_UNSAFE_WIDE_READ) - list(APPEND config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ") - endif() + list(APPEND config_options "-DLIBC_COPT_STRING_LENGTH_IMPL=${LIBC_CONF_STRING_LENGTH_IMPL}") + list(APPEND config_options "-DLIBC_COPT_FIND_FIRST_CHARACTER_IMPL=${LIBC_CONF_FIND_FIRST_CHARACTER_IMPL}") if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING) list(APPEND config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING") diff --git a/libc/config/config.json b/libc/config/config.json index cfbe9a43948ea..12596a00911e2 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -40,6 +40,7 @@ "value": false, "doc": "Use an alternative printf float implementation based on 320-bit floats" }, + "LIBC_CONF_PRINTF_DISABLE_FIXED_POINT": { "value": false, "doc": "Disable printing fixed point values in printf and friends." @@ -64,9 +65,13 @@ } }, "string": { - "LIBC_CONF_STRING_UNSAFE_WIDE_READ": { - "value": false, - "doc": "Read more than a byte at a time to perform byte-string operations like strlen." + "LIBC_CONF_STRING_LENGTH_IMPL": { + "value": "element", + "doc": "Selects the implementation for string-length: 'element', 'wide', 'generic' (vector), or 'arch'." + }, + "LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": { + "value": "element", + "doc": "Selects the implementation for find-first-character-related functions: 'element', 'wide', 'generic' (vector), or 'arch'." }, "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": { "value": false, diff --git a/libc/config/linux/arm/config.json b/libc/config/linux/arm/config.json index e7ad4544b104d..caa16744d389f 100644 --- a/libc/config/linux/arm/config.json +++ b/libc/config/linux/arm/config.json @@ -1,7 +1,10 @@ { "string": { - "LIBC_CONF_STRING_UNSAFE_WIDE_READ": { - "value": false + "LIBC_CONF_STRING_LENGTH_IMPL": { + "value": "element" + } + "LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": { + "value": "element" } } } diff --git a/libc/config/linux/config.json b/libc/config/linux/config.json index 30e8b2cdadabe..93f5a1ef1f184 100644 --- a/libc/config/linux/config.json +++ b/libc/config/linux/config.json @@ -1,7 +1,10 @@ { "string": { - "LIBC_CONF_STRING_UNSAFE_WIDE_READ": { - "value": true + "LIBC_CONF_STRING_LENGTH_IMPL": { + "value": "generic", + }, + "LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": { + "value": "wide", } } } diff --git a/libc/config/linux/riscv/config.json b/libc/config/linux/riscv/config.json index e7ad4544b104d..caa16744d389f 100644 --- a/libc/config/linux/riscv/config.json +++ b/libc/config/linux/riscv/config.json @@ -1,7 +1,10 @@ { "string": { - "LIBC_CONF_STRING_UNSAFE_WIDE_READ": { - "value": false + "LIBC_CONF_STRING_LENGTH_IMPL": { + "value": "element" + } + "LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": { + "value": "element" } } } diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index e23fc824ce7c8..3049738aff6e7 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -58,8 +58,9 @@ to learn about the defaults for your platform and target. * **"setjmp" options** - ``LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER``: Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved. * **"string" options** + - ``LIBC_CONF_FIND_FIRST_CHARACTER_IMPL``: Selects the implementation for find-first-character-related functions: 'element', 'wide', 'generic' (vector), or 'arch'. - ``LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING``: Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled. - - ``LIBC_CONF_STRING_UNSAFE_WIDE_READ``: Read more than a byte at a time to perform byte-string operations like strlen. + - ``LIBC_CONF_STRING_LENGTH_IMPL``: Selects the implementation for string-length: 'element', 'wide', 'generic' (vector), or 'arch'. * **"threads" options** - ``LIBC_CONF_THREAD_MODE``: The implementation used for Mutex, acceptable values are LIBC_THREAD_MODE_PLATFORM, LIBC_THREAD_MODE_SINGLE, and LIBC_THREAD_MODE_EXTERNAL. * **"time" options** diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h index 87f5ccdd56e23..f7385ce75a813 100644 --- a/libc/src/string/memory_utils/aarch64/inline_strlen.h +++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h @@ -16,7 +16,7 @@ namespace LIBC_NAMESPACE_DECL { -namespace neon { +namespace internal::arch { [[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t string_length(const char *src) { using Vector __attribute__((may_alias)) = uint8x8_t; @@ -44,9 +44,7 @@ string_length(const char *src) { (cpp::countr_zero(cmp) >> 3)); } } -} // namespace neon - -namespace string_length_impl = neon; +} // namespace internal::arch } // namespace LIBC_NAMESPACE_DECL #endif // __ARM_NEON diff --git a/libc/src/string/memory_utils/generic/inline_strlen.h b/libc/src/string/memory_utils/generic/inline_strlen.h index 69700e801bcea..7630c0b7caedf 100644 --- a/libc/src/string/memory_utils/generic/inline_strlen.h +++ b/libc/src/string/memory_utils/generic/inline_strlen.h @@ -14,7 +14,7 @@ #include "src/__support/common.h" namespace LIBC_NAMESPACE_DECL { -namespace internal { +namespace generic { // Exploit the underlying integer representation to do a variable shift. LIBC_INLINE constexpr cpp::simd_mask shift_mask(cpp::simd_mask m, @@ -46,9 +46,8 @@ LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) { cpp::find_first_set(mask); } } -} // namespace internal +} // namespace generic -namespace string_length_impl = internal; } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_INLINE_STRLEN_H diff --git a/libc/src/string/memory_utils/x86_64/inline_strlen.h b/libc/src/string/memory_utils/x86_64/inline_strlen.h index 9e10d58363393..3d93960605f0c 100644 --- a/libc/src/string/memory_utils/x86_64/inline_strlen.h +++ b/libc/src/string/memory_utils/x86_64/inline_strlen.h @@ -15,7 +15,8 @@ namespace LIBC_NAMESPACE_DECL { -namespace string_length_internal { +namespace internal::arch { + // Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero. template LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static Mask @@ -92,15 +93,18 @@ namespace avx512 { } } // namespace avx512 #endif -} // namespace string_length_internal +[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) { #if defined(__AVX512F__) -namespace string_length_impl = string_length_internal::avx512; + return avx512::string_length(src); #elif defined(__AVX2__) -namespace string_length_impl = string_length_internal::avx2; + return avx2::string_length(src); #else -namespace string_length_impl = string_length_internal::sse2; + return sse2::string_length(src); #endif +} + +} // namespace internal::arch } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h index 7feef56fb3676..f4cecc5973978 100644 --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -24,21 +24,56 @@ #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/string/memory_utils/inline_memcpy.h" -#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ) #if LIBC_HAS_VECTOR_TYPE #include "src/string/memory_utils/generic/inline_strlen.h" -#elif defined(LIBC_TARGET_ARCH_IS_X86) +#endif +#if defined(LIBC_TARGET_ARCH_IS_X86) #include "src/string/memory_utils/x86_64/inline_strlen.h" -#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_NEON) +#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) #include "src/string/memory_utils/aarch64/inline_strlen.h" -#else -namespace string_length_impl = LIBC_NAMESPACE::wide_read; #endif -#endif // defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ) namespace LIBC_NAMESPACE_DECL { namespace internal { +#if !LIBC_HAS_VECTOR_TYPE +// Forward any generic vector impls to architecture specific ones +namespace arch {} +namespace generic = arch; +#endif + +namespace element { +// Element-by-element (usually a byte, but wider for wchar) implementations of +// functions that search for data. Slow, but easy to understand and analyze. + +// Returns the length of a string, denoted by the first occurrence +// of a null terminator. +LIBC_INLINE size_t string_length(const char *src) { + size_t length; + for (length = 0; *src; ++src, ++length) + ; + return length; +} + +template LIBC_INLINE size_t string_length_element(const T *src) { + size_t length; + for (length = 0; *src; ++src, ++length) + ; + return length; +} + +LIBC_INLINE void *find_first_character(const unsigned char *src, + unsigned char ch, size_t n) { + for (; n && *src != ch; --n, ++src) + ; + return n ? const_cast(src) : nullptr; +} +} // namespace element + +namespace wide { +// Generic, non-vector, implementations of functions that search for data +// by reading from memory block-by-block. + template LIBC_INLINE constexpr Word repeat_byte(Word byte) { static_assert(CHAR_BIT == 8, "repeat_byte assumes a byte is 8 bits."); constexpr size_t BITS_IN_BYTE = CHAR_BIT; @@ -74,8 +109,13 @@ template LIBC_INLINE constexpr bool has_zeroes(Word block) { return (subtracted & inverted & HIGH_BITS) != 0; } -template -LIBC_INLINE size_t string_length_wide_read(const char *src) { +// Unsigned int is the default size for most processors, and on x86-64 it +// performs better than larger sizes when the src pointer can't be assumed to +// be aligned to a word boundary, so it's the size we use for reading the +// string a block at a time. + +LIBC_INLINE size_t string_length(const char *src) { + using Word = unsigned int; const char *char_ptr = src; // Step 1: read 1 byte at a time to align to block size for (; reinterpret_cast(char_ptr) % sizeof(Word) != 0; @@ -95,37 +135,23 @@ LIBC_INLINE size_t string_length_wide_read(const char *src) { return static_cast(char_ptr - src); } -namespace wide_read { -LIBC_INLINE size_t string_length(const char *src) { - // Unsigned int is the default size for most processors, and on x86-64 it - // performs better than larger sizes when the src pointer can't be assumed to - // be aligned to a word boundary, so it's the size we use for reading the - // string a block at a time. - return string_length_wide_read(src); -} - -} // namespace wide_read - -// Returns the length of a string, denoted by the first occurrence -// of a null terminator. -template LIBC_INLINE size_t string_length(const T *src) { -#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ - if constexpr (cpp::is_same_v) - return string_length_impl::string_length(src); -#endif - size_t length; - for (length = 0; *src; ++src, ++length) - ; - return length; -} - -template LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE void * -find_first_character_wide_read(const unsigned char *src, unsigned char ch, - size_t n) { +find_first_character(const unsigned char *src, unsigned char ch, + size_t max_strlen = cpp::numeric_limits::max()) { + using Word = unsigned int; const unsigned char *char_ptr = src; size_t cur = 0; + // If the maximum size of the string is small, the overhead of aligning to a + // word boundary and generating a bitmask of the appropriate size may be + // greater than the gains from reading larger chunks. Based on some testing, + // the crossover point between when it's faster to just read bytewise and read + // blocks is somewhere between 16 and 32, so 4 times the size of the block + // should be in that range. + if (max_strlen < (sizeof(Word) * 4)) { + return element::find_first_character(src, ch, max_strlen); + } + size_t n = max_strlen; // Step 1: read 1 byte at a time to align to block size for (; reinterpret_cast(char_ptr) % sizeof(Word) != 0 && cur < n; ++char_ptr, ++cur) { @@ -153,31 +179,35 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch, return const_cast(char_ptr); } -LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src, - unsigned char ch, size_t n) { - for (; n && *src != ch; --n, ++src) - ; - return n ? const_cast(src) : nullptr; +} // namespace wide + +// Dispatch mechanism for implementations of performance-sensitive +// functions. Always measure, but generally from lower- to higher-performance +// order: +// +// 1. element - read char-by-char or wchar-by-wchar +// 3. wide - read word-by-word +// 3. generic - read using clang's internal vector types +// 4. arch - hand-coded per architecture. Possibly in asm, or with intrinsics. +// +// The called implemenation is chosen at build-time by setting +// LIBC_CONF_{FUNC}_IMPL in config.json +static constexpr auto &string_length_impl = + LIBC_COPT_STRING_LENGTH_IMPL::string_length; +static constexpr auto &find_first_character_impl = + LIBC_COPT_FIND_FIRST_CHARACTER_IMPL::find_first_character; + +template LIBC_INLINE size_t string_length(const T *src) { + if constexpr (cpp::is_same_v) + return string_length_impl(src); + return element::string_length_element(src); } // Returns the first occurrence of 'ch' within the first 'n' characters of // 'src'. If 'ch' is not found, returns nullptr. LIBC_INLINE void *find_first_character(const unsigned char *src, unsigned char ch, size_t max_strlen) { -#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ - // If the maximum size of the string is small, the overhead of aligning to a - // word boundary and generating a bitmask of the appropriate size may be - // greater than the gains from reading larger chunks. Based on some testing, - // the crossover point between when it's faster to just read bytewise and read - // blocks is somewhere between 16 and 32, so 4 times the size of the block - // should be in that range. - // Unsigned int is used for the same reason as in strlen. - using BlockType = unsigned int; - if (max_strlen > (sizeof(BlockType) * 4)) { - return find_first_character_wide_read(src, ch, max_strlen); - } -#endif - return find_first_character_byte_read(src, ch, max_strlen); + return find_first_character_impl(src, ch, max_strlen); } // Returns the maximum length span that contains only characters not found in diff --git a/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl b/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl index 259d4d292fcf4..6166f52f80f8b 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl @@ -39,7 +39,8 @@ LIBC_CONFIGURE_OPTIONS = [ # "LIBC_COPT_SCANF_DISABLE_FLOAT", # "LIBC_COPT_SCANF_DISABLE_INDEX_MODE", "LIBC_COPT_STDIO_USE_SYSTEM_FILE", - "LIBC_COPT_STRING_UNSAFE_WIDE_READ", + "LIBC_COPT_STRING_LENGTH_IMPL=generic", + "LIBC_COPT_FIND_FIRST_CHARACTER_IMPL=wide", # "LIBC_COPT_STRTOFLOAT_DISABLE_CLINGER_FAST_PATH", # "LIBC_COPT_STRTOFLOAT_DISABLE_EISEL_LEMIRE", # "LIBC_COPT_STRTOFLOAT_DISABLE_SIMPLE_DECIMAL_CONVERSION",