diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 2d2cc42655d42..e4ccf0ffdc340 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -26,6 +26,10 @@ set(LIBC_BUILD_SCRIPTS_DIR "${LIBC_SOURCE_DIR}/utils/build_scripts")
 # Flags to pass down to the compiler while building the libc functions.
 set(LIBC_COMPILE_OPTIONS_DEFAULT "" CACHE STRING "Architecture to tell clang to optimize for (e.g. -march=... or -mcpu=...)")
 
+include(common_libc_tuners.cmake)
+
+list(APPEND LIBC_COMPILE_OPTIONS_DEFAULT ${LIBC_COMMON_TUNE_OPTIONS})
+
 # Check --print-resource-dir to find the compiler resource dir if this flag
 # is supported by the compiler.
 execute_process(
diff --git a/libc/common_libc_tuners.cmake b/libc/common_libc_tuners.cmake
new file mode 100644
index 0000000000000..cde28fadcbf03
--- /dev/null
+++ b/libc/common_libc_tuners.cmake
@@ -0,0 +1,14 @@
+# ------------------------------------------------------------------------------
+# Common tuning option definitions.
+# ------------------------------------------------------------------------------
+
+set(LIBC_COMMON_TUNE_OPTIONS "")
+
+option(LIBC_UNSAFE_STRING_WIDE_READ "Functions searching for the first character in a string such as strlen will read the string as int sized blocks instead of bytes. This relies on undefined behavior and may fail on some systems, but improves performance on long strings." OFF)
+if(LIBC_UNSAFE_STRING_WIDE_READ)
+  if(LLVM_USE_SANITIZER)
+    message(FATAL_ERROR "LIBC_UNSAFE_STRING_WIDE_READ is set at the same time as a sanitizer. LIBC_UNSAFE_STRING_WIDE_READ causes strlen and memchr to read beyond the end of their target strings, which is undefined behavior caught by sanitizers.")
+  else()
+    list(APPEND LIBC_COMMON_TUNE_OPTIONS "-DLIBC_UNSAFE_STRING_WIDE_READ")
+  endif()
+endif()
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 2c960af216fc6..7719178da457b 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -195,6 +195,7 @@ add_entrypoint_object(
   HDRS
     strlen.h
   DEPENDS
+    .string_utils
     libc.include.string
 )
 
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index f8de6964983ce..24be2ab889b54 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -23,24 +23,142 @@
 namespace __llvm_libc {
 namespace internal {
 
-// Returns the length of a string, denoted by the first occurrence
-// of a null terminator.
-static inline size_t string_length(const char *src) {
+// Returns a Word in which every byte is a copy of the low 8 bits of 'byte'.
+template <typename Word> constexpr Word repeat_byte(Word byte) {
+  constexpr size_t BITS_IN_BYTE = 8;
+  constexpr size_t BYTE_MASK = 0xff;
+  Word result = 0;
+  byte = byte & BYTE_MASK;
+  for (size_t i = 0; i < sizeof(Word); ++i)
+    result = (result << BITS_IN_BYTE) | byte;
+  return result;
+}
+
+// Returns true if any byte of 'block' is zero, without testing the bytes one
+// at a time. Three steps are combined into a single expression:
+// 1. Subtract 0x01 from every byte. A 0x00 byte has to borrow and becomes
+//    0xff (or 0xfe if the byte below it borrowed as well), which sets its
+//    high bit. Bytes that already had the high bit set may keep it too.
+// 2. Mask with the inverse of the original block. This clears the high bit
+//    of every byte that had it set before the subtraction.
+// 3. Mask with 0x80 in every byte so that only the high bits are examined.
+// A borrow can spill into the byte above a zero byte, so the surviving bits
+// do not identify which byte is zero; only the result as a whole matters:
+// it is non-zero exactly when the block contains a zero byte.
+template <typename Word> constexpr bool has_zeroes(Word block) {
+  constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
+  constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
+  Word subtracted = block - LOW_BITS;
+  Word inverted = ~block;
+  return (subtracted & inverted & HIGH_BITS) != 0;
+}
+
+// Returns the length of 'src' by scanning it a Word at a time once the
+// pointer is Word-aligned. The block holding the terminator is loaded whole,
+// so bytes after the null terminator may be read.
+template <typename Word>
+static inline size_t string_length_wide_read(const char *src) {
+  const char *char_ptr = src;
+  // Step 1: read 1 byte at a time to align to block size
+  for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
+       ++char_ptr) {
+    if (*char_ptr == '\0')
+      return char_ptr - src;
+  }
+  // Step 2: read blocks until one contains a zero byte; char_ptr then points
+  // at the start of that block.
+  const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
+  while (!has_zeroes<Word>(*block_ptr))
+    ++block_ptr;
+  char_ptr = reinterpret_cast<const char *>(block_ptr);
+  // Step 3: find the zero in the block
+  while (*char_ptr != '\0')
+    ++char_ptr;
+  return char_ptr - src;
+}
+
+// Returns the length of 'src' by checking one byte at a time.
+static inline size_t string_length_byte_read(const char *src) {
   size_t length;
   for (length = 0; *src; ++src, ++length)
     ;
   return length;
 }
 
-// Returns the first occurrence of 'ch' within the first 'n' characters of
-// 'src'. If 'ch' is not found, returns nullptr.
-static inline void *find_first_character(const unsigned char *src,
-                                         unsigned char ch, size_t n) {
+// Returns the length of a string, denoted by the first occurrence
+// of a null terminator.
+static inline size_t string_length(const char *src) {
+#ifdef LIBC_UNSAFE_STRING_WIDE_READ
+  // Unsigned int is the default size for most processors, and on x86-64 it
+  // performs better than larger sizes when the src pointer can't be assumed to
+  // be aligned to a word boundary, so it's the size we use for reading the
+  // string a block at a time.
+  return string_length_wide_read<unsigned int>(src);
+#else
+  return string_length_byte_read(src);
+#endif
+}
+
+// Wide-read search for 'ch' in the first 'n' bytes of 'src'; returns nullptr
+// if it is absent. Whole Word blocks are loaded, so bytes beyond 'n' (and
+// possibly beyond the buffer) may be read; only a match before 'n' counts.
+template <typename Word>
+static inline void *find_first_character_wide_read(const unsigned char *src,
+                                                   unsigned char ch, size_t n) {
+  const unsigned char *char_ptr = src;
+  size_t cur = 0;
+
+  // Step 1: read 1 byte at a time to align to block size
+  for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
+       ++char_ptr, ++cur) {
+    if (*char_ptr == ch)
+      return const_cast<unsigned char *>(char_ptr);
+  }
+
+  const Word ch_mask = repeat_byte<Word>(ch);
+
+  // Step 2: skip blocks that hold no match. char_ptr moves in lockstep with
+  // cur, so Step 3 starts at offset cur and the bound check stays accurate.
+  for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
+       cur < n && !has_zeroes<Word>((*block_ptr) ^ ch_mask);
+       cur += sizeof(Word)) {
+    char_ptr = reinterpret_cast<const unsigned char *>(++block_ptr);
+  }
+
+  // Step 3: find the match in the block
+  for (; cur < n && *char_ptr != ch; ++char_ptr, ++cur)
+    ;
+
+  return cur < n ? const_cast<unsigned char *>(char_ptr) : nullptr;
+}
+
+// Byte-wise search of the first 'n' bytes of 'src'; nullptr if 'ch' is absent.
+static inline void *find_first_character_byte_read(const unsigned char *src,
+                                                   unsigned char ch, size_t n) {
   for (; n && *src != ch; --n, ++src)
     ;
   return n ? const_cast<unsigned char *>(src) : nullptr;
 }
 
+// Returns the first occurrence of 'ch' within the first 'max_strlen'
+// characters of 'src'. If 'ch' is not found, returns nullptr.
+static inline void *find_first_character(const unsigned char *src,
+                                         unsigned char ch, size_t max_strlen) {
+#ifdef LIBC_UNSAFE_STRING_WIDE_READ
+  // If the maximum size of the string is small, the overhead of aligning to a
+  // word boundary and generating a bitmask of the appropriate size may be
+  // greater than the gains from reading larger chunks. Based on some testing,
+  // the crossover point between when it's faster to just read bytewise and read
+  // blocks is somewhere between 16 and 32, so 4 times the size of the block
+  // should be in that range.
+  // Unsigned int is used for the same reason as in strlen.
+  using BlockType = unsigned int;
+  if (max_strlen > (sizeof(BlockType) * 4))
+    return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
+#endif
+  return find_first_character_byte_read(src, ch, max_strlen);
+}
+
 // Returns the maximum length span that contains only characters not found in
 // 'segment'. If no characters are found, returns the length of 'src'.
 static inline size_t complementary_span(const char *src, const char *segment) {