-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[libc] add an SVE implementation of strlen #167259
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] add an SVE implementation of strlen #167259
Conversation
|
Microbenchmark is slightly modified from AI generated code: // -*- C++ -*-
// Standalone SVE/NEON/libc strlen microbenchmark (always registers all)
#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <tuple>
#include <vector>
#include <arm_neon.h>
#include <arm_sve.h>
#define LIBC_LIKELY(x) __builtin_expect(!!(x), 1)
#define LIBC_UNLIKELY(x) __builtin_expect(!!(x), 0)
// -----------------------------------------------------------------------------
// NEON implementation
// -----------------------------------------------------------------------------
namespace neon {
[[maybe_unused]] static inline size_t string_length(const char* src) {
using Vector __attribute__((may_alias)) = uint8x8_t;
uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
const Vector* block_ptr =
reinterpret_cast<const Vector*>(src - misalign_bytes);
Vector v = *block_ptr;
Vector vcmp = vceqz_u8(v);
uint64x1_t cmp_mask = vreinterpret_u64_u8(vcmp);
uint64_t cmp = vget_lane_u64(cmp_mask, 0);
cmp >>= (misalign_bytes << 3);
if (cmp) return __builtin_ctzll(cmp) >> 3;
while (true) {
++block_ptr;
v = *block_ptr;
vcmp = vceqz_u8(v);
cmp_mask = vreinterpret_u64_u8(vcmp);
cmp = vget_lane_u64(vcmp, 0);
if (cmp) {
size_t base = reinterpret_cast<uintptr_t>(block_ptr) -
reinterpret_cast<uintptr_t>(src);
return base + ((__builtin_ctzll(cmp)) >> 3);
}
}
}
} // namespace neon
// -----------------------------------------------------------------------------
// SVE implementation
// -----------------------------------------------------------------------------
namespace sve {
[[maybe_unused]] static inline size_t string_length(const char* src) {
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(src);
svsetffr();
const svbool_t all_true = svptrue_b8();
svbool_t cmp_zero;
size_t len = 0;
for (;;) {
svuint8_t data = svldff1_u8(all_true, &ptr[len]);
svbool_t fault_mask = svrdffr_z(all_true);
bool no_fault = svptest_last(all_true, fault_mask);
if (LIBC_LIKELY(no_fault)) {
len += svcntb();
cmp_zero = svcmpeq_n_u8(all_true, data, 0);
bool has_no_zero = !svptest_any(all_true, cmp_zero);
if (LIBC_LIKELY(has_no_zero))
continue;
len -= svcntb();
break;
} else {
cmp_zero = svcmpeq_n_u8(fault_mask, data, 0);
bool has_zero = svptest_any(fault_mask, cmp_zero);
if (LIBC_LIKELY(has_zero)) break;
len += svcntp_b8(all_true, fault_mask);
svsetffr();
continue;
}
}
svbool_t before_zero = svbrkb_z(all_true, cmp_zero);
len += svcntp_b8(all_true, before_zero);
return len;
}
} // namespace sve
// -----------------------------------------------------------------------------
// libc fallback
// -----------------------------------------------------------------------------
namespace syslibc {
static inline size_t string_length(const char* s) { return std::strlen(s); }
} // namespace syslibc
// -----------------------------------------------------------------------------
// Benchmark harness
// -----------------------------------------------------------------------------
struct Impl {
const char* name;
size_t (*fn)(const char*);
};
static std::vector<Impl> get_impls() {
return {
{"libc", &syslibc::string_length},
{"neon", &neon::string_length},
{"sve", &sve::string_length}
};
}
struct Result {
std::string name;
double ns_per_call;
double gib_per_s;
};
static inline uint64_t now_ns() {
using clock = std::chrono::steady_clock;
return std::chrono::duration_cast<std::chrono::nanoseconds>(
clock::now().time_since_epoch()).count();
}
// correctness check
static bool run_correctness(const std::vector<Impl>& impls) {
bool ok = true;
std::vector<size_t> sizes = {0,1,3,7,8,9,15,16,31,32,63,64,127,128,255,256,511,512,1023,1024,4096};
for (size_t n : sizes) {
std::unique_ptr<char[]> s(new char[n + 2]);
std::fill(s.get(), s.get() + n, 'A');
s[n] = 0;
size_t ref = syslibc::string_length(s.get());
for (auto& impl : impls) {
size_t got = impl.fn(s.get());
if (got != ref) {
std::cerr << "FAIL " << impl.name << " len=" << n
<< " got=" << got << " ref=" << ref << "\n";
ok = false;
}
}
}
return ok;
}
static Result bench(const Impl& impl, size_t size, size_t reps) {
std::unique_ptr<char[]> buf(new char[size + 1]);
std::fill(buf.get(), buf.get() + size, 'X');
buf[size] = 0;
volatile size_t dummy = 0;
uint64_t t0 = now_ns();
for (size_t i = 0; i < reps; ++i)
dummy += impl.fn(buf.get());
uint64_t t1 = now_ns();
double ns_call = double(t1 - t0) / reps;
double gib_s = (double(size) * reps) / ((t1 - t0) * 1e-9) / (1024.0 * 1024.0 * 1024.0);
(void)dummy;
return {impl.name, ns_call, gib_s};
}
int main() {
auto impls = get_impls();
std::cout << "Implementations:";
for (auto& i : impls) std::cout << " " << i.name;
std::cout << "\n";
if (!run_correctness(impls)) {
std::cerr << "Correctness check failed!\n";
return 1;
}
std::vector<size_t> sizes = {16, 64, 256, 1024, 4096, 1<<20};
for (size_t s : sizes) {
std::cout << "\n=== strlen(" << s << " bytes) ===\n";
for (auto& impl : impls) {
Result r = bench(impl, s, 1000000 / std::max<size_t>(1, s/16));
std::cout << impl.name << ": " << r.ns_per_call << " ns/call, "
<< r.gib_per_s << " GiB/s\n";
}
}
return 0;
} |
|
@llvm/pr-subscribers-libc Author: Schrodinger ZHU Yifan (SchrodingerZhu) ChangesThis PR creates an SVE-based implementation for strlen. Microbenchmark shows improvements against NEON when N>=64. Although both implementations fall behind glibc by a large margin, Together with the PR:
Full diff: https://github.com/llvm/llvm-project/pull/167259.diff 6 Files Affected:
diff --git a/libc/fuzzing/__support/freelist_heap_fuzz.cpp b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
index 7b7985a83c3e6..0b400cb156491 100644
--- a/libc/fuzzing/__support/freelist_heap_fuzz.cpp
+++ b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
@@ -24,7 +24,7 @@ asm(R"(
_end:
.fill 1024
__llvm_libc_heap_limit:
-)";
+)");
using LIBC_NAMESPACE::FreeListHeap;
using LIBC_NAMESPACE::inline_memset;
diff --git a/libc/fuzzing/string/CMakeLists.txt b/libc/fuzzing/string/CMakeLists.txt
index efda80b59c951..0918e92552ea7 100644
--- a/libc/fuzzing/string/CMakeLists.txt
+++ b/libc/fuzzing/string/CMakeLists.txt
@@ -40,3 +40,11 @@ add_libc_fuzzer(
DEPENDS
libc.src.strings.bcmp
)
+
+add_libc_fuzzer(
+ strlen_fuzz
+ SRCS
+ strlen_fuzz.cpp
+ DEPENDS
+ libc.src.string.strlen
+)
diff --git a/libc/fuzzing/string/strlen_fuzz.cpp b/libc/fuzzing/string/strlen_fuzz.cpp
new file mode 100644
index 0000000000000..dd72c19b7fdc7
--- /dev/null
+++ b/libc/fuzzing/string/strlen_fuzz.cpp
@@ -0,0 +1,32 @@
+//===-- strlen_fuzz.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc strlen implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/string/strlen.h"
+#include <cstdint>
+#include <cstring>
+
+// always null terminate the data
+extern "C" size_t LLVMFuzzerMutate(uint8_t *data, size_t size, size_t max_size);
+extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *data, size_t size,
+ size_t max_size, unsigned int seed) {
+ size = LLVMFuzzerMutate(data, size, max_size);
+ data[size - 1] = '\0';
+ return size;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ size_t ref = ::strlen(reinterpret_cast<const char *>(data));
+ size_t impl = LIBC_NAMESPACE::strlen(reinterpret_cast<const char *>(data));
+ if (ref != impl)
+ __builtin_trap();
+ return 0;
+}
diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h
index 87f5ccdd56e23..5d6dfec7e91e5 100644
--- a/libc/src/string/memory_utils/aarch64/inline_strlen.h
+++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h
@@ -8,14 +8,13 @@
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
+#include "src/__support/macros/properties/cpu_features.h"
+
#if defined(__ARM_NEON)
#include "src/__support/CPP/bit.h" // countr_zero
-
#include <arm_neon.h>
#include <stddef.h> // size_t
-
namespace LIBC_NAMESPACE_DECL {
-
namespace neon {
[[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
string_length(const char *src) {
@@ -45,9 +44,63 @@ string_length(const char *src) {
}
}
} // namespace neon
+} // namespace LIBC_NAMESPACE_DECL
+#endif // __ARM_NEON
-namespace string_length_impl = neon;
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+#include "src/__support/macros/optimization.h"
+#include <arm_sve.h>
+namespace LIBC_NAMESPACE_DECL {
+namespace sve {
+[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) {
+ const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src);
+ // Initialize the first-fault register to all true
+ svsetffr();
+ const svbool_t all_true = svptrue_b8(); // all true predicate
+ svbool_t cmp_zero;
+ size_t len = 0;
+ for (;;) {
+ // Read a vector's worth of bytes, stopping on first fault.
+ svuint8_t data = svldff1_u8(all_true, ptr);
+ svbool_t fault_mask = svrdffr_z(all_true);
+ bool has_no_fault = svptest_last(all_true, fault_mask);
+ if (LIBC_LIKELY(has_no_fault)) {
+ // First fault did not fail: the whole vector is valid.
+ // Avoid depending on the contents of FFR beyond the branch.
+ len += svcntb(); // speculative increment
+ cmp_zero = svcmpeq_n_u8(all_true, data, 0);
+ bool has_no_zero = !svptest_any(all_true, cmp_zero);
+ if (LIBC_LIKELY(has_no_zero))
+ continue;
+ len -= svcntb(); // undo speculative increment
+ break;
+ } else {
+ // First fault failed: only some of the vector is valid.
+ // Perform the comparison only on the valid bytes.
+ cmp_zero = svcmpeq_n_u8(fault_mask, data, 0);
+ bool has_zero = svptest_any(fault_mask, cmp_zero);
+ if (LIBC_LIKELY(has_zero))
+ break;
+ svsetffr();
+ len += svcntp_b8(all_true, fault_mask);
+ continue;
+ }
+ }
+ // Select the bytes before the first and count them.
+ svbool_t before_zero = svbrkb_z(all_true, cmp_zero);
+ len += svcntp_b8(all_true, before_zero);
+ return len;
+}
+} // namespace sve
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LIBC_TARGET_CPU_HAS_SVE
+
+namespace LIBC_NAMESPACE_DECL {
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+namespace string_length_impl = sve;
+#elif defined(__ARM_NEON)
+namespace string_length_impl = neon;
+#endif
} // namespace LIBC_NAMESPACE_DECL
-#endif // __ARM_NEON
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index cbce62ead0328..c4984883addb7 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -22,9 +22,17 @@
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/properties/cpu_features.h"
#include "src/string/memory_utils/inline_memcpy.h"
-#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
+#if !defined(LIBC_TARGET_CPU_HAS_SVE)
+#error "SVE is not supported on this CPU"
+#endif
+
+// SVE implementation has fault safety
+#if defined(LIBC_TARGET_CPU_HAS_SVE)
+#include "src/string/memory_utils/aarch64/inline_strlen.h"
+#elif defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
#if LIBC_HAS_VECTOR_TYPE
#include "src/string/memory_utils/generic/inline_strlen.h"
#elif defined(LIBC_TARGET_ARCH_IS_X86)
@@ -33,8 +41,8 @@
#include "src/string/memory_utils/aarch64/inline_strlen.h"
#else
namespace string_length_impl = LIBC_NAMESPACE::wide_read;
-#endif
-#endif // defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
+#endif // LIBC_TARGET_CPU_HAS_SVE
+#endif // defined(LIBC_TARGET_CPU_HAS_SVE)
namespace LIBC_NAMESPACE_DECL {
namespace internal {
diff --git a/libc/test/src/string/strlen_test.cpp b/libc/test/src/string/strlen_test.cpp
index 4eb9d47e9209d..784dd7b194b3f 100644
--- a/libc/test/src/string/strlen_test.cpp
+++ b/libc/test/src/string/strlen_test.cpp
@@ -22,3 +22,15 @@ TEST(LlvmLibcStrLenTest, AnyString) {
size_t result = LIBC_NAMESPACE::strlen(any);
ASSERT_EQ((size_t)12, result);
}
+
+TEST(LlvmLibcStrLenTest, DataAfterNulString) {
+ constexpr char A[10] = {'a', 'b', 'c', 'd', 'e', 'f', 0, 'h', 'i', 'j'};
+ size_t result = LIBC_NAMESPACE::strlen(A);
+ ASSERT_EQ((size_t)6, result);
+}
+
+TEST(LlvmLibcStrLenTest, MultipleNulsInOneWord) {
+ constexpr char A[10] = {'a', 'b', 0, 'd', 'e', 'f', 0, 'h', 'i', 'j'};
+ size_t result = LIBC_NAMESPACE::strlen(A);
+ ASSERT_EQ((size_t)2, result);
+}
|
90aae3f to
de76b6a
Compare
This PR creates an SVE-based implementation for strlen by translating from the AOR code in tree. Microbenchmark shows improvements against NEON when N>=64. Although both implementations fall behind glibc by a large margin,
this may be a good start point to explore SVE implementations.
Together with the PR: