Skip to content

Conversation

@SchrodingerZhu
Copy link
Contributor

@SchrodingerZhu SchrodingerZhu commented Nov 10, 2025

This PR creates an SVE-based implementation for strlen by translating from the AOR code in tree. Microbenchmark shows improvements against NEON when N>=64. Although both implementations fall behind glibc by a large margin,
this may be a good start point to explore SVE implementations.

Together with the PR:

  1. Added two more tests of strlen with special nul symbols.
  2. Added strlen's fuzzer and fix a typo in previous heap fuzzer.
=== strlen(16 bytes) ===
libc: 1.56115 ns/call, 9.54499 GiB/s
neon: 1.59393 ns/call, 9.34867 GiB/s
sve: 1.66097 ns/call, 8.97134 GiB/s

=== strlen(64 bytes) ===
libc: 2.06967 ns/call, 28.7991 GiB/s
neon: 2.59914 ns/call, 22.9325 GiB/s
sve: 2.58628 ns/call, 23.0465 GiB/s

=== strlen(256 bytes) ===
libc: 3.74165 ns/call, 63.7202 GiB/s
neon: 8.98243 ns/call, 26.5428 GiB/s
sve: 7.36426 ns/call, 32.3751 GiB/s

=== strlen(1024 bytes) ===
libc: 10.5327 ns/call, 90.5438 GiB/s
neon: 34.363 ns/call, 27.7529 GiB/s
sve: 26.9329 ns/call, 35.4092 GiB/s

=== strlen(4096 bytes) ===
libc: 37.7304 ns/call, 101.104 GiB/s
neon: 145.911 ns/call, 26.144 GiB/s
sve: 103.208 ns/call, 36.9612 GiB/s

=== strlen(1048576 bytes) ===
libc: 9623.4 ns/call, 101.478 GiB/s
neon: 36138.2 ns/call, 27.023 GiB/s
sve: 26605.6 ns/call, 36.7051 GiB/s

@llvmbot llvmbot added the libc label Nov 10, 2025
@SchrodingerZhu
Copy link
Contributor Author

SchrodingerZhu commented Nov 10, 2025

Microbenchmark is slightly modified from AI generated code:

// -*- C++ -*-
// Standalone SVE/NEON/libc strlen microbenchmark (always registers all)

#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <tuple>
#include <vector>

#include <arm_neon.h>
#include <arm_sve.h>

#define LIBC_LIKELY(x)   __builtin_expect(!!(x), 1)
#define LIBC_UNLIKELY(x) __builtin_expect(!!(x), 0)

// -----------------------------------------------------------------------------
// NEON implementation
// -----------------------------------------------------------------------------
namespace neon {
[[maybe_unused]] static inline size_t string_length(const char* src) {
  using Vector __attribute__((may_alias)) = uint8x8_t;

  uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
  const Vector* block_ptr =
      reinterpret_cast<const Vector*>(src - misalign_bytes);

  Vector v = *block_ptr;
  Vector vcmp = vceqz_u8(v);
  uint64x1_t cmp_mask = vreinterpret_u64_u8(vcmp);
  uint64_t cmp = vget_lane_u64(cmp_mask, 0);
  cmp >>= (misalign_bytes << 3);
  if (cmp) return __builtin_ctzll(cmp) >> 3;

  while (true) {
    ++block_ptr;
    v = *block_ptr;
    vcmp = vceqz_u8(v);
    cmp_mask = vreinterpret_u64_u8(vcmp);
    cmp = vget_lane_u64(vcmp, 0);
    if (cmp) {
      size_t base = reinterpret_cast<uintptr_t>(block_ptr) -
                    reinterpret_cast<uintptr_t>(src);
      return base + ((__builtin_ctzll(cmp)) >> 3);
    }
  }
}
} // namespace neon

// -----------------------------------------------------------------------------
// SVE implementation
// -----------------------------------------------------------------------------
namespace sve {
[[maybe_unused]] static inline size_t string_length(const char* src) {
  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(src);
  svsetffr();
  const svbool_t all_true = svptrue_b8();
  svbool_t cmp_zero;
  size_t len = 0;

  for (;;) {
    svuint8_t data = svldff1_u8(all_true, &ptr[len]);
    svbool_t fault_mask = svrdffr_z(all_true);
    bool no_fault = svptest_last(all_true, fault_mask);
    if (LIBC_LIKELY(no_fault)) {
      len += svcntb();
      cmp_zero = svcmpeq_n_u8(all_true, data, 0);
      bool has_no_zero = !svptest_any(all_true, cmp_zero);
      if (LIBC_LIKELY(has_no_zero))
        continue;
      len -= svcntb();
      break;
    } else {
      cmp_zero = svcmpeq_n_u8(fault_mask, data, 0);
      bool has_zero = svptest_any(fault_mask, cmp_zero);
      if (LIBC_LIKELY(has_zero)) break;
      len += svcntp_b8(all_true, fault_mask);
      svsetffr();
      continue;
    }
  }
  svbool_t before_zero = svbrkb_z(all_true, cmp_zero);
  len += svcntp_b8(all_true, before_zero);
  return len;
}
} // namespace sve

// -----------------------------------------------------------------------------
// libc fallback
// -----------------------------------------------------------------------------
namespace syslibc {
static inline size_t string_length(const char* s) { return std::strlen(s); }
} // namespace syslibc

// -----------------------------------------------------------------------------
// Benchmark harness
// -----------------------------------------------------------------------------
struct Impl {
  const char* name;
  size_t (*fn)(const char*);
};

static std::vector<Impl> get_impls() {
  return {
    {"libc", &syslibc::string_length},
    {"neon", &neon::string_length},
    {"sve",  &sve::string_length}
  };
}

struct Result {
  std::string name;
  double ns_per_call;
  double gib_per_s;
};

static inline uint64_t now_ns() {
  using clock = std::chrono::steady_clock;
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             clock::now().time_since_epoch()).count();
}

// correctness check
static bool run_correctness(const std::vector<Impl>& impls) {
  bool ok = true;
  std::vector<size_t> sizes = {0,1,3,7,8,9,15,16,31,32,63,64,127,128,255,256,511,512,1023,1024,4096};
  for (size_t n : sizes) {
    std::unique_ptr<char[]> s(new char[n + 2]);
    std::fill(s.get(), s.get() + n, 'A');
    s[n] = 0;
    size_t ref = syslibc::string_length(s.get());
    for (auto& impl : impls) {
      size_t got = impl.fn(s.get());
      if (got != ref) {
        std::cerr << "FAIL " << impl.name << " len=" << n
                  << " got=" << got << " ref=" << ref << "\n";
        ok = false;
      }
    }
  }
  return ok;
}

static Result bench(const Impl& impl, size_t size, size_t reps) {
  std::unique_ptr<char[]> buf(new char[size + 1]);
  std::fill(buf.get(), buf.get() + size, 'X');
  buf[size] = 0;

  volatile size_t dummy = 0;
  uint64_t t0 = now_ns();
  for (size_t i = 0; i < reps; ++i)
    dummy += impl.fn(buf.get());
  uint64_t t1 = now_ns();
  double ns_call = double(t1 - t0) / reps;
  double gib_s = (double(size) * reps) / ((t1 - t0) * 1e-9) / (1024.0 * 1024.0 * 1024.0);
  (void)dummy;
  return {impl.name, ns_call, gib_s};
}

int main() {
  auto impls = get_impls();
  std::cout << "Implementations:";
  for (auto& i : impls) std::cout << " " << i.name;
  std::cout << "\n";

  if (!run_correctness(impls)) {
    std::cerr << "Correctness check failed!\n";
    return 1;
  }

  std::vector<size_t> sizes = {16, 64, 256, 1024, 4096, 1<<20};
  for (size_t s : sizes) {
    std::cout << "\n=== strlen(" << s << " bytes) ===\n";
    for (auto& impl : impls) {
      Result r = bench(impl, s, 1000000 / std::max<size_t>(1, s/16));
      std::cout << impl.name << ": " << r.ns_per_call << " ns/call, "
                << r.gib_per_s << " GiB/s\n";
    }
  }
  return 0;
}

@llvmbot
Copy link
Member

llvmbot commented Nov 10, 2025

@llvm/pr-subscribers-libc

Author: Schrodinger ZHU Yifan (SchrodingerZhu)

Changes

This PR creates an SVE-based implementation for strlen. Microbenchmark shows improvements against NEON when N>=64. Although both implementations fall behind glibc by a large margin,
this may be a good start point to explore SVE implementations.

Together with the PR:

  1. Added two more tests of strlen with special nul symbols.
  2. Added strlen's fuzzer and fix a typo in previous heap fuzzer.
=== strlen(16 bytes) ===
libc: 1.56115 ns/call, 9.54499 GiB/s
neon: 1.59393 ns/call, 9.34867 GiB/s
sve: 1.66097 ns/call, 8.97134 GiB/s

=== strlen(64 bytes) ===
libc: 2.06967 ns/call, 28.7991 GiB/s
neon: 2.59914 ns/call, 22.9325 GiB/s
sve: 2.58628 ns/call, 23.0465 GiB/s

=== strlen(256 bytes) ===
libc: 3.74165 ns/call, 63.7202 GiB/s
neon: 8.98243 ns/call, 26.5428 GiB/s
sve: 7.36426 ns/call, 32.3751 GiB/s

=== strlen(1024 bytes) ===
libc: 10.5327 ns/call, 90.5438 GiB/s
neon: 34.363 ns/call, 27.7529 GiB/s
sve: 26.9329 ns/call, 35.4092 GiB/s

=== strlen(4096 bytes) ===
libc: 37.7304 ns/call, 101.104 GiB/s
neon: 145.911 ns/call, 26.144 GiB/s
sve: 103.208 ns/call, 36.9612 GiB/s

=== strlen(1048576 bytes) ===
libc: 9623.4 ns/call, 101.478 GiB/s
neon: 36138.2 ns/call, 27.023 GiB/s
sve: 26605.6 ns/call, 36.7051 GiB/s

Full diff: https://github.com/llvm/llvm-project/pull/167259.diff

6 Files Affected:

  • (modified) libc/fuzzing/__support/freelist_heap_fuzz.cpp (+1-1)
  • (modified) libc/fuzzing/string/CMakeLists.txt (+8)
  • (added) libc/fuzzing/string/strlen_fuzz.cpp (+32)
  • (modified) libc/src/string/memory_utils/aarch64/inline_strlen.h (+58-5)
  • (modified) libc/src/string/string_utils.h (+11-3)
  • (modified) libc/test/src/string/strlen_test.cpp (+12)
diff --git a/libc/fuzzing/__support/freelist_heap_fuzz.cpp b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
index 7b7985a83c3e6..0b400cb156491 100644
--- a/libc/fuzzing/__support/freelist_heap_fuzz.cpp
+++ b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
@@ -24,7 +24,7 @@ asm(R"(
 _end:
   .fill 1024
 __llvm_libc_heap_limit:
-)";
+)");
 
 using LIBC_NAMESPACE::FreeListHeap;
 using LIBC_NAMESPACE::inline_memset;
diff --git a/libc/fuzzing/string/CMakeLists.txt b/libc/fuzzing/string/CMakeLists.txt
index efda80b59c951..0918e92552ea7 100644
--- a/libc/fuzzing/string/CMakeLists.txt
+++ b/libc/fuzzing/string/CMakeLists.txt
@@ -40,3 +40,11 @@ add_libc_fuzzer(
   DEPENDS
     libc.src.strings.bcmp
 )
+
+add_libc_fuzzer(
+  strlen_fuzz
+  SRCS
+    strlen_fuzz.cpp
+  DEPENDS
+    libc.src.string.strlen
+)
diff --git a/libc/fuzzing/string/strlen_fuzz.cpp b/libc/fuzzing/string/strlen_fuzz.cpp
new file mode 100644
index 0000000000000..dd72c19b7fdc7
--- /dev/null
+++ b/libc/fuzzing/string/strlen_fuzz.cpp
@@ -0,0 +1,32 @@
+//===-- strlen_fuzz.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc strlen implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/string/strlen.h"
+#include <cstdint>
+#include <cstring>
+
+// always null terminate the data
+extern "C" size_t LLVMFuzzerMutate(uint8_t *data, size_t size, size_t max_size);
+extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *data, size_t size,
+                                          size_t max_size, unsigned int seed) {
+  size = LLVMFuzzerMutate(data, size, max_size);
+  data[size - 1] = '\0';
+  return size;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  size_t ref = ::strlen(reinterpret_cast<const char *>(data));
+  size_t impl = LIBC_NAMESPACE::strlen(reinterpret_cast<const char *>(data));
+  if (ref != impl)
+    __builtin_trap();
+  return 0;
+}
diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h
index 87f5ccdd56e23..5d6dfec7e91e5 100644
--- a/libc/src/string/memory_utils/aarch64/inline_strlen.h
+++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h
@@ -8,14 +8,13 @@
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
 
+#include "src/__support/macros/properties/cpu_features.h"
+
 #if defined(__ARM_NEON)
 #include "src/__support/CPP/bit.h" // countr_zero
-
 #include <arm_neon.h>
 #include <stddef.h> // size_t
-
 namespace LIBC_NAMESPACE_DECL {
-
 namespace neon {
 [[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
 string_length(const char *src) {
@@ -45,9 +44,63 @@ string_length(const char *src) {
   }
 }
 } // namespace neon
+} // namespace LIBC_NAMESPACE_DECL
+#endif // __ARM_NEON
 
-namespace string_length_impl = neon;
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+#include "src/__support/macros/optimization.h"
+#include <arm_sve.h>
+namespace LIBC_NAMESPACE_DECL {
+namespace sve {
+[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) {
+  const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src);
+  // Initialize the first-fault register to all true
+  svsetffr();
+  const svbool_t all_true = svptrue_b8(); // all true predicate
+  svbool_t cmp_zero;
+  size_t len = 0;
 
+  for (;;) {
+    // Read a vector's worth of bytes, stopping on first fault.
+    svuint8_t data = svldff1_u8(all_true, ptr);
+    svbool_t fault_mask = svrdffr_z(all_true);
+    bool has_no_fault = svptest_last(all_true, fault_mask);
+    if (LIBC_LIKELY(has_no_fault)) {
+      // First fault did not fail: the whole vector is valid.
+      // Avoid depending on the contents of FFR beyond the branch.
+      len += svcntb(); // speculative increment
+      cmp_zero = svcmpeq_n_u8(all_true, data, 0);
+      bool has_no_zero = !svptest_any(all_true, cmp_zero);
+      if (LIBC_LIKELY(has_no_zero))
+        continue;
+      len -= svcntb(); // undo speculative increment
+      break;
+    } else {
+      // First fault failed: only some of the vector is valid.
+      // Perform the comparison only on the valid bytes.
+      cmp_zero = svcmpeq_n_u8(fault_mask, data, 0);
+      bool has_zero = svptest_any(fault_mask, cmp_zero);
+      if (LIBC_LIKELY(has_zero))
+        break;
+      svsetffr();
+      len += svcntp_b8(all_true, fault_mask);
+      continue;
+    }
+  }
+  // Select the bytes before the first and count them.
+  svbool_t before_zero = svbrkb_z(all_true, cmp_zero);
+  len += svcntp_b8(all_true, before_zero);
+  return len;
+}
+} // namespace sve
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LIBC_TARGET_CPU_HAS_SVE
+
+namespace LIBC_NAMESPACE_DECL {
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+namespace string_length_impl = sve;
+#elif defined(__ARM_NEON)
+namespace string_length_impl = neon;
+#endif
 } // namespace LIBC_NAMESPACE_DECL
-#endif // __ARM_NEON
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index cbce62ead0328..c4984883addb7 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -22,9 +22,17 @@
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/properties/cpu_features.h"
 #include "src/string/memory_utils/inline_memcpy.h"
 
-#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
+#if !defined(LIBC_TARGET_CPU_HAS_SVE)
+#error "SVE is not supported on this CPU"
+#endif
+
+// SVE implementation has fault safety
+#if defined(LIBC_TARGET_CPU_HAS_SVE)
+#include "src/string/memory_utils/aarch64/inline_strlen.h"
+#elif defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
 #if LIBC_HAS_VECTOR_TYPE
 #include "src/string/memory_utils/generic/inline_strlen.h"
 #elif defined(LIBC_TARGET_ARCH_IS_X86)
@@ -33,8 +41,8 @@
 #include "src/string/memory_utils/aarch64/inline_strlen.h"
 #else
 namespace string_length_impl = LIBC_NAMESPACE::wide_read;
-#endif
-#endif // defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
+#endif // LIBC_TARGET_CPU_HAS_SVE
+#endif // defined(LIBC_TARGET_CPU_HAS_SVE)
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
diff --git a/libc/test/src/string/strlen_test.cpp b/libc/test/src/string/strlen_test.cpp
index 4eb9d47e9209d..784dd7b194b3f 100644
--- a/libc/test/src/string/strlen_test.cpp
+++ b/libc/test/src/string/strlen_test.cpp
@@ -22,3 +22,15 @@ TEST(LlvmLibcStrLenTest, AnyString) {
   size_t result = LIBC_NAMESPACE::strlen(any);
   ASSERT_EQ((size_t)12, result);
 }
+
+TEST(LlvmLibcStrLenTest, DataAfterNulString) {
+  constexpr char A[10] = {'a', 'b', 'c', 'd', 'e', 'f', 0, 'h', 'i', 'j'};
+  size_t result = LIBC_NAMESPACE::strlen(A);
+  ASSERT_EQ((size_t)6, result);
+}
+
+TEST(LlvmLibcStrLenTest, MultipleNulsInOneWord) {
+  constexpr char A[10] = {'a', 'b', 0, 'd', 'e', 'f', 0, 'h', 'i', 'j'};
+  size_t result = LIBC_NAMESPACE::strlen(A);
+  ASSERT_EQ((size_t)2, result);
+}

@SchrodingerZhu SchrodingerZhu merged commit 8751f26 into llvm:main Nov 11, 2025
20 checks passed
@SchrodingerZhu SchrodingerZhu deleted the libc/aarch64-scalable-strlen branch November 11, 2025 00:15
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants