774 changes: 0 additions & 774 deletions libc/src/string/memory_utils/elements.h

This file was deleted.

130 changes: 0 additions & 130 deletions libc/src/string/memory_utils/elements_aarch64.h

This file was deleted.

189 changes: 0 additions & 189 deletions libc/src/string/memory_utils/elements_x86.h

This file was deleted.

167 changes: 100 additions & 67 deletions libc/src/string/memory_utils/memcmp_implementations.h
@@ -11,92 +11,125 @@

#include "src/__support/architectures.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t

namespace __llvm_libc {

static inline int inline_memcmp(const char *lhs, const char *rhs,
size_t count) {
#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
static inline MemcmpReturnType inline_memcmp_generic_gt16(CPtr p1, CPtr p2,
size_t count) {
if (unlikely(count >= 384)) {
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)

#if defined(LLVM_LIBC_ARCH_X86)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////
using namespace __llvm_libc::x86;
if (count == 0)
return 0;
if (count == 1)
return three_way_compare<_1>(lhs, rhs);
if (count == 2)
return three_way_compare<_2>(lhs, rhs);
if (count == 3)
return three_way_compare<_3>(lhs, rhs);
if (count <= 8)
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count <= 16)
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
static inline MemcmpReturnType inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2,
size_t count) {
if (unlikely(count >= 384)) {
if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
}

static inline MemcmpReturnType inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2,
size_t count) {
if (count <= 32)
return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
using namespace ::__llvm_libc::aarch64;
if (count == 0) // [0, 0]
return 0;
if (count == 1) // [1, 1]
return three_way_compare<_1>(lhs, rhs);
if (count == 2) // [2, 2]
return three_way_compare<_2>(lhs, rhs);
if (count == 3) // [3, 3]
return three_way_compare<_3>(lhs, rhs);
if (count < 8) // [4, 7]
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count < 16) // [8, 15]
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
if (unlikely(count >= 128)) // [128, ∞]
return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
if (!equals<_16>(lhs, rhs)) // [16, 16]
return three_way_compare<_16>(lhs, rhs);
return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
if (unlikely(count >= 384)) {
if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
return value;
align_to_next_boundary<32, Arg::P1>(p1, p2, count);
}
return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
}

static inline MemcmpReturnType inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2,
size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
if (unlikely(count >= 384)) {
if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86)

#if defined(LLVM_LIBC_ARCH_AARCH64)
static inline MemcmpReturnType inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2,
size_t count) {
if (unlikely(count >= 128)) { // [128, ∞]
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
}
if (count < 32) // [17, 31]
return three_way_compare<Tail<_16>>(lhs, rhs, count);
if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
return generic::Memcmp<16>::tail(p1, p2, count);
if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
if (count < 64) // [33, 63]
return three_way_compare<Tail<_32>>(lhs, rhs, count);
return generic::Memcmp<32>::tail(p1, p2, count);
// [64, 127]
return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
using namespace ::__llvm_libc::scalar;
return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
}
#endif // defined(LLVM_LIBC_ARCH_AARCH64)

static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
if (count == 0)
return 0;
return MemcmpReturnType::ZERO();
if (count == 1)
return three_way_compare<_1>(lhs, rhs);
return generic::Memcmp<1>::block(p1, p2);
if (count == 2)
return three_way_compare<_2>(lhs, rhs);
return generic::Memcmp<2>::block(p1, p2);
if (count == 3)
return three_way_compare<_3>(lhs, rhs);
return generic::Memcmp<3>::block(p1, p2);
if (count <= 8)
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
return generic::Memcmp<4>::head_tail(p1, p2, count);
if (count <= 16)
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
if (count <= 32)
return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
if (count <= 64)
return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
if (count <= 128)
return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
return generic::Memcmp<8>::head_tail(p1, p2, count);
#if defined(LLVM_LIBC_ARCH_X86)
if constexpr (x86::kAvx512BW)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
else if constexpr (x86::kAvx2)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
else if constexpr (x86::kSse2)
return inline_memcmp_x86_sse2_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
if constexpr (aarch64::kNeon)
return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
#endif
#elif defined(LLVM_LIBC_ARCH_ARM)
if (count == 0)
return MemcmpReturnType::ZERO();
return generic::Memcmp<1>::loop(p1, p2, count);
#else
#error "Unsupported platform"
#endif
}

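[Editor's note] The head_tail pattern used throughout this file relies on
overlapping windows. A minimal sketch under that reading, using plain scalar
loads; the names memcmp8_head_tail and cmp8 are illustrative, not part of the
patch:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // For any count in (8, 16], comparing the first 8 and the last 8 bytes
    // inspects every byte, because the two windows overlap when count < 16.
    static int memcmp8_head_tail(const char *p1, const char *p2,
                                 std::size_t count) {
      auto cmp8 = [](const char *a, const char *b) {
        std::uint64_t x, y;
        std::memcpy(&x, a, 8);
        std::memcpy(&y, b, 8);
        return x == y ? 0 : std::memcmp(a, b, 8); // defer to bytewise order
      };
      if (int head = cmp8(p1, p2))
        return head;
      return cmp8(p1 + count - 8, p2 + count - 8); // tail window
    }
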
122 changes: 45 additions & 77 deletions libc/src/string/memory_utils/memcpy_implementations.h
@@ -11,7 +11,8 @@

#include "src/__support/architectures.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t
@@ -41,112 +42,79 @@ static inline void inline_memcpy(char *__restrict dst,
const char *__restrict src, size_t count) {
using namespace __llvm_libc::builtin;
#if defined(LLVM_LIBC_ARCH_X86)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////
// Whether to use rep;movsb exclusively, not at all, or only above a certain
// threshold.
// TODO: Use only a single preprocessor definition to simplify the code.
#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
#endif

// Whether to use only rep;movsb.
constexpr bool USE_ONLY_REP_MOVSB =
static constexpr bool kUseOnlyRepMovsb =
LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);

// kRepMovsBSize == -1 : Only CopyAligned is used.
// kRepMovsBSize == 0 : Only RepMovsb is used.
// else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
constexpr size_t REP_MOVS_B_SIZE =
#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
static constexpr size_t kRepMovsbThreshold =
LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
#else
-1;
#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE

// Whether target supports AVX instructions.
constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);

#if defined(__AVX__)
using LoopBlockSize = _64;
#else
using LoopBlockSize = _32;
#endif

if (USE_ONLY_REP_MOVSB)
return copy<x86::Accelerator>(dst, src, count);
if constexpr (kUseOnlyRepMovsb)
return x86::Memcpy::repmovsb(dst, src, count);

if (count == 0)
return;
if (count == 1)
return copy<_1>(dst, src);
return Memcpy<1>::block(dst, src);
if (count == 2)
return copy<_2>(dst, src);
return Memcpy<2>::block(dst, src);
if (count == 3)
return copy<_3>(dst, src);
return Memcpy<3>::block(dst, src);
if (count == 4)
return copy<_4>(dst, src);
return Memcpy<4>::block(dst, src);
if (count < 8)
return copy<HeadTail<_4>>(dst, src, count);
return Memcpy<4>::head_tail(dst, src, count);
if (count < 16)
return copy<HeadTail<_8>>(dst, src, count);
return Memcpy<8>::head_tail(dst, src, count);
if (count < 32)
return copy<HeadTail<_16>>(dst, src, count);
return Memcpy<16>::head_tail(dst, src, count);
if (count < 64)
return copy<HeadTail<_32>>(dst, src, count);
return Memcpy<32>::head_tail(dst, src, count);
if (count < 128)
return copy<HeadTail<_64>>(dst, src, count);
if (HAS_AVX && count < 256)
return copy<HeadTail<_128>>(dst, src, count);
if (count <= REP_MOVS_B_SIZE)
return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
count);
return copy<x86::Accelerator>(dst, src, count);
return Memcpy<64>::head_tail(dst, src, count);
if (x86::kAvx && count < 256)
return Memcpy<128>::head_tail(dst, src, count);
if (count <= kRepMovsbThreshold) {
Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
return Memcpy<(x86::kAvx ? 64 : 32)>::loop_and_tail(dst, src, count);
}
return x86::Memcpy::repmovsb(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
if (count == 0)
return;
if (count == 1)
return copy<_1>(dst, src);
return Memcpy<1>::block(dst, src);
if (count == 2)
return copy<_2>(dst, src);
return Memcpy<2>::block(dst, src);
if (count == 3)
return copy<_3>(dst, src);
return Memcpy<3>::block(dst, src);
if (count == 4)
return copy<_4>(dst, src);
return Memcpy<4>::block(dst, src);
if (count < 8)
return copy<HeadTail<_4>>(dst, src, count);
return Memcpy<4>::head_tail(dst, src, count);
if (count < 16)
return copy<HeadTail<_8>>(dst, src, count);
return Memcpy<8>::head_tail(dst, src, count);
if (count < 32)
return copy<HeadTail<_16>>(dst, src, count);
return Memcpy<16>::head_tail(dst, src, count);
if (count < 64)
return copy<HeadTail<_32>>(dst, src, count);
return Memcpy<32>::head_tail(dst, src, count);
if (count < 128)
return copy<HeadTail<_64>>(dst, src, count);
return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
return Memcpy<64>::head_tail(dst, src, count);
Memcpy<16>::block(dst, src);
align_to_next_boundary<16, Arg::Src>(dst, src, count);
return Memcpy<64>::loop_and_tail(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_ARM)
if (count == 0)
return;
if (count == 1)
return copy<_1>(dst, src);
if (count == 2)
return copy<_2>(dst, src);
if (count == 3)
return copy<_3>(dst, src);
if (count == 4)
return copy<_4>(dst, src);
if (count < 8)
return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
return copy<HeadTail<_64>>(dst, src, count);
return copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
return generic::Memcpy<1>::loop(dst, src, count);
#else
#error "Unsupported platform"
#endif
}

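[Editor's note] A sketch of the align-then-loop shape used above for large
copies, assuming count >= 64; the helper name copy_aligned_32 is illustrative:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // One unaligned head block, then aligned 32-byte blocks, then one
    // overlapping tail block: this mirrors Memcpy<32>::block +
    // align_to_next_boundary<32, Arg::Dst> + loop_and_tail as read above.
    static void copy_aligned_32(char *dst, const char *src,
                                std::size_t count) {
      std::memcpy(dst, src, 32); // unaligned head
      const std::size_t adv =
          32 - (reinterpret_cast<std::uintptr_t>(dst) & 31);
      dst += adv;
      src += adv;
      count -= adv; // dst is now 32-byte aligned
      std::size_t offset = 0;
      do {
        std::memcpy(dst + offset, src + offset, 32);
        offset += 32;
      } while (offset < count - 32);
      std::memcpy(dst + count - 32, src + count - 32, 32); // overlapping tail
    }
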
99 changes: 43 additions & 56 deletions libc/src/string/memory_utils/memset_implementations.h
@@ -10,7 +10,9 @@
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H

#include "src/__support/architectures.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t
@@ -48,88 +50,73 @@ namespace __llvm_libc {
// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
// superior for sizes that mattered.
inline static void inline_memset(char *dst, unsigned char value, size_t count) {
inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////
using namespace __llvm_libc::x86;
static constexpr size_t kMaxSize = x86::kAvx512F ? 64
: x86::kAvx ? 32
: x86::kSse2 ? 16
: 8;
if (count == 0)
return;
if (count == 1)
return splat_set<_1>(dst, value);
return generic::Memset<1, kMaxSize>::block(dst, value);
if (count == 2)
return splat_set<_2>(dst, value);
return generic::Memset<2, kMaxSize>::block(dst, value);
if (count == 3)
return splat_set<_3>(dst, value);
return generic::Memset<3, kMaxSize>::block(dst, value);
if (count <= 8)
return splat_set<HeadTail<_4>>(dst, value, count);
return generic::Memset<4, kMaxSize>::head_tail(dst, value, count);
if (count <= 16)
return splat_set<HeadTail<_8>>(dst, value, count);
return generic::Memset<8, kMaxSize>::head_tail(dst, value, count);
if (count <= 32)
return splat_set<HeadTail<_16>>(dst, value, count);
return generic::Memset<16, kMaxSize>::head_tail(dst, value, count);
if (count <= 64)
return splat_set<HeadTail<_32>>(dst, value, count);
return generic::Memset<32, kMaxSize>::head_tail(dst, value, count);
if (count <= 128)
return splat_set<HeadTail<_64>>(dst, value, count);
return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
return generic::Memset<64, kMaxSize>::head_tail(dst, value, count);
// Aligned loop
generic::Memset<32, kMaxSize>::block(dst, value);
align_to_next_boundary<32>(dst, count);
return generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
using namespace __llvm_libc::aarch64_memset;
static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
if (count == 0)
return;
if (count <= 3) {
splat_set<_1>(dst, value);
generic::Memset<1, kMaxSize>::block(dst, value);
if (count > 1)
splat_set<Tail<_2>>(dst, value, count);
generic::Memset<2, kMaxSize>::tail(dst, value, count);
return;
}
if (count <= 8)
return splat_set<HeadTail<_4>>(dst, value, count);
return generic::Memset<4, kMaxSize>::head_tail(dst, value, count);
if (count <= 16)
return splat_set<HeadTail<_8>>(dst, value, count);
return generic::Memset<8, kMaxSize>::head_tail(dst, value, count);
if (count <= 32)
return splat_set<HeadTail<_16>>(dst, value, count);
return generic::Memset<16, kMaxSize>::head_tail(dst, value, count);
if (count <= (32 + 64)) {
splat_set<_32>(dst, value);
generic::Memset<32, kMaxSize>::block(dst, value);
if (count <= 64)
return splat_set<Tail<_32>>(dst, value, count);
splat_set<Skip<32>::Then<_32>>(dst, value);
splat_set<Tail<_32>>(dst, value, count);
return generic::Memset<32, kMaxSize>::tail(dst, value, count);
generic::Memset<32, kMaxSize>::block(dst + 32, value);
generic::Memset<32, kMaxSize>::tail(dst, value, count);
return;
}
if (count >= 448 && value == 0 && hasZva())
return splat_set<Align<_64, Arg::_1>::Then<Loop<Zva64, _64>>>(dst, 0,
count);
else
return splat_set<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
using namespace ::__llvm_libc::scalar;

if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
generic::Memset<64, kMaxSize>::block(dst, 0);
align_to_next_boundary<64>(dst, count);
return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
} else {
generic::Memset<16, kMaxSize>::block(dst, value);
align_to_next_boundary<16>(dst, count);
return generic::Memset<64, kMaxSize>::loop_and_tail(dst, value, count);
}
#elif defined(LLVM_LIBC_ARCH_ARM)
if (count == 0)
return;
if (count == 1)
return splat_set<_1>(dst, value);
if (count == 2)
return splat_set<_2>(dst, value);
if (count == 3)
return splat_set<_3>(dst, value);
if (count <= 8)
return splat_set<HeadTail<_4>>(dst, value, count);
if (count <= 16)
return splat_set<HeadTail<_8>>(dst, value, count);
if (count <= 32)
return splat_set<HeadTail<_16>>(dst, value, count);
if (count <= 64)
return splat_set<HeadTail<_32>>(dst, value, count);
if (count <= 128)
return splat_set<HeadTail<_64>>(dst, value, count);
return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
return generic::Memset<1, 1>::loop(dst, value, count);
#else
#error "Unsupported platform"
#endif
}

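[Editor's note] The aarch64 count <= 3 branch above composes a 1-byte block
with a possibly overlapping 2-byte tail. A minimal illustration; the function
name memset_le3 is hypothetical:

    #include <cstddef>
    #include <cstdint>

    // Covers counts 1, 2 and 3 with at most three stores; the tail stores
    // may overwrite dst[0], which is harmless for memset.
    static void memset_le3(char *dst, std::uint8_t value, std::size_t count) {
      dst[0] = static_cast<char>(value); // count == 1
      if (count > 1) {                   // count == 2 or 3
        dst[count - 2] = static_cast<char>(value);
        dst[count - 1] = static_cast<char>(value);
      }
    }
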
174 changes: 174 additions & 0 deletions libc/src/string/memory_utils/op_aarch64.h
@@ -0,0 +1,174 @@
//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_AARCH64)

#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif //__ARM_NEON

namespace __llvm_libc::aarch64 {

static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

template <size_t Size> struct BzeroCacheLine {
static constexpr size_t SIZE = Size;

static inline void block(Ptr dst, uint8_t) {
static_assert(Size == 64);
#if __SIZEOF_POINTER__ == 4
asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
}

static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
// Unaligned store, we can't use 'dc zva' here.
static constexpr size_t kMaxSize = kNeon ? 16 : 8;
generic::Memset<Size, kMaxSize>::tail(dst, value, count);
}
};

inline static bool hasZva() {
uint64_t zva_val;
asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
// DC ZVA is permitted if DZP, bit [4] is zero.
// BS, bits [3:0] is log2 of the block size in words.
// So the next line checks whether the instruction is permitted and the
// block size is 16 words (i.e. 64 bytes).
return (zva_val & 0b11111) == 0b00100;
}

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Memset

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
static constexpr size_t SIZE = Size;
static constexpr size_t BlockSize = 32;

static const unsigned char *as_u8(CPtr ptr) {
return reinterpret_cast<const unsigned char *>(ptr);
}

static inline BcmpReturnType block(CPtr p1, CPtr p2) {
if constexpr (Size == BlockSize) {
auto _p1 = as_u8(p1);
auto _p2 = as_u8(p2);
uint8x16_t a = vld1q_u8(_p1);
uint8x16_t b = vld1q_u8(_p1 + 16);
uint8x16_t n = vld1q_u8(_p2);
uint8x16_t o = vld1q_u8(_p2 + 16);
uint8x16_t an = veorq_u8(a, n);
uint8x16_t bo = veorq_u8(b, o);
// anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
// a difference between the two buffers. We reduce this value down to 4
// bytes in two steps. First, calculate the saturated move value when
// going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
// a single 32 bit nonzero value if a mismatch occurred.
uint64x2_t anbo = vreinterpretq_u64_u8(vorrq_u8(an, bo));
uint32x2_t anbo_reduced = vqmovn_u64(anbo);
return vmaxv_u32(anbo_reduced);
} else if constexpr ((Size % BlockSize) == 0) {
for (size_t offset = 0; offset < Size; offset += BlockSize)
if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
return value;
} else {
deferred_static_assert("SIZE not implemented");
}
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
return block(p1 + count - SIZE, p2 + count - SIZE);
}

static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
if constexpr (Size <= 8) {
return generic::Bcmp<Size>::head_tail(p1, p2, count);
} else if constexpr (Size == 16) {
auto _p1 = as_u8(p1);
auto _p2 = as_u8(p2);
uint8x16_t a = vld1q_u8(_p1);
uint8x16_t b = vld1q_u8(_p1 + count - 16);
uint8x16_t n = vld1q_u8(_p2);
uint8x16_t o = vld1q_u8(_p2 + count - 16);
uint8x16_t an = veorq_u8(a, n);
uint8x16_t bo = veorq_u8(b, o);
// anbo = (a ^ n) | (b ^ o)
uint64x2_t anbo = vreinterpretq_u64_u8(vorrq_u8(an, bo));
uint32x2_t anbo_reduced = vqmovn_u64(anbo);
return vmaxv_u32(anbo_reduced);
} else if constexpr (Size == 32) {
auto _p1 = as_u8(p1);
auto _p2 = as_u8(p2);
uint8x16_t a = vld1q_u8(_p1);
uint8x16_t b = vld1q_u8(_p1 + 16);
uint8x16_t c = vld1q_u8(_p1 + count - 16);
uint8x16_t d = vld1q_u8(_p1 + count - 32);
uint8x16_t n = vld1q_u8(_p2);
uint8x16_t o = vld1q_u8(_p2 + 16);
uint8x16_t p = vld1q_u8(_p2 + count - 16);
uint8x16_t q = vld1q_u8(_p2 + count - 32);
uint8x16_t an = veorq_u8(a, n);
uint8x16_t bo = veorq_u8(b, o);
uint8x16_t cp = veorq_u8(c, p);
uint8x16_t dq = veorq_u8(d, q);
uint8x16_t anbo = vorrq_u8(an, bo);
uint8x16_t cpdq = vorrq_u8(cp, dq);
// abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
// a nonzero 32 bit value if a mismatch occurred.
uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
return vmaxv_u32(abnocpdq_reduced);
} else {
deferred_static_assert("SIZE not implemented");
}
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
if (auto value = block(p1 + offset, p2 + offset))
return value;
offset += SIZE;
} while (offset < count - SIZE);
return tail(p1, p2, count);
}
};

} // namespace __llvm_libc::aarch64

#endif // LLVM_LIBC_ARCH_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
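
[Editor's note] A worked decoding of the dczid_el0 check in hasZva(): DZP is
bit 4, BS is bits [3:0], and a word is 4 bytes, so the accepted value 0b00100
means "zeroing permitted, block size 2^4 words = 64 bytes". The helper name
is_zva64 is illustrative:

    #include <cstdint>

    static bool is_zva64(std::uint64_t zva_val) {
      const bool permitted = (zva_val & (1 << 4)) == 0; // DZP bit clear
      const std::uint64_t log2_words = zva_val & 0xF;   // BS field
      return permitted && (4u << log2_words) == 64;     // 16 words == 64 bytes
    }
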
155 changes: 155 additions & 0 deletions libc/src/string/memory_utils/op_builtin.h
@@ -0,0 +1,155 @@
//===-- Implementation using the __builtin_XXX_inline ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks to compose memory functions.
// They rely on the compiler to generate the best possible code through the use
// of the `__builtin_XXX_inline` builtins. These builtins are currently only
// available in Clang.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H

#include "src/string/memory_utils/utils.h"

namespace __llvm_libc::builtin {

///////////////////////////////////////////////////////////////////////////////
// Memcpy
template <size_t Size> struct Memcpy {
static constexpr size_t SIZE = Size;
static inline void block(Ptr __restrict dst, CPtr __restrict src) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
return __builtin_memcpy_inline(dst, src, SIZE);
#else
deferred_static_assert("Missing __builtin_memcpy_inline");
(void)dst;
(void)src;
#endif
}

static inline void tail(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
block(dst + count - SIZE, src + count - SIZE);
}

static inline void head_tail(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
block(dst, src);
tail(dst, src, count);
}

static inline void loop(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
static_assert(Size == 1);
for (size_t offset = 0; offset < count; offset += SIZE)
block(dst + offset, src + offset);
}

static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
block(dst + offset, src + offset);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, src, count);
}
};

///////////////////////////////////////////////////////////////////////////////
// Memset
template <size_t Size> struct Memset {
using ME = Memset;
static constexpr size_t SIZE = Size;
static inline void block(Ptr dst, uint8_t value) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
__builtin_memset_inline(dst, value, Size);
#else
deferred_static_assert("Missing __builtin_memset_inline");
(void)dst;
(void)value;
#endif
}

static inline void tail(Ptr dst, uint8_t value, size_t count) {
block(dst + count - SIZE, value);
}

static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
block(dst, value);
tail(dst, value, count);
}

static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
}
};

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
using ME = Bcmp;
static constexpr size_t SIZE = Size;
static inline BcmpReturnType block(CPtr, CPtr) {
deferred_static_assert("Missing __builtin_memcmp_inline");
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return BcmpReturnType::ZERO();
}
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
template <size_t Size> struct Memcmp {
using ME = Memcmp;
static constexpr size_t SIZE = Size;
static inline MemcmpReturnType block(CPtr, CPtr) {
deferred_static_assert("Missing __builtin_memcmp_inline");
return MemcmpReturnType::ZERO();
}

static inline MemcmpReturnType tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return MemcmpReturnType::ZERO();
}

static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return MemcmpReturnType::ZERO();
}

static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return MemcmpReturnType::ZERO();
}
};

} // namespace __llvm_libc::builtin

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
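
[Editor's note] deferred_static_assert lives in utils.h (outside this diff).
Its purpose is to make an unimplemented `if constexpr` branch fail only when
it is actually instantiated. A plausible definition, offered as the editor's
assumption rather than the patch's exact code:

    // The assertion depends on a template parameter, so it fires at
    // instantiation time instead of when the header is parsed.
    template <bool flag = false>
    static void deferred_static_assert(const char *msg) {
      static_assert(flag, "compilation error");
      (void)msg;
    }
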
502 changes: 502 additions & 0 deletions libc/src/string/memory_utils/op_generic.h

Large diffs are not rendered by default.

219 changes: 219 additions & 0 deletions libc/src/string/memory_utils/op_x86.h
@@ -0,0 +1,219 @@
//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_X86_64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __SSE2__
#include <immintrin.h>
#else
// Define fake functions to prevent the compiler from failing on undefined
// functions in case SSE2 is not present.
#define _mm512_cmpneq_epi8_mask(A, B) 0
#define _mm_movemask_epi8(A) 0
#define _mm256_movemask_epi8(A) 0
#endif // __SSE2__

namespace __llvm_libc::x86 {

// A set of constants to check compile time features.
static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
static void repmovsb(char *dst, const char *src, size_t count) {
asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
}
};

///////////////////////////////////////////////////////////////////////////////
// Bcmp

// Base implementation for the Bcmp specializations.
// - BlockSize is either 16, 32 or 64 depending on the available compile time
// features, it is used to switch between "single native operation" or a
// "sequence of native operations".
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
static inline BcmpReturnType block(CPtr p1, CPtr p2) {
if constexpr (Size == BlockSize) {
return BlockBcmp(p1, p2);
} else if constexpr (Size % BlockSize == 0) {
for (size_t offset = 0; offset < Size; offset += BlockSize)
if (auto value = BlockBcmp(p1 + offset, p2 + offset))
return value;
} else {
deferred_static_assert("SIZE not implemented");
}
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
return block(p1 + count - Size, p2 + count - Size);
}

static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
return block(p1, p2) | tail(p1, p2, count);
}

static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
if (auto value = block(p1 + offset, p2 + offset))
return value;
offset += Size;
} while (offset < count - Size);
return tail(p1, p2, count);
}
};

namespace sse2 {
static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
using T = char __attribute__((__vector_size__(16)));
// A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
const int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2));
return static_cast<uint32_t>(mask);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
} // namespace sse2

namespace avx2 {
static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
using T = char __attribute__((__vector_size__(32)));
// A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
const int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2));
// _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
// mask.
return static_cast<uint32_t>(mask);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
} // namespace avx2

namespace avx512bw {
static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
using T = char __attribute__((__vector_size__(64)));
// A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
const bool mask_is_set = mask != 0;
return static_cast<uint32_t>(mask_is_set);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
} // namespace avx512bw

// Assuming that the mask is non zero, the index of the first mismatching byte
// is the number of trailing zeros in the mask. Trailing zeros and not leading
// zeros because the x86 architecture is little endian.
static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
uint64_t mask) {
const size_t diff_index = __builtin_ctzll(mask);
const int16_t ca = p1[diff_index];
const int16_t cb = p2[diff_index];
return ca - cb;
}

///////////////////////////////////////////////////////////////////////////////
// Memcmp

// Base implementation for the Memcmp specializations.
// - BlockSize is either 16, 32 or 64 depending on the available compile time
// features, it is used to switch between "single native operation" or a
// "sequence of native operations".
// - BlockMemcmp is the function that implements the memcmp logic.
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
struct MemcmpImpl {
static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
if constexpr (Size == BlockSize) {
return BlockMemcmp(p1, p2);
} else if constexpr (Size % BlockSize == 0) {
for (size_t offset = 0; offset < Size; offset += BlockSize)
if (auto value = BlockBcmp(p1 + offset, p2 + offset))
return BlockMemcmp(p1 + offset, p2 + offset);
} else {
deferred_static_assert("SIZE not implemented");
}
return MemcmpReturnType::ZERO();
}

static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
return block(p1 + count - Size, p2 + count - Size);
}

static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
if (auto value = block(p1, p2))
return value;
return tail(p1, p2, count);
}

static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
if (auto value = block(p1 + offset, p2 + offset))
return value;
offset += Size;
} while (offset < count - Size);
return tail(p1, p2, count);
}
};

namespace sse2 {
static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
using T = char __attribute__((__vector_size__(16)));
// A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
if (int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2)))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
} // namespace sse2

namespace avx2 {
static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
using T = char __attribute__((__vector_size__(32)));
// A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
if (int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2)))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
} // namespace avx2

namespace avx512bw {
static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
using T = char __attribute__((__vector_size__(64)));
// A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
} // namespace avx512bw

} // namespace __llvm_libc::x86

#endif // LLVM_LIBC_ARCH_X86_64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
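
[Editor's note] A worked example of char_diff_no_zero, with illustrative data
and a precomputed mask standing in for the SSE2 comparison:

    #include <cstddef>
    #include <cstdint>

    static int diff_demo() {
      const char p1[] = "abcX", p2[] = "abcY";     // differ at byte 3
      const std::uint64_t mask = 0b1000;           // one bit per differing byte
      const std::size_t i = __builtin_ctzll(mask); // i == 3 (little endian)
      const std::int16_t ca = p1[i], cb = p2[i];
      return ca - cb;                              // 'X' - 'Y' == -1
    }
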
139 changes: 88 additions & 51 deletions libc/src/string/memory_utils/utils.h
@@ -9,19 +9,8 @@
#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
#define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H

#include "src/__support/architectures.h"

// Cache line sizes for ARM: These values are not strictly correct since
// cache line sizes depend on implementations, not architectures. There
// are even implementations with cache line sizes configurable at boot
// time.
#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86)
#define LLVM_LIBC_CACHELINE_SIZE 64
#elif defined(LLVM_LIBC_ARCH_ARM)
#define LLVM_LIBC_CACHELINE_SIZE 32
#else
#error "Unsupported platform for memory functions."
#endif
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/type_traits.h"

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
@@ -62,32 +51,46 @@ static constexpr size_t ge_power2(size_t value) {
return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
}

template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
// Returns the number of bytes to subtract from ptr to get to the previous
// multiple of alignment. If ptr is already aligned, returns 0.
template <size_t alignment> uintptr_t distance_to_align_down(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned, returns 0.
template <size_t alignment> uintptr_t distance_to_align_up(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
// The logic is not straightforward and involves unsigned modulo arithmetic
// but the generated code is as fast as it can be.
return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

// Returns the offset from `ptr` to the next cache line.
static inline intptr_t offset_to_next_cache_line(const void *ptr) {
return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned, returns alignment.
template <size_t alignment>
uintptr_t distance_to_next_aligned(const void *ptr) {
return alignment - distance_to_align_down<alignment>(ptr);
}

// Returns the same pointer but notifies the compiler that it is aligned.
template <size_t alignment, typename T> static T *assume_aligned(T *ptr) {
return reinterpret_cast<T *>(__builtin_assume_aligned(ptr, alignment));
}

#if defined __has_builtin
#if __has_builtin(__builtin_memcpy_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
#endif

#if defined __has_builtin
#if __has_builtin(__builtin_memset_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
#endif
#endif

// Performs a constant count copy.
template <size_t Size>
static inline void memcpy_inline(void *__restrict dst,
@@ -103,28 +106,56 @@ static inline void memcpy_inline(void *__restrict dst,
using Ptr = char *; // Pointer to raw data.
using CPtr = const char *; // Const pointer to raw data.

// Loads bytes from memory (possibly unaligned) and materializes them as type.
// This type makes sure that we don't accidentally promote an integral type to
// another one. It is only constructible from the exact T type.
template <typename T> struct StrictIntegralType {
static_assert(cpp::is_integral_v<T>);

// Can only be constructed from a T.
template <typename U, cpp::enable_if_t<cpp::is_same_v<U, T>, bool> = 0>
StrictIntegralType(U value) : value(value) {}

// Allows using the type in an if statement.
explicit operator bool() const { return value; }

// If type is unsigned (bcmp) we allow bitwise OR operations.
StrictIntegralType operator|(const StrictIntegralType &Rhs) const {
static_assert(!cpp::is_signed_v<T>);
return value | Rhs.value;
}

// For interoperation with the C API we allow explicit conversion back to
// the `int` type.
explicit operator int() const {
// bit_cast makes sure that T and int have the same size.
return cpp::bit_cast<int>(value);
}

// Helper to get the zero value.
static inline constexpr StrictIntegralType ZERO() { return {T(0)}; }

private:
T value;
};

using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;

// Loads bytes from memory (possibly unaligned) and materializes them as
// type.
template <typename T> static inline T load(CPtr ptr) {
T Out;
memcpy_inline<sizeof(T)>(&Out, ptr);
return Out;
}

// Stores a value of type T in memory (possibly unaligned)
// Stores a value of type T in memory (possibly unaligned).
template <typename T> static inline void store(Ptr ptr, T value) {
memcpy_inline<sizeof(T)>(ptr, &value);
}

// For an operation like memset that operates on a pointer and a count, advances
// the pointer by offset bytes and decrease count by the same amount.
static inline void adjust(ptrdiff_t offset, Ptr &ptr, size_t &count) {
ptr += offset;
count -= offset;
}

// For an operation like memcpy or memcmp that operates on two pointers and a
// count, advances the pointers by offset bytes and decrease count by the same
// amount.
// Advances the pointers p1 and p2 by offset bytes and decreases count by the
// same amount.
template <typename T1, typename T2>
static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
T2 *__restrict &p2, size_t &count) {
@@ -133,31 +164,37 @@ static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
count -= offset;
}

// For an operation like memset that operates on a pointer and a count, advances
// the pointer so it is aligned to SIZE bytes and decrease count by the same
// amount.
// Advances p1 and p2 so that p1 gets aligned to the next SIZE-byte boundary,
// and decreases count by the same amount.
// We make sure the compiler knows about the adjusted pointer alignment.
template <size_t SIZE> void align(Ptr &ptr, size_t &count) {
adjust(offset_to_next_aligned<SIZE>(ptr), ptr, count);
ptr = assume_aligned<SIZE>(ptr);
template <size_t SIZE, typename T1, typename T2>
void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
size_t &count) {
adjust(distance_to_next_aligned<SIZE>(p1), p1, p2, count);
p1 = assume_aligned<SIZE>(p1);
}

// For an operation like memcpy or memcmp that operates on two pointers and a
// count, advances the pointers so one of them gets aligned to SIZE bytes and
// decrease count by the same amount.
// We make sure the compiler knows about the adjusted pointer alignment.
enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
// Same as align_p1_to_next_boundary above but with a single pointer instead.
template <size_t SIZE, typename T1>
void align_to_next_boundary(T1 *&p1, size_t &count) {
CPtr dummy;
align_p1_to_next_boundary<SIZE>(p1, dummy, count);
}

// An enum class that discriminates between the first and second pointer.
enum class Arg { P1, P2, Dst = P1, Src = P2 };

// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
// Precondition: &p1 != &p2
template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
void align(T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) {
if constexpr (AlignOn == Arg::_1) {
adjust(offset_to_next_aligned<SIZE>(p1), p1, p2, count);
p1 = assume_aligned<SIZE>(p1);
} else if constexpr (AlignOn == Arg::_2) {
adjust(offset_to_next_aligned<SIZE>(p2), p1, p2, count);
p2 = assume_aligned<SIZE>(p2);
} else {
deferred_static_assert("AlignOn must be either Arg::_1 or Arg::_2");
}
void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
size_t &count) {
if constexpr (AlignOn == Arg::P1)
align_p1_to_next_boundary<SIZE>(p1, p2, count);
else if constexpr (AlignOn == Arg::P2)
align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2.
else
deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
}

} // namespace __llvm_libc
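
[Editor's note] A quick concrete check of the three distance helpers above;
the values follow directly from the bit arithmetic in utils.h:

    #include <cassert>
    #include <cstdint>

    void distance_demo() {
      const std::uintptr_t p = 0x1001;
      assert((p & 15) == 1);         // distance_to_align_down<16>: to 0x1000
      assert((-p & 15) == 15);       // distance_to_align_up<16>: to 0x1010
      assert((16 - (p & 15)) == 15); // distance_to_next_aligned<16>
      const std::uintptr_t q = 0x1000;
      assert((16 - (q & 15)) == 16); // already aligned: alignment, not 0
    }
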
4 changes: 2 additions & 2 deletions libc/src/string/memset.cpp
@@ -13,8 +13,8 @@
namespace __llvm_libc {

LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
inline_memset(reinterpret_cast<char *>(dst),
static_cast<unsigned char>(value), count);
inline_memset(reinterpret_cast<char *>(dst), static_cast<uint8_t>(value),
count);
return dst;
}

12 changes: 6 additions & 6 deletions libc/test/src/string/bcmp_test.cpp
@@ -12,25 +12,25 @@
TEST(LlvmLibcBcmpTest, CmpZeroByte) {
const char *lhs = "ab";
const char *rhs = "bc";
EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0);
ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0);
}

TEST(LlvmLibcBcmpTest, LhsRhsAreTheSame) {
const char *lhs = "ab";
const char *rhs = "ab";
EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0);
ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, LhsBeforeRhsLexically) {
const char *lhs = "ab";
const char *rhs = "ac";
EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, LhsAfterRhsLexically) {
const char *lhs = "ac";
const char *rhs = "ab";
EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, Sweep) {
@@ -46,13 +46,13 @@ TEST(LlvmLibcBcmpTest, Sweep) {
reset(lhs);
reset(rhs);
for (size_t i = 0; i < K_MAX_SIZE; ++i)
EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0);
ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0);

reset(lhs);
reset(rhs);
for (size_t i = 0; i < K_MAX_SIZE; ++i) {
rhs[i] = 'b';
EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0);
ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0);
rhs[i] = 'a';
}
}
14 changes: 7 additions & 7 deletions libc/test/src/string/memmove_test.cpp
@@ -20,7 +20,7 @@ TEST(LlvmLibcMemmoveTest, MoveZeroByte) {
void *const Dst = Buffer;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 0);
EXPECT_EQ(Ret, Dst);
EXPECT_MEM_EQ(Buffer, Expected);
ASSERT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) {
@@ -29,7 +29,7 @@ TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) {
void *const Dst = Buffer;
void *const Ret = __llvm_libc::memmove(Dst, Buffer, 1);
EXPECT_EQ(Ret, Dst);
EXPECT_MEM_EQ(Buffer, Expected);
ASSERT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) {
@@ -40,7 +40,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) {
void *const Dst = Buffer + 1;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 2);
EXPECT_EQ(Ret, Dst);
EXPECT_MEM_EQ(Buffer, Expected);
ASSERT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) {
@@ -49,7 +49,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) {
void *const Dst = Buffer + 2;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 2);
EXPECT_EQ(Ret, Dst);
EXPECT_MEM_EQ(Buffer, Expected);
ASSERT_MEM_EQ(Buffer, Expected);
}

// e.g. `Dst` follow `src`.
@@ -62,7 +62,7 @@ TEST(LlvmLibcMemmoveTest, SrcFollowDst) {
void *const Dst = Buffer + 1;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 1);
EXPECT_EQ(Ret, Dst);
EXPECT_MEM_EQ(Buffer, Expected);
ASSERT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstFollowSrc) {
Expand All @@ -71,7 +71,7 @@ TEST(LlvmLibcMemmoveTest, DstFollowSrc) {
void *const Dst = Buffer + 2;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 1);
EXPECT_EQ(Ret, Dst);
EXPECT_MEM_EQ(Buffer, Expected);
ASSERT_MEM_EQ(Buffer, Expected);
}

static constexpr int kMaxSize = 512;
@@ -106,7 +106,7 @@ TEST(LlvmLibcMemmoveTest, Thorough) {
void *const Ret =
__llvm_libc::memmove(Dst, Buffer.data() + SrcOffset, Size);
EXPECT_EQ(Ret, Dst);
EXPECT_MEM_EQ(Buffer, Expected);
ASSERT_MEM_EQ(Buffer, Expected);
}
}
}
2 changes: 0 additions & 2 deletions libc/test/src/string/memory_utils/CMakeLists.txt
@@ -3,8 +3,6 @@ add_libc_unittest(
SUITE
libc_string_unittests
SRCS
elements_test.cpp
memory_access_test.cpp
utils_test.cpp
COMPILE_OPTIONS
${LIBC_COMPILE_OPTIONS_NATIVE}
137 changes: 0 additions & 137 deletions libc/test/src/string/memory_utils/elements_test.cpp

This file was deleted.

228 changes: 0 additions & 228 deletions libc/test/src/string/memory_utils/memory_access_test.cpp

This file was deleted.

79 changes: 26 additions & 53 deletions libc/test/src/string/memory_utils/utils_test.cpp
@@ -72,55 +72,41 @@ TEST(LlvmLibcUtilsTest, GEPowerOf2) {
EXPECT_EQ(ge_power2(i), kExpectedValues[i]);
}

using I = intptr_t;
using UINT = uintptr_t;

// Converts an offset into a pointer.
const void *forge(size_t offset) {
return reinterpret_cast<const void *>(offset);
}

TEST(LlvmLibcUtilsTest, OffsetToNextAligned) {
EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(0));
EXPECT_EQ(offset_to_next_aligned<16>(forge(1)), I(15));
EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(0));
EXPECT_EQ(offset_to_next_aligned<16>(forge(15)), I(1));
EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
TEST(LlvmLibcUtilsTest, DistanceToNextAligned) {
EXPECT_EQ(distance_to_next_aligned<16>(forge(0)), UINT(16));
EXPECT_EQ(distance_to_next_aligned<16>(forge(1)), UINT(15));
EXPECT_EQ(distance_to_next_aligned<16>(forge(16)), UINT(16));
EXPECT_EQ(distance_to_next_aligned<16>(forge(15)), UINT(1));
EXPECT_EQ(distance_to_next_aligned<32>(forge(16)), UINT(16));
}

TEST(LlvmLibcUtilsTest, OffsetFromLastAligned) {
EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
TEST(LlvmLibcUtilsTest, DistanceToAlignUp) {
EXPECT_EQ(distance_to_align_up<16>(forge(0)), UINT(0));
EXPECT_EQ(distance_to_align_up<16>(forge(1)), UINT(15));
EXPECT_EQ(distance_to_align_up<16>(forge(16)), UINT(0));
EXPECT_EQ(distance_to_align_up<16>(forge(15)), UINT(1));
EXPECT_EQ(distance_to_align_up<32>(forge(16)), UINT(16));
}

TEST(LlvmLibcUtilsTest, OffsetToNextCacheLine) {
EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));
EXPECT_EQ(offset_to_next_cache_line(forge(1)),
I(LLVM_LIBC_CACHELINE_SIZE - 1));
EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE)), I(0));
EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE - 1)),
I(1));
}

TEST(LlvmLibcUtilsTest, Adjust1) {
char a;
const size_t base_size = 10;
for (size_t I = -2; I < 2; ++I) {
auto *ptr = &a;
size_t size = base_size;
adjust(I, ptr, size);
EXPECT_EQ(intptr_t(ptr), intptr_t(&a + I));
EXPECT_EQ(size, base_size - I);
}
TEST(LlvmLibcUtilsTest, DistanceToAlignDown) {
EXPECT_EQ(distance_to_align_down<16>(forge(0)), UINT(0));
EXPECT_EQ(distance_to_align_down<16>(forge(1)), UINT(1));
EXPECT_EQ(distance_to_align_down<16>(forge(16)), UINT(0));
EXPECT_EQ(distance_to_align_down<16>(forge(15)), UINT(15));
EXPECT_EQ(distance_to_align_down<32>(forge(16)), UINT(16));
}

TEST(LlvmLibcUtilsTest, Adjust2) {
char a, b;
const size_t base_size = 10;
for (size_t I = -2; I < 2; ++I) {
for (ptrdiff_t I = -2; I < 2; ++I) {
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
@@ -131,41 +117,28 @@ TEST(LlvmLibcUtilsTest, Adjust2) {
}
}

TEST(LlvmLibcUtilsTest, Align1) {
char a;
const size_t base_size = 10;
{
auto *ptr = &a;
size_t size = base_size;
align<128>(ptr, size);
EXPECT_TRUE(uintptr_t(ptr) % 128 == 0);
EXPECT_GE(ptr, &a);
EXPECT_EQ(size_t(ptr - &a), base_size - size);
}
}

TEST(LlvmLibcUtilsTest, Align2) {
char a, b;
const size_t base_size = 10;
{
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
align<128, Arg::_1>(p1, p2, size);
align_to_next_boundary<128, Arg::P1>(p1, p2, size);
EXPECT_TRUE(uintptr_t(p1) % 128 == 0);
EXPECT_GE(p1, &a);
EXPECT_GE(p2, &b);
EXPECT_GT(p1, &a);
EXPECT_GT(p2, &b);
EXPECT_EQ(size_t(p1 - &a), base_size - size);
EXPECT_EQ(size_t(p2 - &b), base_size - size);
}
{
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
align<128, Arg::_2>(p1, p2, size);
align_to_next_boundary<128, Arg::P2>(p1, p2, size);
EXPECT_TRUE(uintptr_t(p2) % 128 == 0);
EXPECT_GE(p1, &a);
EXPECT_GE(p2, &b);
EXPECT_GT(p1, &a);
EXPECT_GT(p2, &b);
EXPECT_EQ(size_t(p1 - &a), base_size - size);
EXPECT_EQ(size_t(p2 - &b), base_size - size);
}
9 changes: 6 additions & 3 deletions utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -973,9 +973,10 @@ no_sanitize_features = [
cc_library(
name = "string_memory_utils",
hdrs = [
"src/string/memory_utils/elements.h",
"src/string/memory_utils/elements_aarch64.h",
"src/string/memory_utils/elements_x86.h",
"src/string/memory_utils/op_aarch64.h",
"src/string/memory_utils/op_builtin.h",
"src/string/memory_utils/op_generic.h",
"src/string/memory_utils/op_x86.h",
"src/string/memory_utils/utils.h",
],
textual_hdrs = [
Expand All @@ -988,6 +989,8 @@ cc_library(
deps = [
":__support_common",
":__support_cpp_bit",
":__support_cpp_type_traits",
":__support_cpp_array",
":libc_root",
],
)