774 changes: 774 additions & 0 deletions libc/src/string/memory_utils/elements.h

Large diffs are not rendered by default.

130 changes: 130 additions & 0 deletions libc/src/string/memory_utils/elements_aarch64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
//===-- Elementary operations for aarch64 --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_AARCH64)

#include <src/string/memory_utils/elements.h>
#include <stddef.h> // size_t
#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

namespace __llvm_libc {
namespace aarch64_memset {
#ifdef __ARM_NEON
struct Splat8 {
static constexpr size_t SIZE = 8;
static void splat_set(char *dst, const unsigned char value) {
vst1_u8((uint8_t *)dst, vdup_n_u8(value));
}
};

struct Splat16 {
static constexpr size_t SIZE = 16;
static void splat_set(char *dst, const unsigned char value) {
vst1q_u8((uint8_t *)dst, vdupq_n_u8(value));
}
};

using _8 = Splat8;
using _16 = Splat16;
#else
using _8 = __llvm_libc::scalar::_8;
using _16 = Repeated<_8, 2>;
#endif // __ARM_NEON

using _1 = __llvm_libc::scalar::_1;
using _2 = __llvm_libc::scalar::_2;
using _3 = __llvm_libc::scalar::_3;
using _4 = __llvm_libc::scalar::_4;
using _32 = Chained<_16, _16>;
using _64 = Chained<_32, _32>;

// memset element covering one 64-byte block (SIZE) that issues the AArch64
// `dc zva` instruction on the destination instead of storing a splatted value.
// NOTE(review): callers are presumably expected to check hasZva() and to pass
// a zero fill value before using this element - confirm against the caller.
struct Zva64 {
static constexpr size_t SIZE = 64;

// Issues `dc zva` on `dst`. The fill value parameter is unnamed and ignored.
// The "memory" clobber tells the compiler that memory is modified by the asm.
static void splat_set(char *dst, const unsigned char) {
#if __SIZEOF_POINTER__ == 4
// 32-bit pointers: the operand is printed with the `w` modifier.
// NOTE(review): confirm that `dc zva` assembles with a W register operand.
asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
}
};

// Reports whether `dc zva` is permitted and operates on 64-byte blocks, based
// on the contents of the DCZID_EL0 system register.
inline static bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // Bit [4] (DZP) must be zero for DC ZVA to be permitted, and bits [3:0] (BS)
  // hold log2 of the block size in words. BS == 4 therefore means 16 words,
  // i.e. 64 bytes.
  constexpr uint64_t DZP_AND_BS_BITS = 0b11111;
  constexpr uint64_t PERMITTED_WITH_64_BYTE_BLOCKS = 0b00100;
  const uint64_t dzp_and_bs = zva_val & DZP_AND_BS_BITS;
  return dzp_and_bs == PERMITTED_WITH_64_BYTE_BLOCKS;
}

} // namespace aarch64_memset

namespace aarch64 {

using _1 = __llvm_libc::scalar::_1;
using _2 = __llvm_libc::scalar::_2;
using _3 = __llvm_libc::scalar::_3;
using _4 = __llvm_libc::scalar::_4;
using _8 = __llvm_libc::scalar::_8;
using _16 = __llvm_libc::scalar::_16;

#ifdef __ARM_NEON
struct N32 {
static constexpr size_t SIZE = 32;
static bool equals(const char *lhs, const char *rhs) {
uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs);
uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs);
uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16));
uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16));
uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1));
uint64_t res =
vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0);
return res == 0;
}
static int three_way_compare(const char *lhs, const char *rhs) {
uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs);
uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs);
uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16));
uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16));
uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1));
uint64_t res =
vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0);
if (res == 0)
return 0;
size_t index = (__builtin_ctzl(res) >> 3) << 2;
uint32_t l = *((const uint32_t *)(lhs + index));
uint32_t r = *((const uint32_t *)(rhs + index));
return __llvm_libc::scalar::_4::scalar_three_way_compare(l, r);
}
};

using _32 = N32;
using _64 = Repeated<_32, 2>;
#else
using _32 = __llvm_libc::scalar::_32;
using _64 = __llvm_libc::scalar::_64;
#endif // __ARM_NEON

} // namespace aarch64
} // namespace __llvm_libc

#endif // defined(LLVM_LIBC_ARCH_AARCH64)

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
189 changes: 189 additions & 0 deletions libc/src/string/memory_utils/elements_x86.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
//===-- Elementary operations for x86 -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H

#include "src/__support/CPP/bit.h"
#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_X86)

#include <stddef.h> // size_t
#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t

#ifdef __SSE2__
#include <immintrin.h>
#endif // __SSE2__

#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar

// Fixed-size Vector Operations
// ----------------------------

namespace __llvm_libc {
namespace x86 {

#ifdef __SSE2__
// Adapts a `Base` that provides load / store / not_equal_mask /
// get_splatted_value into the fixed-size element operations (copy, move,
// equals, three_way_compare, splat_set).
template <typename Base> struct Vector : public Base {
  // Copies one vector-sized block; `dst` and `src` must not overlap.
  static void copy(char *__restrict dst, const char *__restrict src) {
    const auto block = Base::load(src);
    Base::store(dst, block);
  }

  // Copies one vector-sized block. The source is fully loaded before the
  // store happens.
  static void move(char *dst, const char *src) {
    const auto block = Base::load(src);
    Base::store(dst, block);
  }

  // Returns true when no byte position is flagged as different.
  static bool equals(const char *a, const char *b) {
    const auto lhs = Base::load(a);
    const auto rhs = Base::load(b);
    return Base::not_equal_mask(lhs, rhs) == 0;
  }

  // Returns 0 when both blocks are equal, otherwise the difference between
  // the first pair of mismatching bytes (see char_diff).
  static int three_way_compare(const char *a, const char *b) {
    const auto mask = Base::not_equal_mask(Base::load(a), Base::load(b));
    return mask ? char_diff(a, b, mask) : 0;
  }

  // Fills one vector-sized block at `dst` with `value`.
  static void splat_set(char *dst, const unsigned char value) {
    const auto pattern = Base::get_splatted_value(value);
    Base::store(dst, pattern);
  }

  // Returns a[i] - b[i], both read as unsigned char, where i is the index of
  // the lowest set bit in `mask`. `mask` must be non-zero.
  static int char_diff(const char *a, const char *b, uint64_t mask) {
    const size_t first_mismatch = __builtin_ctzll(mask);
    const int lhs_byte = static_cast<unsigned char>(a[first_mismatch]);
    const int rhs_byte = static_cast<unsigned char>(b[first_mismatch]);
    return lhs_byte - rhs_byte;
  }
};

// 16-byte SSE2 primitives (load, store, masks, splat) consumed by Vector<>.
struct M128 {
  static constexpr size_t SIZE = 16;
  using T = char __attribute__((__vector_size__(SIZE)));

  // Packs the result of _mm_movemask_epi8 on `value` into 16 bits, one bit
  // per byte lane.
  static uint16_t mask(T value) {
    const __m128i lanes = cpp::bit_cast<__m128i>(value);
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    const int lane_bits = _mm_movemask_epi8(lanes);
    return static_cast<uint16_t>(lane_bits);
  }

  // Returns a bitmask with one bit set per byte lane where `a` and `b` differ.
  static uint16_t not_equal_mask(T a, T b) { return mask(a != b); }

  // Loads 16 bytes from `ptr` using the unaligned load intrinsic.
  static T load(const char *ptr) {
    const auto *source = reinterpret_cast<__m128i_u const *>(ptr);
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    const __m128i loaded = _mm_loadu_si128(source);
    return cpp::bit_cast<T>(loaded);
  }

  // Stores the 16 bytes of `value` at `ptr` using the unaligned store
  // intrinsic.
  static void store(char *ptr, T value) {
    auto *target = reinterpret_cast<__m128i_u *>(ptr);
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    _mm_storeu_si128(target, cpp::bit_cast<__m128i>(value));
  }

  // Returns a vector whose 16 lanes all hold `v`.
  static T get_splatted_value(const char v) {
    const T all_lanes = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
    return all_lanes;
  }
};

using Vector128 = Vector<M128>; // 16 Bytes

#ifdef __AVX2__
// 32-byte AVX2 primitives (load, store, masks, splat) consumed by Vector<>.
struct M256 {
  static constexpr size_t SIZE = 32;
  using T = char __attribute__((__vector_size__(SIZE)));

  // Packs the result of _mm256_movemask_epi8 on `value` into 32 bits, one bit
  // per byte lane.
  static uint32_t mask(T value) {
    const __m256i lanes = cpp::bit_cast<__m256i>(value);
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    const int lane_bits = _mm256_movemask_epi8(lanes);
    return static_cast<uint32_t>(lane_bits);
  }

  // Returns a bitmask with one bit set per byte lane where `a` and `b` differ.
  static uint32_t not_equal_mask(T a, T b) { return mask(a != b); }

  // Loads 32 bytes from `ptr` using the unaligned load intrinsic.
  static T load(const char *ptr) {
    const auto *source = reinterpret_cast<__m256i const *>(ptr);
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    const __m256i loaded = _mm256_loadu_si256(source);
    return cpp::bit_cast<T>(loaded);
  }

  // Stores the 32 bytes of `value` at `ptr` using the unaligned store
  // intrinsic.
  static void store(char *ptr, T value) {
    auto *target = reinterpret_cast<__m256i *>(ptr);
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    _mm256_storeu_si256(target, cpp::bit_cast<__m256i>(value));
  }

  // Returns a vector whose 32 lanes all hold `v`.
  static T get_splatted_value(const char v) {
    const T all_lanes = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
                         v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
    return all_lanes;
  }
};

using Vector256 = Vector<M256>; // 32 Bytes

#if defined(__AVX512F__) and defined(__AVX512BW__)
// 64-byte AVX-512 (F + BW) primitives consumed by Vector<>.
struct M512 {
  static constexpr size_t SIZE = 64;
  using T = char __attribute__((__vector_size__(SIZE)));

  // Returns the 64-bit mask produced by the byte-wise "not equal" compare of
  // `a` and `b`.
  static uint64_t not_equal_mask(T a, T b) {
    const __m512i lhs = cpp::bit_cast<__m512i>(a);
    const __m512i rhs = cpp::bit_cast<__m512i>(b);
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    return _mm512_cmpneq_epi8_mask(lhs, rhs);
  }

  // Loads 64 bytes from `ptr` using the unaligned load intrinsic.
  static T load(const char *ptr) {
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    const __m512i loaded = _mm512_loadu_epi8(ptr);
    return cpp::bit_cast<T>(loaded);
  }

  // Stores the 64 bytes of `value` at `ptr` using the unaligned store
  // intrinsic.
  static void store(char *ptr, T value) {
    const __m512i lanes = cpp::bit_cast<__m512i>(value);
    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
    _mm512_storeu_epi8(ptr, lanes);
  }

  // Returns a vector whose 64 lanes all hold `v`.
  static T get_splatted_value(const char v) {
    const T all_lanes = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
                         v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
                         v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
                         v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
    return all_lanes;
  }
};
using Vector512 = Vector<M512>;

#endif // defined(__AVX512F__) and defined(__AVX512BW__)
#endif // __AVX2__
#endif // __SSE2__

using _1 = __llvm_libc::scalar::_1;
using _2 = __llvm_libc::scalar::_2;
using _3 = __llvm_libc::scalar::_3;
using _4 = __llvm_libc::scalar::_4;
using _8 = __llvm_libc::scalar::_8;
#if defined(__AVX512F__) && defined(__AVX512BW__)
using _16 = __llvm_libc::x86::Vector128;
using _32 = __llvm_libc::x86::Vector256;
using _64 = __llvm_libc::x86::Vector512;
using _128 = __llvm_libc::Repeated<_64, 2>;
#elif defined(__AVX2__)
using _16 = __llvm_libc::x86::Vector128;
using _32 = __llvm_libc::x86::Vector256;
using _64 = __llvm_libc::Repeated<_32, 2>;
using _128 = __llvm_libc::Repeated<_32, 4>;
#elif defined(__SSE2__)
using _16 = __llvm_libc::x86::Vector128;
using _32 = __llvm_libc::Repeated<_16, 2>;
using _64 = __llvm_libc::Repeated<_16, 4>;
using _128 = __llvm_libc::Repeated<_16, 8>;
#else
using _16 = __llvm_libc::Repeated<_8, 2>;
using _32 = __llvm_libc::Repeated<_8, 4>;
using _64 = __llvm_libc::Repeated<_8, 8>;
using _128 = __llvm_libc::Repeated<_8, 16>;
#endif

// Copy strategy implemented with the x86 `rep movsb` string instruction.
struct Accelerator {
// Copies `count` bytes from `src` to `dst` with a single `rep movsb`.
static void copy(char *dst, const char *src, size_t count) {
// "+D", "+S" and "+c" bind dst, src and count as read-write operands in the
// destination-index, source-index and count registers. The "memory" clobber
// tells the compiler that the asm reads and writes memory.
asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
}
};

} // namespace x86
} // namespace __llvm_libc

#endif // defined(LLVM_LIBC_ARCH_X86)

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
162 changes: 67 additions & 95 deletions libc/src/string/memory_utils/memcmp_implementations.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,120 +11,92 @@

#include "src/__support/architectures.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"
#include "src/string/memory_utils/elements.h"

#include <stddef.h> // size_t

namespace __llvm_libc {

static inline MemcmpReturnType inline_memcmp_generic_gt16(CPtr p1, CPtr p2,
size_t count) {
if (unlikely(count >= 384)) {
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
}

static inline int inline_memcmp(const char *lhs, const char *rhs,
size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
static inline MemcmpReturnType inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2,
size_t count) {
if (unlikely(count >= 384)) {
if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
}

static inline MemcmpReturnType inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2,
size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
if (unlikely(count >= 384)) {
if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
return value;
align_to_next_boundary<32, Arg::P1>(p1, p2, count);
}
return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
}

static inline MemcmpReturnType inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2,
size_t count) {
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////
using namespace __llvm_libc::x86;
if (count == 0)
return 0;
if (count == 1)
return three_way_compare<_1>(lhs, rhs);
if (count == 2)
return three_way_compare<_2>(lhs, rhs);
if (count == 3)
return three_way_compare<_3>(lhs, rhs);
if (count <= 8)
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count <= 16)
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
if (count <= 128)
return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
if (unlikely(count >= 384)) {
if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86)

#if defined(LLVM_LIBC_ARCH_AARCH64)
static inline MemcmpReturnType inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2,
size_t count) {
if (unlikely(count >= 128)) { // [128, ∞]
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
}
return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
using namespace ::__llvm_libc::aarch64;
if (count == 0) // [0, 0]
return 0;
if (count == 1) // [1, 1]
return three_way_compare<_1>(lhs, rhs);
if (count == 2) // [2, 2]
return three_way_compare<_2>(lhs, rhs);
if (count == 3) // [3, 3]
return three_way_compare<_3>(lhs, rhs);
if (count < 8) // [4, 7]
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count < 16) // [8, 15]
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
if (unlikely(count >= 128)) // [128, ∞]
return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
if (!equals<_16>(lhs, rhs)) // [16, 16]
return three_way_compare<_16>(lhs, rhs);
if (count < 32) // [17, 31]
return generic::Memcmp<16>::tail(p1, p2, count);
if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
return three_way_compare<Tail<_16>>(lhs, rhs, count);
if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
if (count < 64) // [33, 63]
return generic::Memcmp<32>::tail(p1, p2, count);
return three_way_compare<Tail<_32>>(lhs, rhs, count);
// [64, 127]
return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
}
#endif // defined(LLVM_LIBC_ARCH_AARCH64)
return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
using namespace ::__llvm_libc::scalar;

static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
if (count == 0)
return MemcmpReturnType::ZERO();
return 0;
if (count == 1)
return generic::Memcmp<1>::block(p1, p2);
return three_way_compare<_1>(lhs, rhs);
if (count == 2)
return generic::Memcmp<2>::block(p1, p2);
return three_way_compare<_2>(lhs, rhs);
if (count == 3)
return generic::Memcmp<3>::block(p1, p2);
return three_way_compare<_3>(lhs, rhs);
if (count <= 8)
return generic::Memcmp<4>::head_tail(p1, p2, count);
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count <= 16)
return generic::Memcmp<8>::head_tail(p1, p2, count);
#if defined(LLVM_LIBC_ARCH_X86)
if constexpr (x86::kAvx512BW)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
else if constexpr (x86::kAvx2)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
else if constexpr (x86::kSse2)
return inline_memcmp_x86_sse2_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
if constexpr (aarch64::kNeon)
return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
#else
return inline_memcmp_generic_gt16(p1, p2, count);
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
if (count <= 32)
return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
if (count <= 64)
return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
if (count <= 128)
return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
#endif
}

Expand Down
112 changes: 58 additions & 54 deletions libc/src/string/memory_utils/memcpy_implementations.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@

#include "src/__support/architectures.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t
Expand Down Expand Up @@ -46,103 +45,108 @@ static inline void inline_memcpy(char *__restrict dst,
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////

// Whether to use rep;movsb exclusively, not at all, or only above a certain
// threshold.
// TODO: Use only a single preprocessor definition to simplify the code.
#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
#endif

static constexpr bool kUseOnlyRepMovsb =
// Whether to use only rep;movsb.
constexpr bool USE_ONLY_REP_MOVSB =
LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
static constexpr size_t kRepMovsbThreshold =

// REP_MOVS_B_SIZE == -1 : Only CopyAligned is used.
// REP_MOVS_B_SIZE == 0  : Only RepMovsb is used.
// else CopyAligned is used up to REP_MOVS_B_SIZE and then RepMovsb.
constexpr size_t REP_MOVS_B_SIZE =
#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
#else
-1;
#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE

// Whether target supports AVX instructions.
constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);

#if defined(__AVX__)
using LoopBlockSize = _64;
#else
using LoopBlockSize = _32;
#endif

if constexpr (kUseOnlyRepMovsb)
return x86::Memcpy::repmovsb(dst, src, count);
if (USE_ONLY_REP_MOVSB)
return copy<x86::Accelerator>(dst, src, count);

if (count == 0)
return;
if (count == 1)
return Memcpy<1>::block(dst, src);
return copy<_1>(dst, src);
if (count == 2)
return Memcpy<2>::block(dst, src);
return copy<_2>(dst, src);
if (count == 3)
return Memcpy<3>::block(dst, src);
return copy<_3>(dst, src);
if (count == 4)
return Memcpy<4>::block(dst, src);
return copy<_4>(dst, src);
if (count < 8)
return Memcpy<4>::head_tail(dst, src, count);
return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
return Memcpy<8>::head_tail(dst, src, count);
return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
return Memcpy<16>::head_tail(dst, src, count);
return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
return Memcpy<32>::head_tail(dst, src, count);
return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
return Memcpy<64>::head_tail(dst, src, count);
if (x86::kAvx && count < 256)
return Memcpy<128>::head_tail(dst, src, count);
if (count <= kRepMovsbThreshold) {
Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
return Memcpy < x86::kAvx ? 64 : 32 > ::loop_and_tail(dst, src, count);
}
return x86::Memcpy::repmovsb(dst, src, count);
return copy<HeadTail<_64>>(dst, src, count);
if (HAS_AVX && count < 256)
return copy<HeadTail<_128>>(dst, src, count);
if (count <= REP_MOVS_B_SIZE)
return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
count);
return copy<x86::Accelerator>(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
if (count == 0)
return;
if (count == 1)
return Memcpy<1>::block(dst, src);
return copy<_1>(dst, src);
if (count == 2)
return Memcpy<2>::block(dst, src);
return copy<_2>(dst, src);
if (count == 3)
return Memcpy<3>::block(dst, src);
return copy<_3>(dst, src);
if (count == 4)
return Memcpy<4>::block(dst, src);
return copy<_4>(dst, src);
if (count < 8)
return Memcpy<4>::head_tail(dst, src, count);
return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
return Memcpy<8>::head_tail(dst, src, count);
return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
return Memcpy<16>::head_tail(dst, src, count);
return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
return Memcpy<32>::head_tail(dst, src, count);
return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
return Memcpy<64>::head_tail(dst, src, count);
Memcpy<16>::block(dst, src);
align_to_next_boundary<16, Arg::Src>(dst, src, count);
return Memcpy<64>::loop_and_tail(dst, src, count);
return copy<HeadTail<_64>>(dst, src, count);
return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
if (count == 0)
return;
if (count == 1)
return Memcpy<1>::block(dst, src);
return copy<_1>(dst, src);
if (count == 2)
return Memcpy<2>::block(dst, src);
return copy<_2>(dst, src);
if (count == 3)
return Memcpy<3>::block(dst, src);
return copy<_3>(dst, src);
if (count == 4)
return Memcpy<4>::block(dst, src);
return copy<_4>(dst, src);
if (count < 8)
return Memcpy<4>::head_tail(dst, src, count);
return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
return Memcpy<8>::head_tail(dst, src, count);
return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
return Memcpy<16>::head_tail(dst, src, count);
return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
return Memcpy<32>::head_tail(dst, src, count);
return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
return Memcpy<64>::head_tail(dst, src, count);
Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Src>(dst, src, count);
return Memcpy<32>::loop_and_tail(dst, src, count);
return copy<HeadTail<_64>>(dst, src, count);
return copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
#endif
}

Expand Down
90 changes: 38 additions & 52 deletions libc/src/string/memory_utils/memset_implementations.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H

#include "src/__support/architectures.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t
Expand Down Expand Up @@ -50,100 +48,88 @@ namespace __llvm_libc {
// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
// superior for sizes that mattered.
inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
inline static void inline_memset(char *dst, unsigned char value, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////
static constexpr size_t kMaxSize = x86::kAvx512F ? 64
: x86::kAvx ? 32
: x86::kSse2 ? 16
: 8;
using namespace __llvm_libc::x86;
if (count == 0)
return;
if (count == 1)
return generic::Memset<1, kMaxSize>::block(dst, value);
return splat_set<_1>(dst, value);
if (count == 2)
return generic::Memset<2, kMaxSize>::block(dst, value);
return splat_set<_2>(dst, value);
if (count == 3)
return generic::Memset<3, kMaxSize>::block(dst, value);
return splat_set<_3>(dst, value);
if (count <= 8)
return generic::Memset<4, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_4>>(dst, value, count);
if (count <= 16)
return generic::Memset<8, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_8>>(dst, value, count);
if (count <= 32)
return generic::Memset<16, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_16>>(dst, value, count);
if (count <= 64)
return generic::Memset<32, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_32>>(dst, value, count);
if (count <= 128)
return generic::Memset<64, kMaxSize>::head_tail(dst, value, count);
// Aligned loop
generic::Memset<32, kMaxSize>::block(dst, value);
align_to_next_boundary<32>(dst, count);
return generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count);
return splat_set<HeadTail<_64>>(dst, value, count);
return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
using namespace __llvm_libc::aarch64_memset;
if (count == 0)
return;
if (count <= 3) {
generic::Memset<1, kMaxSize>::block(dst, value);
splat_set<_1>(dst, value);
if (count > 1)
generic::Memset<2, kMaxSize>::tail(dst, value, count);
splat_set<Tail<_2>>(dst, value, count);
return;
}
if (count <= 8)
return generic::Memset<4, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_4>>(dst, value, count);
if (count <= 16)
return generic::Memset<8, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_8>>(dst, value, count);
if (count <= 32)
return generic::Memset<16, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_16>>(dst, value, count);
if (count <= (32 + 64)) {
generic::Memset<32, kMaxSize>::block(dst, value);
splat_set<_32>(dst, value);
if (count <= 64)
return generic::Memset<32, kMaxSize>::tail(dst, value, count);
generic::Memset<32, kMaxSize>::block(dst + 32, value);
generic::Memset<32, kMaxSize>::tail(dst, value, count);
return splat_set<Tail<_32>>(dst, value, count);
splat_set<Skip<32>::Then<_32>>(dst, value);
splat_set<Tail<_32>>(dst, value, count);
return;
}
if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
generic::Memset<64, kMaxSize>::block(dst, 0);
align_to_next_boundary<64>(dst, count);
return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
} else {
generic::Memset<16, kMaxSize>::block(dst, value);
align_to_next_boundary<16>(dst, count);
return generic::Memset<64, kMaxSize>::loop_and_tail(dst, value, count);
}
if (count >= 448 && value == 0 && hasZva())
return splat_set<Align<_64, Arg::_1>::Then<Loop<Zva64, _64>>>(dst, 0,
count);
else
return splat_set<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
static constexpr size_t kMaxSize = 8;
using namespace ::__llvm_libc::scalar;

if (count == 0)
return;
if (count == 1)
return generic::Memset<1, kMaxSize>::block(dst, value);
return splat_set<_1>(dst, value);
if (count == 2)
return generic::Memset<2, kMaxSize>::block(dst, value);
return splat_set<_2>(dst, value);
if (count == 3)
return generic::Memset<3, kMaxSize>::block(dst, value);
return splat_set<_3>(dst, value);
if (count <= 8)
return generic::Memset<4, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_4>>(dst, value, count);
if (count <= 16)
return generic::Memset<8, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_8>>(dst, value, count);
if (count <= 32)
return generic::Memset<16, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_16>>(dst, value, count);
if (count <= 64)
return generic::Memset<32, kMaxSize>::head_tail(dst, value, count);
return splat_set<HeadTail<_32>>(dst, value, count);
if (count <= 128)
return generic::Memset<64, kMaxSize>::head_tail(dst, value, count);
// Aligned loop
generic::Memset<32, kMaxSize>::block(dst, value);
align_to_next_boundary<32>(dst, count);
return generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count);
return splat_set<HeadTail<_64>>(dst, value, count);
return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
#endif
}

Expand Down
172 changes: 0 additions & 172 deletions libc/src/string/memory_utils/op_aarch64.h

This file was deleted.

146 changes: 0 additions & 146 deletions libc/src/string/memory_utils/op_builtin.h

This file was deleted.

461 changes: 0 additions & 461 deletions libc/src/string/memory_utils/op_generic.h

This file was deleted.

217 changes: 0 additions & 217 deletions libc/src/string/memory_utils/op_x86.h

This file was deleted.

139 changes: 51 additions & 88 deletions libc/src/string/memory_utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,19 @@
#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
#define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H

#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/architectures.h"

// Cache line sizes for ARM: These values are not strictly correct since
// cache line sizes depend on implementations, not architectures. There
// are even implementations with cache line sizes configurable at boot
// time.
#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86)
#define LLVM_LIBC_CACHELINE_SIZE 64
#elif defined(LLVM_LIBC_ARCH_ARM)
#define LLVM_LIBC_CACHELINE_SIZE 32
#else
#error "Unsupported platform for memory functions."
#endif

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
Expand Down Expand Up @@ -51,46 +62,32 @@ static constexpr size_t ge_power2(size_t value) {
return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
}

// Returns the number of bytes to subtract from ptr to get to the previous
// multiple of alignment. If ptr is already aligned, returns 0.
template <size_t alignment> uintptr_t distance_to_align_down(const void *ptr) {
template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned returns 0.
template <size_t alignment> uintptr_t distance_to_align_up(const void *ptr) {
template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
// The logic is not straightforward and involves unsigned modulo arithmetic
// but the generated code is as fast as it can be.
return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned returns alignment.
template <size_t alignment>
uintptr_t distance_to_next_aligned(const void *ptr) {
return alignment - distance_to_align_down<alignment>(ptr);
// Returns the offset from `ptr` to the next cache line.
static inline intptr_t offset_to_next_cache_line(const void *ptr) {
return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
}

// Hands `ptr` back unchanged while informing the compiler that the address is
// a multiple of `alignment` bytes.
template <size_t alignment, typename T> static T *assume_aligned(T *ptr) {
  // The builtin yields an untyped pointer; cast it back to the caller's type.
  auto *const hinted = __builtin_assume_aligned(ptr, alignment);
  return reinterpret_cast<T *>(hinted);
}

#if defined __has_builtin
#if __has_builtin(__builtin_memcpy_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
#endif

#if defined __has_builtin
#if __has_builtin(__builtin_memset_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
#endif
#endif

// Performs a constant count copy.
template <size_t Size>
static inline void memcpy_inline(void *__restrict dst,
Expand All @@ -106,56 +103,28 @@ static inline void memcpy_inline(void *__restrict dst,
using Ptr = char *; // Pointer to raw data.
using CPtr = const char *; // Const pointer to raw data.

// Integral wrapper that blocks accidental promotion between integral types:
// an instance can only be built from a value whose type is exactly T.
template <typename T> struct StrictIntegralType {
  static_assert(cpp::is_integral_v<T>);

  // The enable_if removes this constructor from overload resolution unless U
  // is exactly T.
  template <typename U, cpp::enable_if_t<cpp::is_same_v<U, T>, bool> = 0>
  StrictIntegralType(U init) : value(init) {}

  // Explicit conversion so the wrapper can serve as an `if` condition.
  explicit operator bool() const { return value; }

  // Bitwise OR; the static_assert restricts it to unsigned T (the bcmp case).
  StrictIntegralType operator|(const StrictIntegralType &Rhs) const {
    static_assert(!cpp::is_signed_v<T>);
    const auto merged = value | Rhs.value;
    return merged;
  }

  // For interaction with the C API, an explicit conversion back to `int` is
  // provided.
  explicit operator int() const {
    // bit_cast makes sure that T and int have the same size.
    return cpp::bit_cast<int>(value);
  }

  // Convenience factory for the zero value.
  static inline constexpr StrictIntegralType ZERO() { return {T(0)}; }

private:
  T value;
};

using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;

// Loads bytes from memory (possibly unaligned) and materializes them as
// type.
// Reads sizeof(T) bytes starting at `ptr` (which may be unaligned) and returns
// them materialized as a T.
template <typename T> static inline T load(CPtr ptr) {
  T value;
  // Copying through memcpy_inline avoids dereferencing `ptr` as a T directly.
  memcpy_inline<sizeof(T)>(&value, ptr);
  return value;
}

// Stores a value of type T in memory (possibly unaligned).
// Writes the sizeof(T) bytes of `value` to memory at `ptr` (possibly
// unaligned).
template <typename T> static inline void store(Ptr ptr, T value) {
  auto *const source = &value;
  memcpy_inline<sizeof(T)>(ptr, source);
}

// Advances the pointers p1 and p2 by offset bytes and decrease count by the
// same amount.
// Helper for memset-like operations that work on one pointer plus a byte
// count: moves the pointer by `offset` bytes and decreases `count` by the same
// amount.
static inline void adjust(ptrdiff_t offset, Ptr &ptr, size_t &count) {
  count -= offset;
  ptr += offset;
}

// For an operation like memcpy or memcmp that operates on two pointers and a
// count, advances the pointers by offset bytes and decrease count by the same
// amount.
template <typename T1, typename T2>
static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
T2 *__restrict &p2, size_t &count) {
Expand All @@ -164,37 +133,31 @@ static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
count -= offset;
}

// Advances p1 and p2 so p1 gets aligned to the next SIZE bytes boundary
// and decrease count by the same amount.
// For an operation like memset that operates on a pointer and a count, advances
// the pointer so it is aligned to SIZE bytes and decrease count by the same
// amount.
// We make sure the compiler knows about the adjusted pointer alignment.
template <size_t SIZE, typename T1, typename T2>
void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
                               size_t &count) {
  // Both pointers advance by the distance computed from p1, and count shrinks
  // by the same number of bytes.
  const auto step = distance_to_next_aligned<SIZE>(p1);
  adjust(step, p1, p2, count);
  // Let the compiler know about the alignment p1 now has.
  p1 = assume_aligned<SIZE>(p1);
}

// Same as align_p1_to_next_boundary above but with a single pointer instead.
template <size_t SIZE, typename T1>
void align_to_next_boundary(T1 *&p1, size_t &count) {
CPtr dummy;
align_p1_to_next_boundary<SIZE>(p1, dummy, count);
template <size_t SIZE> void align(Ptr &ptr, size_t &count) {
adjust(offset_to_next_aligned<SIZE>(ptr), ptr, count);
ptr = assume_aligned<SIZE>(ptr);
}

// An enum class that discriminates between the first and second pointer.
enum class Arg { P1, P2, Dst = P1, Src = P2 };

// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
// Precondition: &p1 != &p2
// For an operation like memcpy or memcmp that operates on two pointers and a
// count, advances the pointers so one of them gets aligned to SIZE bytes and
// decrease count by the same amount.
// We make sure the compiler knows about the adjusted pointer alignment.
enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
size_t &count) {
if constexpr (AlignOn == Arg::P1)
align_p1_to_next_boundary<SIZE>(p1, p2, count);
else if constexpr (AlignOn == Arg::P2)
align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2.
else
deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
void align(T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) {
if constexpr (AlignOn == Arg::_1) {
adjust(offset_to_next_aligned<SIZE>(p1), p1, p2, count);
p1 = assume_aligned<SIZE>(p1);
} else if constexpr (AlignOn == Arg::_2) {
adjust(offset_to_next_aligned<SIZE>(p2), p1, p2, count);
p2 = assume_aligned<SIZE>(p2);
} else {
deferred_static_assert("AlignOn must be either Arg::_1 or Arg::_2");
}
}

} // namespace __llvm_libc
Expand Down
4 changes: 2 additions & 2 deletions libc/src/string/memset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
namespace __llvm_libc {

LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
inline_memset(reinterpret_cast<char *>(dst), static_cast<uint8_t>(value),
count);
inline_memset(reinterpret_cast<char *>(dst),
static_cast<unsigned char>(value), count);
return dst;
}

Expand Down
12 changes: 6 additions & 6 deletions libc/test/src/string/bcmp_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,25 @@
TEST(LlvmLibcBcmpTest, CmpZeroByte) {
const char *lhs = "ab";
const char *rhs = "bc";
ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0);
EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0);
}

TEST(LlvmLibcBcmpTest, LhsRhsAreTheSame) {
const char *lhs = "ab";
const char *rhs = "ab";
ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0);
EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, LhsBeforeRhsLexically) {
const char *lhs = "ab";
const char *rhs = "ac";
ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, LhsAfterRhsLexically) {
const char *lhs = "ac";
const char *rhs = "ab";
ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, Sweep) {
Expand All @@ -46,13 +46,13 @@ TEST(LlvmLibcBcmpTest, Sweep) {
reset(lhs);
reset(rhs);
for (size_t i = 0; i < K_MAX_SIZE; ++i)
ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0);
EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0);

reset(lhs);
reset(rhs);
for (size_t i = 0; i < K_MAX_SIZE; ++i) {
rhs[i] = 'b';
ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0);
EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0);
rhs[i] = 'a';
}
}
14 changes: 7 additions & 7 deletions libc/test/src/string/memmove_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ TEST(LlvmLibcMemmoveTest, MoveZeroByte) {
void *const Dst = Buffer;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 0);
EXPECT_EQ(Ret, Dst);
ASSERT_MEM_EQ(Buffer, Expected);
EXPECT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) {
Expand All @@ -29,7 +29,7 @@ TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) {
void *const Dst = Buffer;
void *const Ret = __llvm_libc::memmove(Dst, Buffer, 1);
EXPECT_EQ(Ret, Dst);
ASSERT_MEM_EQ(Buffer, Expected);
EXPECT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) {
Expand All @@ -40,7 +40,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) {
void *const Dst = Buffer + 1;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 2);
EXPECT_EQ(Ret, Dst);
ASSERT_MEM_EQ(Buffer, Expected);
EXPECT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) {
Expand All @@ -49,7 +49,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) {
void *const Dst = Buffer + 2;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 2);
EXPECT_EQ(Ret, Dst);
ASSERT_MEM_EQ(Buffer, Expected);
EXPECT_MEM_EQ(Buffer, Expected);
}

// e.g. `Dst` follow `src`.
Expand All @@ -62,7 +62,7 @@ TEST(LlvmLibcMemmoveTest, SrcFollowDst) {
void *const Dst = Buffer + 1;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 1);
EXPECT_EQ(Ret, Dst);
ASSERT_MEM_EQ(Buffer, Expected);
EXPECT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstFollowSrc) {
Expand All @@ -71,7 +71,7 @@ TEST(LlvmLibcMemmoveTest, DstFollowSrc) {
void *const Dst = Buffer + 2;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 1);
EXPECT_EQ(Ret, Dst);
ASSERT_MEM_EQ(Buffer, Expected);
EXPECT_MEM_EQ(Buffer, Expected);
}

static constexpr int kMaxSize = 512;
Expand Down Expand Up @@ -106,7 +106,7 @@ TEST(LlvmLibcMemmoveTest, Thorough) {
void *const Ret =
__llvm_libc::memmove(Dst, Buffer.data() + SrcOffset, Size);
EXPECT_EQ(Ret, Dst);
ASSERT_MEM_EQ(Buffer, Expected);
EXPECT_MEM_EQ(Buffer, Expected);
}
}
}
2 changes: 2 additions & 0 deletions libc/test/src/string/memory_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ add_libc_unittest(
SUITE
libc_string_unittests
SRCS
elements_test.cpp
memory_access_test.cpp
utils_test.cpp
COMPILE_OPTIONS
${LIBC_COMPILE_OPTIONS_NATIVE}
Expand Down
137 changes: 137 additions & 0 deletions libc/test/src/string/memory_utils/elements_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
//===-- Unittests for memory_utils ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/span.h"
#include "src/string/memory_utils/elements.h"
#include "utils/UnitTest/Test.h"

namespace __llvm_libc {

// Registering Types
// List of fixed-size element types that the TYPED_TESTs in this file are
// instantiated with. The x86 vector entries are compiled in only when the
// corresponding instruction-set macro is defined; the scalar, Repeated,
// Chained and builtin entries are always present.
using FixedSizeTypes = testing::TypeList<
#if defined(__SSE2__)
x86::Vector128, //
#endif // __SSE2__
#if defined(__AVX2__)
x86::Vector256, //
#endif // __AVX2__
#if defined(__AVX512F__) and defined(__AVX512BW__)
x86::Vector512, //
#endif // defined(__AVX512F__) and defined(__AVX512BW__)
scalar::UINT8, //
scalar::UINT16, //
scalar::UINT32, //
scalar::UINT64, //
Repeated<scalar::UINT64, 2>, //
Repeated<scalar::UINT64, 4>, //
Repeated<scalar::UINT64, 8>, //
Repeated<scalar::UINT64, 16>, //
Repeated<scalar::UINT64, 32>, //
Chained<scalar::UINT16, scalar::UINT8>, //
Chained<scalar::UINT32, scalar::UINT16, scalar::UINT8>, //
builtin::_1, //
builtin::_2, //
builtin::_3, //
builtin::_4, //
builtin::_8 //
>;

// Returns the next byte of a deterministic pseudo-random sequence. The state
// is advanced as state = (multiplier * state + increment) % modulus and
// persists across calls; the result is the state narrowed to char.
char GetRandomChar() {
  static constexpr uint64_t kMultiplier = 1103515245;
  static constexpr uint64_t kIncrement = 12345;
  static constexpr uint64_t kModulus = 1ULL << 31;
  static uint64_t state = 123456789;
  state = (kMultiplier * state + kIncrement) % kModulus;
  return static_cast<char>(state);
}

// Overwrites every byte of `buffer` with successive GetRandomChar() values.
void Randomize(cpp::span<char> buffer) {
  for (char &byte : buffer) {
    byte = GetRandomChar();
  }
}

template <typename Element> using Buffer = cpp::array<char, Element::SIZE>;

// Builds a buffer of Element::SIZE bytes filled by Randomize().
template <typename Element> Buffer<Element> GetRandomBuffer() {
  Buffer<Element> randomized;
  Randomize(randomized);
  return randomized;
}

// Copies a randomized source buffer with the element under test and checks
// that every destination byte matches the source.
TYPED_TEST(LlvmLibcMemoryElements, copy, FixedSizeTypes) {
  Buffer<ParamType> destination;
  const auto source = GetRandomBuffer<ParamType>();
  copy<ParamType>(destination.data(), source.data());
  for (size_t index = 0; index < ParamType::SIZE; ++index) {
    EXPECT_EQ(destination[index], source[index]);
  }
}

// Returns an element-by-element duplicate of `Input`. T must be
// default-constructible and provide size() and operator[].
template <typename T> T copy(const T &Input) {
  T Duplicate;
  size_t Index = 0;
  while (Index < Input.size()) {
    Duplicate[Index] = Input[Index];
    ++Index;
  }
  return Duplicate;
}

// Exercises move<ParamType> on overlapping ranges inside a buffer twice the
// element size, comparing the result against an untouched reference copy.
TYPED_TEST(LlvmLibcMemoryElements, Move, FixedSizeTypes) {
  constexpr size_t SIZE = ParamType::SIZE;
  using LargeBuffer = cpp::array<char, SIZE * 2>;
  LargeBuffer Reference;
  Randomize(Reference);
  // Forward: the source is the first SIZE bytes and the destination starts
  // `Shift` bytes after it.
  for (size_t Shift = 0; Shift < SIZE; ++Shift) {
    LargeBuffer Scratch = copy(Reference);
    move<ParamType>(&Scratch[Shift], &Scratch[0]);
    for (size_t Index = 0; Index < SIZE; ++Index)
      EXPECT_EQ(Scratch[Index + Shift], Reference[Index]);
  }
  // Backward: the source is the last SIZE bytes and the destination starts at
  // `Shift`, i.e. before the source.
  for (size_t Shift = 0; Shift < SIZE; ++Shift) {
    LargeBuffer Scratch = copy(Reference);
    move<ParamType>(&Scratch[Shift], &Scratch[SIZE]);
    for (size_t Index = 0; Index < SIZE; ++Index)
      EXPECT_EQ(Scratch[Index + Shift], Reference[SIZE + Index]);
  }
}

// A buffer compared against itself must be reported as equal.
TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) {
  const auto buffer = GetRandomBuffer<ParamType>();
  const bool same = equals<ParamType>(buffer.data(), buffer.data());
  EXPECT_TRUE(same);
}

// Checks three_way_compare<ParamType> for equality and for a single-byte
// difference at every position of the element.
TYPED_TEST(LlvmLibcMemoryElements, three_way_compare, FixedSizeTypes) {
  Buffer<ParamType> baseline;
  for (char &byte : baseline)
    byte = 5;

  // A buffer compared with itself yields 0.
  EXPECT_EQ(three_way_compare<ParamType>(baseline.data(), baseline.data()), 0);

  // Every mismatching position, one at a time.
  for (size_t position = 0; position < ParamType::SIZE; ++position) {
    auto bumped = baseline;
    ++bumped[position]; // bumped is now lexicographically greater than baseline
    const char *smaller = baseline.data();
    const char *larger = bumped.data();
    EXPECT_LT(three_way_compare<ParamType>(smaller, larger), 0);
    EXPECT_GT(three_way_compare<ParamType>(larger, smaller), 0);
  }
}

// Fills a buffer with splat_set<ParamType> for three byte patterns and checks
// that every byte of the element equals the pattern.
TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) {
  Buffer<ParamType> destination;
  const cpp::array<char, 3> patterns = {char(0x00), char(0x7F), char(0xFF)};
  for (char pattern : patterns) {
    splat_set<ParamType>(destination.data(), pattern);
    for (size_t index = 0; index < ParamType::SIZE; ++index) {
      EXPECT_EQ(destination[index], pattern);
    }
  }
}

} // namespace __llvm_libc
228 changes: 228 additions & 0 deletions libc/test/src/string/memory_utils/memory_access_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
//===-- Unittests for memory_utils ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define LLVM_LIBC_UNITTEST_OBSERVE 1

#include "src/__support/CPP/array.h"
#include "src/string/memory_utils/elements.h"
#include "utils/UnitTest/Test.h"

#include <stdio.h>
#include <string.h>

namespace __llvm_libc {

static constexpr const size_t kMaxBuffer = 32;

// Per-byte access counters stored as a NUL-terminated string of digit
// characters, so that two instances can be compared with ASSERT_STREQ.
struct BufferAccess : cpp::array<char, kMaxBuffer + 1> {
  BufferAccess() { Reset(); }

  // Sets every counter to '0' and restores the trailing NUL.
  void Reset() {
    for (char &counter : *this)
      counter = '0';
    (*this)[kMaxBuffer] = '\0';
  }

  // Increments the `size` counters starting at `offset`. A negative offset
  // is ignored.
  void Touch(ptrdiff_t offset, size_t size) {
    if (offset >= 0) {
      for (size_t i = 0; i < size; ++i)
        ++(*this)[offset + i];
    }
  }

  // Exposes the counters as a C string.
  operator const char *() const { return this->data(); }
};

// A data buffer together with the read and write counters recorded for it.
struct Buffer {
  // Returns the index of `ptr` inside `data`, or -1 when it points elsewhere.
  ptrdiff_t Offset(const char *ptr) const {
    if (ptr < data.begin() || ptr >= data.end())
      return -1;
    return ptr - data.begin();
  }

  // Clears both counter maps.
  void Reset() {
    reads.Reset();
    writes.Reset();
  }

  cpp::array<char, kMaxBuffer> data;
  BufferAccess __attribute__((aligned(64))) reads;
  BufferAccess __attribute__((aligned(64))) writes;
};

// Records reads and writes against two buffers. Each observed access is
// offered to both buffers; a buffer that does not contain `ptr` reports -1
// from Offset(), which Touch() ignores.
struct MemoryAccessObserver {
  void ObserveRead(const char *ptr, size_t size) {
    const ptrdiff_t in_first = Buffer1.Offset(ptr);
    Buffer1.reads.Touch(in_first, size);
    const ptrdiff_t in_second = Buffer2.Offset(ptr);
    Buffer2.reads.Touch(in_second, size);
  }

  void ObserveWrite(const char *ptr, size_t size) {
    const ptrdiff_t in_first = Buffer1.Offset(ptr);
    Buffer1.writes.Touch(in_first, size);
    const ptrdiff_t in_second = Buffer2.Offset(ptr);
    Buffer2.writes.Touch(in_second, size);
  }

  // Clears the counters of both buffers.
  void Reset() {
    Buffer1.Reset();
    Buffer2.Reset();
  }

  Buffer Buffer1;
  Buffer Buffer2;
};

MemoryAccessObserver Observer;

// Fake element of `Size` bytes: instead of touching memory, each operation
// reports the accesses it would perform to the global Observer.
template <size_t Size> struct TestingElement {
  static constexpr size_t SIZE = Size;

  // Reports one read of the source and one write of the destination.
  static void copy(char *__restrict dst, const char *__restrict src) {
    Observer.ObserveRead(src, Size);
    Observer.ObserveWrite(dst, Size);
  }

  // Reports a read of both operands; always returns true.
  static bool equals(const char *lhs, const char *rhs) {
    Observer.ObserveRead(lhs, Size);
    Observer.ObserveRead(rhs, Size);
    return true;
  }

  // Reports a read of both operands; always returns 0.
  static int three_way_compare(const char *lhs, const char *rhs) {
    Observer.ObserveRead(lhs, Size);
    Observer.ObserveRead(rhs, Size);
    return 0;
  }

  // Reports a write of the destination; `value` is not used.
  static void splat_set(char *dst, const unsigned char value) {
    Observer.ObserveWrite(dst, Size);
  }
};

// Element types the access-pattern fixtures below are instantiated with:
// plain TestingElements plus Repeated and Chained compositions of them. The
// trailing comment on each entry gives its total size.
using Types = testing::TypeList<
TestingElement<1>, // 1 Byte
TestingElement<2>, // 2 Bytes
TestingElement<4>, // 4 Bytes
Repeated<TestingElement<2>, 3>, // 6 Bytes
Chained<TestingElement<4>, TestingElement<2>, TestingElement<1>> // 7 Bytes
>;

// Common base for the access-pattern fixtures. It runs a higher-order
// operation built on TestingElement and compares the read/write counters
// recorded by the global Observer against an expected map.
struct LlvmLibcTestAccessBase : public testing::Test {

// Runs copy, equals, three_way_compare and splat_set of HigherOrder on Size
// bytes starting Offset bytes into the buffers. The Observer is reset before
// each operation. After each one, the buffers the operation is supposed to
// access must match `expected`, and the other counter maps checked must still
// be untouched.
template <typename HigherOrder, size_t Size, size_t Offset = 0>
void checkOperations(const BufferAccess &expected) {
// A default-constructed BufferAccess has every counter at '0'.
static const BufferAccess untouched;

// copy: reads the source, writes the destination.
Observer.Reset();
HigherOrder::copy(dst_ptr() + Offset, src_ptr() + Offset, Size);
ASSERT_STREQ(src().writes, untouched);
ASSERT_STREQ(dst().reads, untouched);
ASSERT_STREQ(src().reads, expected);
ASSERT_STREQ(dst().writes, expected);
// equals: reads both operands, writes nothing.
Observer.Reset();
HigherOrder::equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size);
ASSERT_STREQ(lhs().writes, untouched);
ASSERT_STREQ(rhs().writes, untouched);
ASSERT_STREQ(lhs().reads, expected);
ASSERT_STREQ(rhs().reads, expected);
// three_way_compare: reads both operands, writes nothing.
Observer.Reset();
HigherOrder::three_way_compare(lhs_ptr() + Offset, rhs_ptr() + Offset,
Size);
ASSERT_STREQ(lhs().writes, untouched);
ASSERT_STREQ(rhs().writes, untouched);
ASSERT_STREQ(lhs().reads, expected);
ASSERT_STREQ(rhs().reads, expected);
// splat_set: writes the destination only.
Observer.Reset();
HigherOrder::splat_set(dst_ptr() + Offset, 5, Size);
ASSERT_STREQ(src().reads, untouched);
ASSERT_STREQ(src().writes, untouched);
ASSERT_STREQ(dst().reads, untouched);
ASSERT_STREQ(dst().writes, expected);
}

// Checks that each of the first kMaxBuffer counters in `expected` lies in
// the range [0, max].
void checkMaxAccess(const BufferAccess &expected, int max) {
for (size_t i = 0; i < kMaxBuffer; ++i) {
// Counters are stored as characters starting at '0'.
int value = (int)expected[i] - '0';
ASSERT_GE(value, 0);
ASSERT_LE(value, max);
}
}

private:
// lhs and dst both refer to Observer.Buffer1; rhs and src both refer to
// Observer.Buffer2.
const Buffer &lhs() const { return Observer.Buffer1; }
const Buffer &rhs() const { return Observer.Buffer2; }
const Buffer &src() const { return Observer.Buffer2; }
const Buffer &dst() const { return Observer.Buffer1; }
Buffer &dst() { return Observer.Buffer1; }

// Pointers to the first data byte of the corresponding buffer.
char *dst_ptr() { return dst().data.begin(); }
const char *src_ptr() { return src().data.begin(); }
const char *lhs_ptr() { return lhs().data.begin(); }
const char *rhs_ptr() { return rhs().data.begin(); }
};

// Fixture for Tail<ParamType>. All verification happens in TearDown.
template <typename ParamType>
struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase {

  void TearDown() override {
    static constexpr size_t Size = 10;
    constexpr size_t ElementSize = ParamType::SIZE;

    // Expected map: only the last ElementSize bytes of the Size-byte range
    // are accessed.
    BufferAccess expected;
    expected.Touch(Size - ElementSize, ElementSize);

    checkMaxAccess(expected, 1);
    checkOperations<Tail<ParamType>, Size>(expected);
  }
};
TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {}

// Fixture for HeadTail<ParamType>. All verification happens in TearDown.
template <typename ParamType>
struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase {
  void TearDown() override {
    static constexpr size_t Size = 10;
    constexpr size_t ElementSize = ParamType::SIZE;

    // Expected map: one access at the start of the range and one ending at
    // its last byte. Where the two overlap a byte is counted twice.
    BufferAccess expected;
    expected.Touch(0, ElementSize);
    expected.Touch(Size - ElementSize, ElementSize);

    checkMaxAccess(expected, 2);
    checkOperations<HeadTail<ParamType>, Size>(expected);
  }
};
TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {}

// Fixture for Loop<ParamType>. All verification happens in TearDown.
template <typename ParamType>
struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase {
  void TearDown() override {
    static constexpr size_t Size = 20;
    constexpr size_t ElementSize = ParamType::SIZE;
    constexpr size_t LastOffset = Size - ElementSize;

    // Expected map: back-to-back accesses from the start while the offset
    // stays below LastOffset, then one final access ending at the last byte.
    BufferAccess expected;
    for (size_t offset = 0; offset < LastOffset; offset += ElementSize)
      expected.Touch(offset, ElementSize);
    expected.Touch(LastOffset, ElementSize);

    checkMaxAccess(expected, 2);
    checkOperations<Loop<ParamType>, Size>(expected);
  }
};
TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {}

// Fixture for Align<...>::Then<HeadTail<ParamType>>, aligning on either
// argument. All verification happens in TearDown.
template <typename ParamType>
struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase {
  void TearDown() override {
    static constexpr size_t Size = 10;
    static constexpr size_t Offset = 2;
    using AlignmentT = TestingElement<4>;
    constexpr size_t ElementSize = ParamType::SIZE;

    // Expected map: one AlignmentT-sized access at the start offset, then a
    // head access at index AlignmentT::SIZE and a tail access ending at the
    // last byte of the range.
    BufferAccess expected;
    expected.Touch(Offset, AlignmentT::SIZE);
    expected.Touch(AlignmentT::SIZE, ElementSize);
    expected.Touch(Offset + Size - ElementSize, ElementSize);

    checkMaxAccess(expected, 3);
    checkOperations<Align<AlignmentT, Arg::_1>::Then<HeadTail<ParamType>>, Size,
                    Offset>(expected);
    checkOperations<Align<AlignmentT, Arg::_2>::Then<HeadTail<ParamType>>, Size,
                    Offset>(expected);
  }
};
TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {}

} // namespace __llvm_libc
79 changes: 53 additions & 26 deletions libc/test/src/string/memory_utils/utils_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,41 +72,55 @@ TEST(LlvmLibcUtilsTest, GEPowerOf2) {
EXPECT_EQ(ge_power2(i), kExpectedValues[i]);
}

using UINT = uintptr_t;
using I = intptr_t;

// Turns an integer offset into a pointer holding that address value.
const void *forge(size_t offset) {
  const void *const forged = reinterpret_cast<const void *>(offset);
  return forged;
}

TEST(LlvmLibcUtilsTest, DistanceToNextAligned) {
EXPECT_EQ(distance_to_next_aligned<16>(forge(0)), UINT(16));
EXPECT_EQ(distance_to_next_aligned<16>(forge(1)), UINT(15));
EXPECT_EQ(distance_to_next_aligned<16>(forge(16)), UINT(16));
EXPECT_EQ(distance_to_next_aligned<16>(forge(15)), UINT(1));
EXPECT_EQ(distance_to_next_aligned<32>(forge(16)), UINT(16));
TEST(LlvmLibcUtilsTest, OffsetToNextAligned) {
EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(0));
EXPECT_EQ(offset_to_next_aligned<16>(forge(1)), I(15));
EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(0));
EXPECT_EQ(offset_to_next_aligned<16>(forge(15)), I(1));
EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
}

TEST(LlvmLibcUtilsTest, DistanceToAlignUp) {
EXPECT_EQ(distance_to_align_up<16>(forge(0)), UINT(0));
EXPECT_EQ(distance_to_align_up<16>(forge(1)), UINT(15));
EXPECT_EQ(distance_to_align_up<16>(forge(16)), UINT(0));
EXPECT_EQ(distance_to_align_up<16>(forge(15)), UINT(1));
EXPECT_EQ(distance_to_align_up<32>(forge(16)), UINT(16));
TEST(LlvmLibcUtilsTest, OffsetFromLastAligned) {
EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
}

TEST(LlvmLibcUtilsTest, DistanceToAlignDown) {
EXPECT_EQ(distance_to_align_down<16>(forge(0)), UINT(0));
EXPECT_EQ(distance_to_align_down<16>(forge(1)), UINT(1));
EXPECT_EQ(distance_to_align_down<16>(forge(16)), UINT(0));
EXPECT_EQ(distance_to_align_down<16>(forge(15)), UINT(15));
EXPECT_EQ(distance_to_align_down<32>(forge(16)), UINT(16));
TEST(LlvmLibcUtilsTest, OffsetToNextCacheLine) {
EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));
EXPECT_EQ(offset_to_next_cache_line(forge(1)),
I(LLVM_LIBC_CACHELINE_SIZE - 1));
EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE)), I(0));
EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE - 1)),
I(1));
}

// Checks the single-pointer adjust() for offsets -2, -1, 0 and 1: the pointer
// must move by `offset` bytes and the count must shrink by the same amount.
TEST(LlvmLibcUtilsTest, Adjust1) {
  // Start from the middle of an array so that stepping two bytes backwards or
  // one byte forwards stays inside the same object.
  char buffer[5] = {};
  char *const base = buffer + 2;
  const size_t base_size = 10;
  // The offset must be signed: with an unsigned loop variable initialized to
  // -2 the condition `< 2` is false on entry and the body never runs.
  for (ptrdiff_t offset = -2; offset < 2; ++offset) {
    auto *ptr = base;
    size_t size = base_size;
    adjust(offset, ptr, size);
    EXPECT_EQ(intptr_t(ptr), intptr_t(base + offset));
    EXPECT_EQ(size, base_size - offset);
  }
}

TEST(LlvmLibcUtilsTest, Adjust2) {
char a, b;
const size_t base_size = 10;
for (ptrdiff_t I = -2; I < 2; ++I) {
for (size_t I = -2; I < 2; ++I) {
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
Expand All @@ -117,28 +131,41 @@ TEST(LlvmLibcUtilsTest, Adjust2) {
}
}

// Checks the single-pointer align(): afterwards the pointer is a multiple of
// 128, has not moved backwards, and the count decreased by exactly the
// distance the pointer moved.
TEST(LlvmLibcUtilsTest, Align1) {
  char a;
  const size_t base_size = 10;
  auto *ptr = &a;
  size_t size = base_size;
  align<128>(ptr, size);
  EXPECT_TRUE(uintptr_t(ptr) % 128 == 0);
  EXPECT_GE(ptr, &a);
  EXPECT_EQ(size_t(ptr - &a), base_size - size);
}

TEST(LlvmLibcUtilsTest, Align2) {
char a, b;
const size_t base_size = 10;
{
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
align_to_next_boundary<128, Arg::P1>(p1, p2, size);
align<128, Arg::_1>(p1, p2, size);
EXPECT_TRUE(uintptr_t(p1) % 128 == 0);
EXPECT_GT(p1, &a);
EXPECT_GT(p2, &b);
EXPECT_GE(p1, &a);
EXPECT_GE(p2, &b);
EXPECT_EQ(size_t(p1 - &a), base_size - size);
EXPECT_EQ(size_t(p2 - &b), base_size - size);
}
{
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
align_to_next_boundary<128, Arg::P2>(p1, p2, size);
align<128, Arg::_2>(p1, p2, size);
EXPECT_TRUE(uintptr_t(p2) % 128 == 0);
EXPECT_GT(p1, &a);
EXPECT_GT(p2, &b);
EXPECT_GE(p1, &a);
EXPECT_GE(p2, &b);
EXPECT_EQ(size_t(p1 - &a), base_size - size);
EXPECT_EQ(size_t(p2 - &b), base_size - size);
}
Expand Down
9 changes: 3 additions & 6 deletions utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -973,10 +973,9 @@ no_sanitize_features = [
cc_library(
name = "string_memory_utils",
hdrs = [
"src/string/memory_utils/op_aarch64.h",
"src/string/memory_utils/op_builtin.h",
"src/string/memory_utils/op_generic.h",
"src/string/memory_utils/op_x86.h",
"src/string/memory_utils/elements.h",
"src/string/memory_utils/elements_aarch64.h",
"src/string/memory_utils/elements_x86.h",
"src/string/memory_utils/utils.h",
],
textual_hdrs = [
Expand All @@ -989,8 +988,6 @@ cc_library(
deps = [
":__support_common",
":__support_cpp_bit",
":__support_cpp_type_traits",
":__support_cpp_array",
":libc_root",
],
)
Expand Down