463 changes: 463 additions & 0 deletions libc/src/string/memory_utils/algorithm.h
@@ -0,0 +1,463 @@
//===-- Algorithms to compose sized memory operations ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Higher-order primitives that build upon the SizedOpT facility.
// They constitute the basic blocks for composing memory functions.
// This file defines the following operations:
// - Skip
// - Tail
// - HeadTail
// - Loop
// - Align
//
// See each class for documentation.
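//
// Illustrative sketch, assuming a Backend type as defined in backends.h and
// the SizedOp facility from sized_op.h; a memcpy for sizes >= 16 could be
// composed as:
//   using _16 = SizedOp<Backend, 16>;
//   if (size <= 32)
//     return HeadTail<_16>::copy(dst, src, size);
//   return Align<_16, Arg::Dst>::Then<Loop<_16>>::copy(dst, src, size);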
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H

#include "src/string/memory_utils/address.h" // Address
#include "src/string/memory_utils/utils.h" // offset_to_next_aligned

#include <stddef.h> // ptrdiff_t

namespace __llvm_libc {

// We are not yet allowed to use asserts in low-level memory operations, as
// assert itself could depend on them.
// We define this empty macro so we can enable asserts as soon as possible
// while already keeping track of invariants at call sites.
#define LIBC_ASSERT(COND)

// An operation that skips the specified number of bytes before applying the
// next operation.
template <ptrdiff_t Bytes> struct Skip {
template <typename NextT> struct Then {
template <typename DstAddrT>
static inline void set(DstAddrT dst, ubyte value) {
static_assert(NextT::IS_FIXED_SIZE);
NextT::set(offsetAddr<Bytes>(dst), value);
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2) {
static_assert(NextT::IS_FIXED_SIZE);
return NextT::isDifferent(offsetAddr<Bytes>(src1),
offsetAddr<Bytes>(src2));
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2) {
static_assert(NextT::IS_FIXED_SIZE);
return NextT::threeWayCmp(offsetAddr<Bytes>(src1),
offsetAddr<Bytes>(src2));
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
static_assert(NextT::IS_RUNTIME_SIZE);
return NextT::threeWayCmp(offsetAddr<Bytes>(src1),
offsetAddr<Bytes>(src2), runtime_size - Bytes);
}
};
};
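
// Illustrative sketch: with a fixed-size operation such as
//   using _1 = SizedOp<Backend, 1>;
// the following sets the single byte at offset 11 of dst:
//   Skip<11>::Then<_1>::set(dst, value);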

// Computes the address for a tail operation.
// Because the size is a runtime value, we lose the alignment information.
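// e.g. with Size = 8 and runtime_size = 10, the returned address is at
// offset 2, so that the 8-byte operation ends exactly at offset 10.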
template <size_t Size, typename AddrT>
static auto tailAddr(AddrT addr, size_t runtime_size) {
static_assert(IsAddressType<AddrT>::Value);
return offsetAddrAssumeAligned<1>(addr, runtime_size - Size);
}

// Perform the operation on the last 'Size' bytes of the buffer.
//
// e.g. with
// [1234567812345678123]
// [__XXXXXXXXXXXXXX___]
// [________XXXXXXXX___]
//
// Precondition: `runtime_size >= Size`.
template <typename SizedOpT> struct Tail {
static_assert(SizedOpT::IS_FIXED_SIZE);
static constexpr bool IS_RUNTIME_SIZE = true;
static constexpr size_t SIZE = SizedOpT::SIZE;

template <typename DstAddrT, typename SrcAddrT>
static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
SizedOpT::copy(tailAddr<SIZE>(dst, runtime_size),
tailAddr<SIZE>(src, runtime_size));
}

template <typename DstAddrT, typename SrcAddrT>
static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
SizedOpT::move(tailAddr<SIZE>(dst, runtime_size),
tailAddr<SIZE>(src, runtime_size));
}

template <typename DstAddrT>
static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) {
SizedOpT::set(tailAddr<SIZE>(dst, runtime_size), value);
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
return SizedOpT::isDifferent(tailAddr<SIZE>(src1, runtime_size),
tailAddr<SIZE>(src2, runtime_size));
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
return SizedOpT::threeWayCmp(tailAddr<SIZE>(src1, runtime_size),
tailAddr<SIZE>(src2, runtime_size));
}
};
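
// Illustrative sketch: with _8 an 8-byte SizedOp as in the tests,
// Tail<_8>::copy(dst, src, 16) copies the 8 bytes at offsets [8, 16) and
// leaves offsets [0, 8) untouched.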

// Perform the operation on the first and the last SizedOpT::SIZE bytes of
// the buffer. This is useful for overlapping operations.
//
// e.g. with
// [1234567812345678123]
// [__XXXXXXXXXXXXXX___]
// [__XXXXXXXX_________]
// [________XXXXXXXX___]
//
// Precondition: `runtime_size >= Size && runtime_size <= 2 x Size`.
template <typename SizedOpT> struct HeadTail {
static_assert(SizedOpT::IS_FIXED_SIZE);
static constexpr bool IS_RUNTIME_SIZE = true;

template <typename DstAddrT, typename SrcAddrT>
static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
LIBC_ASSERT(runtime_size >= SizedOpT::SIZE);
SizedOpT::copy(dst, src);
Tail<SizedOpT>::copy(dst, src, runtime_size);
}

template <typename DstAddrT, typename SrcAddrT>
static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
LIBC_ASSERT(runtime_size >= SizedOpT::SIZE);
static constexpr size_t BLOCK_SIZE = SizedOpT::SIZE;
// The load and store operations can be performed in any order as long as
// they are not interleaved. More investigations are needed to determine the
// best order.
auto head = SizedOpT::load(src);
auto tail = SizedOpT::load(tailAddr<BLOCK_SIZE>(src, runtime_size));
SizedOpT::store(tailAddr<BLOCK_SIZE>(dst, runtime_size), tail);
SizedOpT::store(dst, head);
}

template <typename DstAddrT>
static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) {
LIBC_ASSERT(runtime_size >= SizedOpT::SIZE);
SizedOpT::set(dst, value);
Tail<SizedOpT>::set(dst, value, runtime_size);
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
LIBC_ASSERT(runtime_size >= SizedOpT::SIZE);
// Two strategies can be applied here:
// 1. Compute head and tail and compose them with a bitwise or operation.
// 2. Stop early if head is different.
// We chose the latter because HeadTail operations are typically performed
// with sizes ranging from 4 to 256 bytes. The cost of the loads is then
// significantly larger than the cost of the branch.
if (const uint64_t res = SizedOpT::isDifferent(src1, src2))
return res;
return Tail<SizedOpT>::isDifferent(src1, src2, runtime_size);
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
LIBC_ASSERT(runtime_size >= SizedOpT::SIZE &&
runtime_size <= 2 * SizedOpT::SIZE);
if (const int32_t res = SizedOpT::threeWayCmp(src1, src2))
return res;
return Tail<SizedOpT>::threeWayCmp(src1, src2, runtime_size);
}
};
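
// Illustrative sketch: HeadTail<_8>::copy(dst, src, 12) performs an 8-byte
// copy at offset 0 and another at offset 4; the two operations overlap on
// offsets [4, 8) but together cover the whole 12-byte buffer.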

// Simple loop ending with a Tail operation.
//
// e.g. with
// [12345678123456781234567812345678]
// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
// [__XXXXXXXX_______________________]
// [__________XXXXXXXX_______________]
// [__________________XXXXXXXX_______]
// [______________________XXXXXXXX___]
//
// Precondition:
// - runtime_size >= Size
template <typename SizedOpT> struct Loop {
static_assert(SizedOpT::IS_FIXED_SIZE);
static constexpr bool IS_RUNTIME_SIZE = true;
static constexpr size_t BLOCK_SIZE = SizedOpT::SIZE;

template <typename DstAddrT, typename SrcAddrT>
static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
size_t offset = 0;
do {
SizedOpT::copy(offsetAddrMultiplesOf<BLOCK_SIZE>(dst, offset),
offsetAddrMultiplesOf<BLOCK_SIZE>(src, offset));
offset += BLOCK_SIZE;
} while (offset < runtime_size - BLOCK_SIZE);
Tail<SizedOpT>::copy(dst, src, runtime_size);
}

// Move forward; suitable when dst < src. We load the tail bytes before
// running the loop so the loop's stores cannot overwrite them.
//
// e.g. Moving two bytes
// [ | | | | |]
// [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
// [_________________________LLLLLLLL___]
// [___LLLLLLLL_________________________]
// [_SSSSSSSS___________________________]
// [___________LLLLLLLL_________________]
// [_________SSSSSSSS___________________]
// [___________________LLLLLLLL_________]
// [_________________SSSSSSSS___________]
// [_______________________SSSSSSSS_____]
template <typename DstAddrT, typename SrcAddrT>
static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
const auto tail_value =
SizedOpT::load(tailAddr<BLOCK_SIZE>(src, runtime_size));
size_t offset = 0;
do {
SizedOpT::move(offsetAddrMultiplesOf<BLOCK_SIZE>(dst, offset),
offsetAddrMultiplesOf<BLOCK_SIZE>(src, offset));
offset += BLOCK_SIZE;
} while (offset < runtime_size - BLOCK_SIZE);
SizedOpT::store(tailAddr<BLOCK_SIZE>(dst, runtime_size), tail_value);
}

// Move backward; suitable when dst > src. We load the head bytes before
// running the loop so the loop's stores cannot overwrite them.
//
// e.g. Moving two bytes
// [ | | | | |]
// [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
// [___LLLLLLLL_________________________]
// [_________________________LLLLLLLL___]
// [___________________________SSSSSSSS_]
// [_________________LLLLLLLL___________]
// [___________________SSSSSSSS_________]
// [_________LLLLLLLL___________________]
// [___________SSSSSSSS_________________]
// [_____SSSSSSSS_______________________]
template <typename DstAddrT, typename SrcAddrT>
static inline void move_backward(DstAddrT dst, SrcAddrT src,
size_t runtime_size) {
const auto head_value = SizedOpT::load(src);
ptrdiff_t offset = runtime_size - BLOCK_SIZE;
do {
SizedOpT::move(offsetAddrMultiplesOf<BLOCK_SIZE>(dst, offset),
offsetAddrMultiplesOf<BLOCK_SIZE>(src, offset));
offset -= BLOCK_SIZE;
} while (offset >= 0);
SizedOpT::store(dst, head_value);
}

template <typename DstAddrT>
static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) {
size_t offset = 0;
do {
SizedOpT::set(offsetAddrMultiplesOf<BLOCK_SIZE>(dst, offset), value);
offset += BLOCK_SIZE;
} while (offset < runtime_size - BLOCK_SIZE);
Tail<SizedOpT>::set(dst, value, runtime_size);
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
size_t offset = 0;
do {
if (uint64_t res = SizedOpT::isDifferent(
offsetAddrMultiplesOf<BLOCK_SIZE>(src1, offset),
offsetAddrMultiplesOf<BLOCK_SIZE>(src2, offset)))
return res;
offset += BLOCK_SIZE;
} while (offset < runtime_size - BLOCK_SIZE);
return Tail<SizedOpT>::isDifferent(src1, src2, runtime_size);
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
size_t offset = 0;
do {
if (int32_t res = SizedOpT::threeWayCmp(
offsetAddrMultiplesOf<BLOCK_SIZE>(src1, offset),
offsetAddrMultiplesOf<BLOCK_SIZE>(src2, offset)))
return res;
offset += BLOCK_SIZE;
} while (offset < runtime_size - BLOCK_SIZE);
return Tail<SizedOpT>::threeWayCmp(src1, src2, runtime_size);
}
};
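
// Illustrative sketch: Loop<_8>::copy(dst, src, 17) runs two 8-byte
// iterations at offsets 0 and 8, then a Tail operation at offset 9 covering
// the remaining bytes [9, 17).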

// Aligns using a statically-sized operation, then calls the subsequent NextT
// operation.
//
// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as:
// Align<_16, Arg::Dst>::Then<Loop<_32>>::copy(dst, src, runtime_size);
enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
template <typename SizedOpT, Arg AlignOn = Arg::_1> struct Align {
static_assert(SizedOpT::IS_FIXED_SIZE);

template <typename NextT> struct Then {
static_assert(NextT::IS_RUNTIME_SIZE);

template <typename DstAddrT, typename SrcAddrT>
static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
SizedOpT::copy(dst, src);
auto aligned = align(dst, src, runtime_size);
NextT::copy(aligned.arg1, aligned.arg2, aligned.size);
}

// Move forward; suitable when dst < src. The alignment is performed with
// a HeadTail operation of size ∈ [Alignment, 2 x Alignment].
//
// e.g. Moving two bytes and making sure src is then aligned.
// [ | | | | ]
// [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
// [____LLLLLLLL_____________________]
// [___________LLLLLLLL______________]
// [_SSSSSSSS________________________]
// [________SSSSSSSS_________________]
//
// e.g. Moving two bytes and making sure dst is then aligned.
// [ | | | | ]
// [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
// [____LLLLLLLL_____________________]
// [______LLLLLLLL___________________]
// [_SSSSSSSS________________________]
// [___SSSSSSSS______________________]
template <typename DstAddrT, typename SrcAddrT>
static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
auto aligned_after_begin = align(dst, src, runtime_size);
// We move pointers forward by Size so we can perform HeadTail.
auto aligned = aligned_after_begin.stepForward();
HeadTail<SizedOpT>::move(dst, src, runtime_size - aligned.size);
NextT::move(aligned.arg1, aligned.arg2, aligned.size);
}

// Move backward; suitable when dst > src. The alignment is performed with
// a HeadTail operation of size ∈ [Alignment, 2 x Alignment].
//
// e.g. Moving two bytes backward and making sure src is then aligned.
// [ | | | | ]
// [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
// [ _________________LLLLLLLL_______]
// [ ___________________LLLLLLLL_____]
// [____________________SSSSSSSS_____]
// [______________________SSSSSSSS___]
//
// e.g. Moving two bytes and making sure dst is then aligned.
// [ | | | | ]
// [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
// [ _______________LLLLLLLL_________]
// [ ___________________LLLLLLLL_____]
// [__________________SSSSSSSS_______]
// [______________________SSSSSSSS___]
template <typename DstAddrT, typename SrcAddrT>
static inline void move_backward(DstAddrT dst, SrcAddrT src,
size_t runtime_size) {
const auto dst_end = offsetAddrAssumeAligned<1>(dst, runtime_size);
const auto src_end = offsetAddrAssumeAligned<1>(src, runtime_size);
auto aligned_after_end = align(dst_end, src_end, 0);
// We move pointers back by 2 x Size so we can perform HeadTail.
auto aligned = aligned_after_end.stepBack().stepBack();
HeadTail<SizedOpT>::move(aligned.arg1, aligned.arg2, aligned.size);
NextT::move_backward(dst, src, runtime_size - aligned.size);
}

template <typename DstAddrT>
static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) {
SizedOpT::set(dst, value);
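// set has a single address argument; pass a dummy address so the
// two-argument align() helper below can be reused.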
DstAddrT _(nullptr);
auto aligned = align(dst, _, runtime_size);
NextT::set(aligned.arg1, value, aligned.size);
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
if (const uint64_t res = SizedOpT::isDifferent(src1, src2))
return res;
auto aligned = align(src1, src2, runtime_size);
return NextT::isDifferent(aligned.arg1, aligned.arg2, aligned.size);
}

template <typename SrcAddrT1, typename SrcAddrT2>
static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
size_t runtime_size) {
if (const int32_t res = SizedOpT::threeWayCmp(src1, src2))
return res;
auto aligned = align(src1, src2, runtime_size);
return NextT::threeWayCmp(aligned.arg1, aligned.arg2, aligned.size);
}
};

private:
static constexpr size_t ALIGN_OP_SIZE = SizedOpT::SIZE;
static_assert(ALIGN_OP_SIZE > 1);

template <typename Arg1AddrT, typename Arg2AddrT> struct Aligned {
Arg1AddrT arg1;
Arg2AddrT arg2;
size_t size;

Aligned stepForward() const {
return Aligned{offsetAddrMultiplesOf<ALIGN_OP_SIZE>(arg1, ALIGN_OP_SIZE),
offsetAddrMultiplesOf<ALIGN_OP_SIZE>(arg2, ALIGN_OP_SIZE),
size - ALIGN_OP_SIZE};
}

Aligned stepBack() const {
return Aligned{offsetAddrMultiplesOf<ALIGN_OP_SIZE>(arg1, -ALIGN_OP_SIZE),
offsetAddrMultiplesOf<ALIGN_OP_SIZE>(arg2, -ALIGN_OP_SIZE),
size + ALIGN_OP_SIZE};
}
};
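
// e.g. with ALIGN_OP_SIZE = 8, stepForward() advances both addresses by 8
// bytes and shrinks the remaining size by 8; stepBack() is the exact inverse.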

template <typename Arg1AddrT, typename Arg2AddrT>
static auto makeAligned(Arg1AddrT arg1, Arg2AddrT arg2, size_t size) {
return Aligned<Arg1AddrT, Arg2AddrT>{arg1, arg2, size};
}

template <typename Arg1AddrT, typename Arg2AddrT>
static auto align(Arg1AddrT arg1, Arg2AddrT arg2, size_t runtime_size) {
static_assert(IsAddressType<Arg1AddrT>::Value);
static_assert(IsAddressType<Arg2AddrT>::Value);
if constexpr (AlignOn == Arg::_1) {
auto offset = offset_to_next_aligned<ALIGN_OP_SIZE>(arg1.ptr_);
return makeAligned(offsetAddrAssumeAligned<ALIGN_OP_SIZE>(arg1, offset),
offsetAddrAssumeAligned<1>(arg2, offset),
runtime_size - offset);
} else if constexpr (AlignOn == Arg::_2) {
auto offset = offset_to_next_aligned<ALIGN_OP_SIZE>(arg2.ptr_);
return makeAligned(offsetAddrAssumeAligned<1>(arg1, offset),
offsetAddrAssumeAligned<ALIGN_OP_SIZE>(arg2, offset),
runtime_size - offset);
} else {
DeferredStaticAssert("AlignOn must be either Arg::_1 or Arg::_2");
}
}
};
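
// Illustrative sketch: Align<_8, Arg::Dst>::Then<Loop<_8>>::copy(dst, src,
// size) with dst at offset 2 of an 8-byte aligned buffer first copies 8
// bytes at dst, then runs the Loop from dst + 6 (the next 8-byte aligned
// address) with size - 6 bytes remaining.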

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H
3 changes: 3 additions & 0 deletions libc/src/string/memory_utils/sized_op.h
@@ -33,6 +33,9 @@ namespace __llvm_libc {

template <typename Backend, size_t Size> struct SizedOp {
static constexpr size_t SIZE = Size;
// Define instantiations of SizedOp as a fixed size operation.
// i.e. an operation that is composable by types in algorithm.h
static constexpr bool IS_FIXED_SIZE = true;

private:
static_assert(Backend::IS_BACKEND_TYPE);
1 change: 1 addition & 0 deletions libc/test/src/string/memory_utils/CMakeLists.txt
@@ -4,6 +4,7 @@ add_libc_unittest(
libc_string_unittests
SRCS
address_test.cpp
algorithm_test.cpp
backend_test.cpp
elements_test.cpp
memory_access_test.cpp
565 changes: 565 additions & 0 deletions libc/test/src/string/memory_utils/algorithm_test.cpp
@@ -0,0 +1,565 @@

#define LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE 0

#include "utils/UnitTest/Test.h"
#include <src/__support/CPP/Array.h>
#include <src/string/memory_utils/algorithm.h>
#include <src/string/memory_utils/backends.h>

#include <cctype>
#include <sstream>
#include <string>

namespace __llvm_libc {

struct alignas(64) Buffer : cpp::Array<char, 128> {
bool contains(const char *ptr) const {
return ptr >= data() && ptr < (data() + size());
}
size_t getOffset(const char *ptr) const { return ptr - data(); }
void fill(char c) {
for (auto itr = begin(); itr != end(); ++itr)
*itr = c;
}
};

static Buffer buffer1;
static Buffer buffer2;
static std::ostringstream LOG;

struct TestBackend {
static constexpr bool IS_BACKEND_TYPE = true;

template <typename T> static void log(const char *Action, const char *ptr) {
LOG << Action << "<" << sizeof(T) << "> ";
if (buffer1.contains(ptr))
LOG << "a[" << buffer1.getOffset(ptr) << "]";
else if (buffer2.contains(ptr))
LOG << "b[" << buffer2.getOffset(ptr) << "]";
LOG << "\n";
}

template <typename T, Temporality TS, Aligned AS>
static T load(const T *src) {
log<T>((AS == Aligned::YES ? "LdA" : "LdU"),
reinterpret_cast<const char *>(src));
return Scalar64BitBackend::load<T, TS, AS>(src);
}

template <typename T, Temporality TS, Aligned AS>
static void store(T *dst, T value) {
log<T>((AS == Aligned::YES ? "StA" : "StU"),
reinterpret_cast<const char *>(dst));
Scalar64BitBackend::store<T, TS, AS>(dst, value);
}

template <typename T> static inline T splat(ubyte value) {
LOG << "Splat<" << sizeof(T) << "> " << (unsigned)value << '\n';
return Scalar64BitBackend::splat<T>(value);
}

template <typename T> static inline uint64_t notEquals(T v1, T v2) {
LOG << "Neq<" << sizeof(T) << ">\n";
return Scalar64BitBackend::notEquals<T>(v1, v2);
}

template <typename T> static inline int32_t threeWayCmp(T v1, T v2) {
LOG << "Diff<" << sizeof(T) << ">\n";
return Scalar64BitBackend::threeWayCmp<T>(v1, v2);
}

template <size_t Size>
using getNextType = Scalar64BitBackend::getNextType<Size>;
};
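
// Every load/store/splat/compare performed through TestBackend is appended to
// LOG, so a test can run an algorithm and compare the resulting trace against
// the expected sequence of memory operations.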

struct LlvmLibcAlgorithm : public testing::Test {
void SetUp() override {
LOG = std::ostringstream();
LOG << '\n';
}

void fillEqual() {
buffer1.fill('a');
buffer2.fill('a');
}

void fillDifferent() {
buffer1.fill('a');
buffer2.fill('b');
}

const char *getTrace() {
trace_ = LOG.str();
return trace_.c_str();
}

const char *stripComments(const char *expected) {
expected_.clear();
std::stringstream ss(expected);
std::string line;
while (std::getline(ss, line, '\n')) {
const auto pos = line.find('#');
if (pos == std::string::npos) {
expected_ += line;
} else {
auto log = line.substr(0, pos);
while (!log.empty() && std::isspace(log.back()))
log.pop_back();
expected_ += log;
}
expected_ += '\n';
}
return expected_.c_str();
}
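
// Illustrative example: stripComments turns
//   "StU<8> a[0] # covers 0-7\n"
// into
//   "StU<8> a[0]\n"
// so the expected traces below can carry explanatory comments.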

template <size_t Align = 1> SrcAddr<Align> buf1(size_t offset = 0) const {
return buffer1.data() + offset;
}
template <size_t Align = 1> SrcAddr<Align> buf2(size_t offset = 0) const {
return buffer2.data() + offset;
}
template <size_t Align = 1> DstAddr<Align> dst(size_t offset = 0) const {
return buffer1.data() + offset;
}
template <size_t Align = 1> SrcAddr<Align> src(size_t offset = 0) const {
return buffer2.data() + offset;
}

private:
std::string trace_;
std::string expected_;
};

using _8 = SizedOp<TestBackend, 8>;

///////////////////////////////////////////////////////////////////////////////
//// Testing fixed-size forward operations
///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// Copy

TEST_F(LlvmLibcAlgorithm, copy_1) {
SizedOp<TestBackend, 1>::copy(dst(), src());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<1> b[0]
StU<1> a[0]
)"));
}

TEST_F(LlvmLibcAlgorithm, copy_15) {
SizedOp<TestBackend, 15>::copy(dst(), src());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
StU<8> a[0]
LdU<4> b[8]
StU<4> a[8]
LdU<2> b[12]
StU<2> a[12]
LdU<1> b[14]
StU<1> a[14]
)"));
}

TEST_F(LlvmLibcAlgorithm, copy_16) {
SizedOp<TestBackend, 16>::copy(dst(), src());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
StU<8> a[0]
LdU<8> b[8]
StU<8> a[8]
)"));
}

///////////////////////////////////////////////////////////////////////////////
// Move

TEST_F(LlvmLibcAlgorithm, move_1) {
SizedOp<TestBackend, 1>::move(dst(), src());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<1> b[0]
StU<1> a[0]
)"));
}

TEST_F(LlvmLibcAlgorithm, move_15) {
SizedOp<TestBackend, 15>::move(dst(), src());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
LdU<4> b[8]
LdU<2> b[12]
LdU<1> b[14]
StU<1> a[14]
StU<2> a[12]
StU<4> a[8]
StU<8> a[0]
)"));
}

TEST_F(LlvmLibcAlgorithm, move_16) {
SizedOp<TestBackend, 16>::move(dst(), src());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
LdU<8> b[8]
StU<8> a[8]
StU<8> a[0]
)"));
}

///////////////////////////////////////////////////////////////////////////////
// set

TEST_F(LlvmLibcAlgorithm, set_1) {
SizedOp<TestBackend, 1>::set(dst(), ubyte{42});
EXPECT_STREQ(getTrace(), stripComments(R"(
Splat<1> 42
StU<1> a[0]
)"));
}

TEST_F(LlvmLibcAlgorithm, set_15) {
SizedOp<TestBackend, 15>::set(dst(), ubyte{42});
EXPECT_STREQ(getTrace(), stripComments(R"(
Splat<8> 42
StU<8> a[0]
Splat<4> 42
StU<4> a[8]
Splat<2> 42
StU<2> a[12]
Splat<1> 42
StU<1> a[14]
)"));
}

TEST_F(LlvmLibcAlgorithm, set_16) {
SizedOp<TestBackend, 16>::set(dst(), ubyte{42});
EXPECT_STREQ(getTrace(), stripComments(R"(
Splat<8> 42
StU<8> a[0]
Splat<8> 42
StU<8> a[8]
)"));
}

///////////////////////////////////////////////////////////////////////////////
// different

TEST_F(LlvmLibcAlgorithm, different_1) {
fillEqual();
SizedOp<TestBackend, 1>::isDifferent(buf1(), buf2());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<1> a[0]
LdU<1> b[0]
Neq<1>
)"));
}

TEST_F(LlvmLibcAlgorithm, different_15) {
fillEqual();
SizedOp<TestBackend, 15>::isDifferent(buf1(), buf2());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> a[0]
LdU<8> b[0]
Neq<8>
LdU<4> a[8]
LdU<4> b[8]
Neq<4>
LdU<2> a[12]
LdU<2> b[12]
Neq<2>
LdU<1> a[14]
LdU<1> b[14]
Neq<1>
)"));
}

TEST_F(LlvmLibcAlgorithm, different_15_no_shortcircuit) {
fillDifferent();
SizedOp<TestBackend, 15>::isDifferent(buf1(), buf2());
// When the buffers differ, the fixed-size isDifferent still compares every
// block and aggregates the results (no short-circuit).
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> a[0]
LdU<8> b[0]
Neq<8>
LdU<4> a[8]
LdU<4> b[8]
Neq<4>
LdU<2> a[12]
LdU<2> b[12]
Neq<2>
LdU<1> a[14]
LdU<1> b[14]
Neq<1>
)"));
}

TEST_F(LlvmLibcAlgorithm, different_16) {
fillEqual();
SizedOp<TestBackend, 16>::isDifferent(buf1(), buf2());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> a[0]
LdU<8> b[0]
Neq<8>
LdU<8> a[8]
LdU<8> b[8]
Neq<8>
)"));
}

///////////////////////////////////////////////////////////////////////////////
// three_way_cmp

TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_1) {
fillEqual();
SizedOp<TestBackend, 1>::threeWayCmp(buf1(), buf2());
// Buffers compare equal; every block is compared and 0 is returned.
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<1> a[0]
LdU<1> b[0]
Diff<1>
)"));
}

TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_15) {
fillEqual();
SizedOp<TestBackend, 15>::threeWayCmp(buf1(), buf2());
// Buffers compare equal; every block is compared and 0 is returned.
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> a[0]
LdU<8> b[0]
Diff<8>
LdU<4> a[8]
LdU<4> b[8]
Diff<4>
LdU<2> a[12]
LdU<2> b[12]
Diff<2>
LdU<1> a[14]
LdU<1> b[14]
Diff<1>
)"));
}

TEST_F(LlvmLibcAlgorithm, three_way_cmp_neq_16_shortcircuit) {
fillDifferent();
SizedOp<TestBackend, 16>::threeWayCmp(buf1(), buf2());
// When the buffers differ, threeWayCmp stops at the first differing block.
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> a[0]
LdU<8> b[0]
Diff<8>
)"));
}

TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_16) {
fillEqual();
SizedOp<TestBackend, 16>::threeWayCmp(buf1(), buf2());
// Buffers compare equal; every block is compared and 0 is returned.
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> a[0]
LdU<8> b[0]
Diff<8>
LdU<8> a[8]
LdU<8> b[8]
Diff<8>
)"));
}

///////////////////////////////////////////////////////////////////////////////
//// Testing skip operations
///////////////////////////////////////////////////////////////////////////////

TEST_F(LlvmLibcAlgorithm, skip_and_set) {
Skip<11>::Then<SizedOp<TestBackend, 1>>::set(dst(), ubyte{42});
EXPECT_STREQ(getTrace(), stripComments(R"(
Splat<1> 42
StU<1> a[11]
)"));
}

TEST_F(LlvmLibcAlgorithm, skip_and_different_1) {
Skip<11>::Then<SizedOp<TestBackend, 1>>::isDifferent(buf1(), buf2());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<1> a[11]
LdU<1> b[11]
Neq<1>
)"));
}

TEST_F(LlvmLibcAlgorithm, skip_and_three_way_cmp_1) {
Skip<11>::Then<SizedOp<TestBackend, 1>>::threeWayCmp(buf1(), buf2());
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<1> a[11]
LdU<1> b[11]
Diff<1>
)"));
}

///////////////////////////////////////////////////////////////////////////////
//// Testing tail operations
///////////////////////////////////////////////////////////////////////////////

TEST_F(LlvmLibcAlgorithm, tail_copy_8) {
Tail<_8>::copy(dst(), src(), 16);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[8]
StU<8> a[8]
)"));
}

TEST_F(LlvmLibcAlgorithm, tail_move_8) {
Tail<_8>::move(dst(), src(), 16);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[8]
StU<8> a[8]
)"));
}

TEST_F(LlvmLibcAlgorithm, tail_set_8) {
Tail<_8>::set(dst(), ubyte{42}, 16);
EXPECT_STREQ(getTrace(), stripComments(R"(
Splat<8> 42
StU<8> a[8]
)"));
}

TEST_F(LlvmLibcAlgorithm, tail_different_8) {
fillEqual();
Tail<_8>::isDifferent(buf1(), buf2(), 16);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> a[8]
LdU<8> b[8]
Neq<8>
)"));
}

TEST_F(LlvmLibcAlgorithm, tail_three_way_cmp_8) {
fillEqual();
Tail<_8>::threeWayCmp(buf1(), buf2(), 16);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> a[8]
LdU<8> b[8]
Diff<8>
)"));
}

///////////////////////////////////////////////////////////////////////////////
//// Testing HeadTail operations
///////////////////////////////////////////////////////////////////////////////

TEST_F(LlvmLibcAlgorithm, head_tail_copy_8) {
HeadTail<_8>::copy(dst(), src(), 16);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
StU<8> a[0]
LdU<8> b[8]
StU<8> a[8]
)"));
}

///////////////////////////////////////////////////////////////////////////////
//// Testing Loop operations
///////////////////////////////////////////////////////////////////////////////

TEST_F(LlvmLibcAlgorithm, loop_copy_one_iteration_and_tail) {
Loop<_8>::copy(dst(), src(), 10);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
StU<8> a[0] # covers 0-7
LdU<8> b[2]
StU<8> a[2] # covers 2-9
)"));
}

TEST_F(LlvmLibcAlgorithm, loop_copy_two_iteration_and_tail) {
Loop<_8>::copy(dst(), src(), 17);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
StU<8> a[0] # covers 0-7
LdU<8> b[8]
StU<8> a[8] # covers 8-15
LdU<8> b[9]
StU<8> a[9] # covers 9-16
)"));
}

TEST_F(LlvmLibcAlgorithm, loop_with_one_turn_is_inefficient_but_ok) {
Loop<_8>::copy(dst(), src(), 8);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
StU<8> a[0] # first iteration covers 0-7
LdU<8> b[0] # tail also covers 0-7 but since Loop is supposed to be used
StU<8> a[0] # with a sufficient number of iterations the tail cost is amortised
)"));
}

TEST_F(LlvmLibcAlgorithm, loop_with_round_number_of_turn) {
Loop<_8>::copy(dst(), src(), 24);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
StU<8> a[0] # first iteration covers 0-7
LdU<8> b[8]
StU<8> a[8] # second iteration covers 8-15
LdU<8> b[16]
StU<8> a[16]
)"));
}

TEST_F(LlvmLibcAlgorithm, dst_aligned_loop) {
Loop<_8>::copy(dst<16>(), src(), 23);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[0]
StA<8> a[0] # store is aligned on 16B
LdU<8> b[8]
StA<8> a[8] # subsequent stores are aligned
LdU<8> b[15]
StU<8> a[15] # Tail is always unaligned
)"));
}

TEST_F(LlvmLibcAlgorithm, aligned_loop) {
Loop<_8>::copy(dst<16>(), src<8>(), 23);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdA<8> b[0] # load is aligned on 8B
StA<8> a[0] # store is aligned on 16B
LdA<8> b[8] # subsequent loads are aligned
StA<8> a[8] # subsequent stores are aligned
LdU<8> b[15] # Tail is always unaligned
StU<8> a[15] # Tail is always unaligned
)"));
}

///////////////////////////////////////////////////////////////////////////////
//// Testing Align operations
///////////////////////////////////////////////////////////////////////////////

TEST_F(LlvmLibcAlgorithm, align_dst_copy_8) {
Align<_8, Arg::Dst>::Then<Loop<_8>>::copy(dst(2), src(3), 31);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[3]
StU<8> a[2] # First store covers unaligned bytes
LdU<8> b[9]
StA<8> a[8] # First aligned store
LdU<8> b[17]
StA<8> a[16] # Subsequent stores are aligned
LdU<8> b[25]
StA<8> a[24] # Subsequent stores are aligned
LdU<8> b[26]
StU<8> a[25] # Last store covers remaining bytes
)"));
}

TEST_F(LlvmLibcAlgorithm, align_src_copy_8) {
Align<_8, Arg::Src>::Then<Loop<_8>>::copy(dst(2), src(3), 31);
EXPECT_STREQ(getTrace(), stripComments(R"(
LdU<8> b[3] # First load covers unaligned bytes
StU<8> a[2]
LdA<8> b[8] # First aligned load
StU<8> a[7]
LdA<8> b[16] # Subsequent loads are aligned
StU<8> a[15]
LdA<8> b[24] # Subsequent loads are aligned
StU<8> a[23]
LdU<8> b[26] # Last load covers remaining bytes
StU<8> a[25]
)"));
}

} // namespace __llvm_libc