Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions External/HIP/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ macro(create_local_hip_tests VariantSuffix)
list(APPEND HIP_LOCAL_TESTS with-fopenmp)
list(APPEND HIP_LOCAL_TESTS saxpy)
list(APPEND HIP_LOCAL_TESTS memmove)
list(APPEND HIP_LOCAL_TESTS memset)
list(APPEND HIP_LOCAL_TESTS split-kernel-args)
list(APPEND HIP_LOCAL_TESTS builtin-logb-scalbn)

Expand Down
1 change: 1 addition & 0 deletions External/HIP/memmove.hip
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <cassert>
#include <cstring>
#include <iostream>
#include <memory>
#include <vector>

#include "hip/hip_runtime.h"
Expand Down
359 changes: 359 additions & 0 deletions External/HIP/memset.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,359 @@
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <vector>

#include "hip/hip_runtime.h"

// Tests for the functional correctness of the lowering of memset in device
// code. Various memsets are performed on device side and the result of each is
// compared to the corresponding operation on the host. Global, shared, and
// stack memory is tested.

// Set to 1 for per-test progress output; failures are always reported.
#define VERBOSE 0

// Check the result of a HIP API call; print the error string and abort on
// failure. The argument is evaluated exactly once: the previous form expanded
// `r` twice, so CHKHIP(hipDeviceSynchronize()) would re-run the call on the
// error path. The do/while(0) wrapper makes the macro safe inside unbraced
// if/else statements.
#define CHKHIP(r)                                                              \
  do {                                                                         \
    hipError_t chkhip_err_ = (r);                                              \
    if (chkhip_err_ != hipSuccess) {                                           \
      std::cerr << hipGetErrorString(chkhip_err_) << std::endl;                \
      abort();                                                                 \
    }                                                                          \
  } while (0)

// Maximal number of bytes to set with a memset call, used to allocate
// buffers.
static constexpr size_t MaxBytesPerThread = 2048;

// LDS (shared memory) is small, so the shared-memory kernel only runs tests
// up to this per-thread size (enforced in run_tests).
static constexpr size_t MaxLDSBytesPerThread = 128;

// Number of threads started in parallel. (The "Move" in the name follows the
// sibling memmove test this file is modeled on.)
static constexpr size_t NumMoveThreads = 2 * 32;

// Size of blocks in the grid used for threads. If the number of threads is
// smaller than this, it is used instead.
static constexpr size_t BlockSize = 256;

// Bytes of global memory allocated: each thread owns a region twice its
// maximal memset size (see get_stride), so a memset starting as far in as
// dst_idx == SZ stays inside the thread's own region.
static constexpr size_t AllocSize = 2 * NumMoveThreads * MaxBytesPerThread;

// Bytes of shared memory the shared-memory kernel declares per block.
static constexpr size_t LDSAllocSize =
    2 * NumMoveThreads * MaxLDSBytesPerThread;

// Which address space a test's memset destination lives in.
enum AddressSpace {
  GLOBAL = 0,
  SHARED = 1,
  STACK = 2,
};

// Human-readable address-space names, indexed by AddressSpace.
static const char *as_names[] = {
    "global",
    "shared",
    "stack",
};

/// Bytes reserved per thread in a test buffer: twice the per-thread memset
/// size, so a memset starting at any tested offset (up to dst_idx == SZ)
/// stays within the thread's own region.
static constexpr size_t get_stride(size_t bytes_per_thread) {
  return bytes_per_thread * 2;
}

/// Initialize \p alloc_size bytes of \p buf_device to increasing numbers
/// (modulo 256). Every launched thread writes the whole buffer sequentially,
/// so the result is the same for any launch configuration (callers launch it
/// with a single thread).
__global__ void init_kernel(uint8_t *buf_device, size_t alloc_size) {
  size_t idx = 0;
  while (idx < alloc_size) {
    buf_device[idx] = static_cast<uint8_t>(idx);
    ++idx;
  }
}

/// Device-side memset into global memory.
///
/// The template parameters control which memset arguments the compiler sees
/// as compile-time constants, since that selects different lowering paths:
/// \tparam SZ           number of bytes each thread sets (callers pass the
///                      same value as dyn_sz)
/// \tparam SetVal       compile-time-constant fill value
/// \tparam const_size   if true, pass the constant SZ to memset; otherwise
///                      the runtime value dyn_sz
/// \tparam use_tid      if true, fill with the (runtime) thread id
/// \tparam const_setval if true, fill with the constant SetVal; otherwise
///                      with the runtime value dyn_setval
///
/// Each thread memsets SZ bytes starting at dst_idx within its own
/// get_stride(SZ)-byte region of buf_device.
template <size_t SZ, uint8_t SetVal, bool const_size, bool use_tid,
          bool const_setval>
__global__ void memset_kernel_global(uint8_t *buf_device, size_t dst_idx,
                                     uint8_t dyn_setval, size_t dyn_sz) {
  (void)dyn_sz; // Only read when !const_size; silence unused warnings.
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  // Surplus threads in the last block do nothing.
  if (tid >= NumMoveThreads)
    return;
  // Each thread owns a disjoint region of the buffer.
  uint8_t *thread_buf = buf_device + get_stride(SZ) * tid;

  if constexpr (const_size) {
    if constexpr (use_tid) {
      __builtin_memset(thread_buf + dst_idx, static_cast<uint8_t>(tid), SZ);
    } else if constexpr (const_setval) {
      __builtin_memset(thread_buf + dst_idx, SetVal, SZ);
    } else {
      __builtin_memset(thread_buf + dst_idx, dyn_setval, SZ);
    }
  } else {
    if constexpr (use_tid) {
      __builtin_memset(thread_buf + dst_idx, static_cast<uint8_t>(tid), dyn_sz);
    } else if constexpr (const_setval) {
      __builtin_memset(thread_buf + dst_idx, SetVal, dyn_sz);
    } else {
      __builtin_memset(thread_buf + dst_idx, dyn_setval, dyn_sz);
    }
  }
}

/// Device-side memset into shared memory (LDS).
///
/// Template parameters mirror memset_kernel_global: they select which memset
/// arguments (size, fill value) the compiler sees as compile-time constants,
/// because that affects the lowering under test.
///
/// Each thread copies its own get_stride(SZ)-byte region from global memory
/// into the per-block __shared__ buffer, memsets SZ bytes at dst_idx there,
/// then copies the region back. Threads touch disjoint regions only, so no
/// __syncthreads() barrier is required.
///
/// Callers must keep SZ <= MaxLDSBytesPerThread so every thread's region
/// fits in buf_shared (run_tests enforces this).
template <size_t SZ, uint8_t SetVal, bool const_size, bool use_tid,
          bool const_setval>
__global__ void memset_kernel_shared(uint8_t *buf_device, size_t dst_idx,
                                     uint8_t dyn_setval, size_t dyn_sz) {
  (void)dyn_sz; // Only read when !const_size.
  __shared__ uint8_t buf_shared[LDSAllocSize];
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= NumMoveThreads)
    return;
  constexpr size_t stride = get_stride(SZ);
  uint8_t *thread_buf = buf_device + stride * tid;
  uint8_t *thread_buf_shared = buf_shared + stride * tid;
  // Copy the original data to shared memory.
  __builtin_memcpy(thread_buf_shared, thread_buf, stride);

  // Perform the memset there.
  if constexpr (const_size) {
    if constexpr (use_tid) {
      __builtin_memset(thread_buf_shared + dst_idx, (uint8_t)tid, SZ);
    } else if constexpr (const_setval) {
      __builtin_memset(thread_buf_shared + dst_idx, SetVal, SZ);
    } else {
      __builtin_memset(thread_buf_shared + dst_idx, dyn_setval, SZ);
    }
  } else {
    if constexpr (use_tid) {
      __builtin_memset(thread_buf_shared + dst_idx, (uint8_t)tid, dyn_sz);
    } else if constexpr (const_setval) {
      __builtin_memset(thread_buf_shared + dst_idx, SetVal, dyn_sz);
    } else {
      __builtin_memset(thread_buf_shared + dst_idx, dyn_setval, dyn_sz);
    }
  }

  // Copy the modified data back to global memory.
  __builtin_memcpy(thread_buf, thread_buf_shared, stride);
}

/// Device-side memset into stack (private/scratch) memory.
///
/// Template parameters mirror memset_kernel_global: they select which memset
/// arguments (size, fill value) the compiler sees as compile-time constants,
/// because that affects the lowering under test.
///
/// Each thread copies its own get_stride(SZ)-byte region from global memory
/// into a local (stack) array, memsets SZ bytes at dst_idx there, then
/// copies the region back so the host can verify the result.
template <size_t SZ, uint8_t SetVal, bool const_size, bool use_tid,
          bool const_setval>
__global__ void memset_kernel_stack(uint8_t *buf_device, size_t dst_idx,
                                    uint8_t dyn_setval, size_t dyn_sz) {
  (void)dyn_sz; // Only read when !const_size.
  // stride is constexpr, so this is a fixed-size (not variable-length) array.
  constexpr size_t stride = get_stride(SZ);
  uint8_t buf_stack[stride];
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= NumMoveThreads)
    return;
  uint8_t *thread_buf = buf_device + stride * tid;
  // Copy the original data to the stack.
  __builtin_memcpy(buf_stack, thread_buf, stride);

  // Perform the memset there.
  if constexpr (const_size) {
    if constexpr (use_tid) {
      __builtin_memset(buf_stack + dst_idx, (uint8_t)tid, SZ);
    } else if constexpr (const_setval) {
      __builtin_memset(buf_stack + dst_idx, SetVal, SZ);
    } else {
      __builtin_memset(buf_stack + dst_idx, dyn_setval, SZ);
    }
  } else {
    if constexpr (use_tid) {
      __builtin_memset(buf_stack + dst_idx, (uint8_t)tid, dyn_sz);
    } else if constexpr (const_setval) {
      __builtin_memset(buf_stack + dst_idx, SetVal, dyn_sz);
    } else {
      __builtin_memset(buf_stack + dst_idx, dyn_setval, dyn_sz);
    }
  }

  // Copy the modified data back to global memory.
  __builtin_memcpy(thread_buf, buf_stack, stride);
}

/// Run one device-side memset configuration and compare against a host-side
/// reference computed with std::memset.
///
/// \param buf_reference host buffer that receives the expected result
/// \param buf_host      host buffer the device result is copied into
/// \param buf_device    device buffer of AllocSize bytes
/// \param dst_idx       offset within each thread's region at which to memset
/// \param const_size    select a kernel variant that passes a constant size
/// \param use_tid       select a variant that fills with the thread id
/// \param const_setval  select a variant that passes a constant fill value
/// \param AS            address space the device memset targets
/// \param differing_pos set to the first mismatching byte index on failure
/// \return true if the device result matches the reference
template <size_t SZ, uint8_t SetVal>
bool run_test(uint8_t *buf_reference, uint8_t *buf_host, uint8_t *buf_device,
              size_t dst_idx, bool const_size, bool use_tid, bool const_setval,
              AddressSpace AS, size_t &differing_pos) {
  // Initialize device buffer (a single thread fills it sequentially).
  hipLaunchKernelGGL(init_kernel, dim3(1), dim3(1), 0, 0, buf_device,
                     AllocSize);
  CHKHIP(hipDeviceSynchronize());

  // Set up the reference buffer with the same initial pattern.
  for (size_t i = 0; i < AllocSize; ++i)
    buf_reference[i] = (uint8_t)i;

  // Simulate multi-threaded device-side memset on the host. The dynamic
  // fill value passed to the kernel is also SetVal, so one reference covers
  // both the const_setval and dynamic variants.
  for (size_t tid = 0; tid < NumMoveThreads; ++tid) {
    uint8_t *thread_buf = buf_reference + get_stride(SZ) * tid;
    uint8_t v = use_tid ? tid : SetVal;
    std::memset(thread_buf + dst_idx, v, SZ);
  }

  // Do the device-side memset.
  int block_size = std::min(BlockSize, NumMoveThreads);
  int num_blocks = (NumMoveThreads + block_size - 1) / block_size; // ceil-div

  // Select the right kernel with the right template parameters. This is done
  // using compile-time constant template parameters so that we can control
  // which memset arguments the compiler sees as constant, as this affects code
  // generation.
  void (*chosen_kernel)(uint8_t *, size_t, uint8_t, size_t) = nullptr;

// Expands to the selection logic for one kernel family (global/shared/stack);
// only one of the five flag combinations below is reachable per call.
#define SELECT_KERNEL_FOR_ADDRSPACE(AS)                                        \
  if (const_size) {                                                            \
    if (use_tid)                                                               \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, true, true, false>;       \
    else if (const_setval)                                                     \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, true, false, true>;       \
    else                                                                       \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, true, false, false>;      \
  } else {                                                                     \
    if (use_tid)                                                               \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, false, true, false>;      \
    else if (const_setval)                                                     \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, false, false, true>;      \
    else                                                                       \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, false, false, false>;     \
  }

  switch (AS) {
  case AddressSpace::GLOBAL:
    SELECT_KERNEL_FOR_ADDRSPACE(global);
    break;
  case AddressSpace::SHARED:
    SELECT_KERNEL_FOR_ADDRSPACE(shared);
    break;
  case AddressSpace::STACK:
    SELECT_KERNEL_FOR_ADDRSPACE(stack);
    break;
  };
  // Note: SZ is passed as the dynamic size as well, so constant and dynamic
  // variants perform the same operation.
  hipLaunchKernelGGL(chosen_kernel, dim3(num_blocks), dim3(block_size), 0, 0,
                     buf_device, dst_idx, SetVal, SZ);
  CHKHIP(hipDeviceSynchronize());

  // Fetch the result into buf_host.
  CHKHIP(hipMemcpy(buf_host, buf_device, AllocSize, hipMemcpyDeviceToHost));

  // Compare to the reference, recording the first mismatch (if any).
  bool success = true;
  for (size_t i = 0; i < AllocSize; ++i) {
    if (buf_host[i] != buf_reference[i]) {
      differing_pos = i;
      success = false;
      break;
    }
  }

  return success;
}

/// Run the full matrix of memset tests for one per-thread size SZ and fill
/// value SetVal in the given address space: several destination offsets,
/// crossed with constant/dynamic size and constant/dynamic/thread-id fill
/// value.
///
/// \return the number of failed test configurations (0 on full success).
template <size_t SZ, uint8_t SetVal>
int run_tests(uint8_t *buf_reference, uint8_t *buf_host, uint8_t *buf_device,
              AddressSpace AS) {
  if (AS == AddressSpace::SHARED && SZ > MaxLDSBytesPerThread) {
    // LDS is too small for these tests.
    return 0;
  }
  assert(SZ <= MaxBytesPerThread &&
         "Increase MaxBytesPerThread for larger sizes");

  // Offsets chosen to hit region boundaries; dst_idx == SZ is valid because
  // each thread's region is get_stride(SZ) == 2 * SZ bytes wide. For small
  // SZ some entries repeat, which is harmless.
  std::vector<size_t> indexes_to_test = {0, 1, SZ - 1, SZ};
  if (SZ > 8) {
    // Straddle an 8-byte alignment boundary.
    indexes_to_test.emplace_back(7);
    indexes_to_test.emplace_back(8);
  }
  if (SZ > 16) {
    // Straddle a 16-byte alignment boundary.
    indexes_to_test.emplace_back(15);
    indexes_to_test.emplace_back(16);
  }

  int nerrs = 0;

  size_t differing_pos = 0;
  // Run one test per destination offset for a given flag combination,
  // printing a summary line for failures (and for successes when VERBOSE).
  auto test_indexes = [&](bool const_size, bool use_tid, bool const_setval) {
    for (const auto &dst_idx : indexes_to_test) {
      bool success = run_test<SZ, SetVal>(buf_reference, buf_host, buf_device,
                                          dst_idx, const_size, use_tid,
                                          const_setval, AS, differing_pos);
      nerrs += !success;
      if (VERBOSE || !success) {
        std::cout << "- memsetting [" << dst_idx << ", " << (dst_idx + SZ - 1)
                  << "] to ";
        if (use_tid) {
          std::cout << "the thread id";
        } else {
          std::cout << static_cast<int>(SetVal) << " ("
                    << (const_setval ? "const" : "dynamic") << ")";
        }
        if (!VERBOSE) {
          // In verbose mode this context is already in the header line below.
          std::cout << " with " << (const_size ? "static" : "dynamic")
                    << " size in " << as_names[AS] << " memory";
        }
        std::cout << ":";
        if (success) {
          std::cout << " successful\n";
        } else {
          std::cout << " failed\n -> first difference at index "
                    << differing_pos << '\n';
        }
      }
    }
  };

  if (VERBOSE)
    std::cout << "running tests for dynamic move length " << SZ << " in "
              << as_names[AS] << " memory\n";
  test_indexes(false, false, false);
  test_indexes(false, false, true);
  test_indexes(false, true, false);

  // Different paths in codegen are taken if the move length is statically
  // known.
  if (VERBOSE)
    std::cout << "running tests for static move length " << SZ << " in "
              << as_names[AS] << " memory\n";
  test_indexes(true, false, false);
  test_indexes(true, false, true);
  test_indexes(true, true, false);

  return nerrs;
}

/// Entry point: run the memset-lowering tests for each address space and a
/// selection of per-thread sizes, and report the total number of failures.
/// Returns 0 and prints "PASSED!" when everything succeeds.
int main(void) {
  uint8_t *buf_device;
  CHKHIP(hipMalloc(&buf_device, AllocSize));

  // Use the array form of unique_ptr so destruction calls delete[]; the
  // non-array form (unique_ptr<uint8_t>) would call delete on an array
  // allocation, which is undefined behavior.
  std::unique_ptr<uint8_t[]> buf_host(new uint8_t[AllocSize]);
  std::unique_ptr<uint8_t[]> buf_reference(new uint8_t[AllocSize]);

  int nerrs = 0;
  for (AddressSpace AS :
       {AddressSpace::GLOBAL, AddressSpace::SHARED, AddressSpace::STACK}) {
    // Small and mid-size lengths, including non-power-of-two sizes.
    nerrs += run_tests<64, 0xbb>(buf_reference.get(), buf_host.get(),
                                 buf_device, AS);
    nerrs += run_tests<66, 0xbb>(buf_reference.get(), buf_host.get(),
                                 buf_device, AS);
    nerrs += run_tests<73, 0xbb>(buf_reference.get(), buf_host.get(),
                                 buf_device, AS);
    nerrs +=
        run_tests<3, 0xbb>(buf_reference.get(), buf_host.get(), buf_device, AS);
    nerrs +=
        run_tests<1, 0xbb>(buf_reference.get(), buf_host.get(), buf_device, AS);

    // Lengths that are large enough for the IR lowering in the constant
    // case, with simple residual, no residual, and maximal residual:
    nerrs += run_tests<1025, 0xbb>(buf_reference.get(), buf_host.get(),
                                   buf_device, AS);
    nerrs += run_tests<1040, 0xbb>(buf_reference.get(), buf_host.get(),
                                   buf_device, AS);
    nerrs += run_tests<1039, 0xbb>(buf_reference.get(), buf_host.get(),
                                   buf_device, AS);
  }

  CHKHIP(hipFree(buf_device));

  if (nerrs != 0) {
    std::cout << nerrs << " errors\n";
    return 1;
  }
  // The reference output checked by the test harness expects exactly this.
  std::cout << "PASSED!\n";
  return 0;
}
2 changes: 2 additions & 0 deletions External/HIP/memset.reference_output
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
PASSED!
exit 0