diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index 5f7158588bc77..7f14004b43ba8 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -63,3 +63,23 @@ def : Function {
   ];
   let returns = [];
 }
+
+def : Function {
+  let name = "olMemFill";
+  let desc = "Fill memory with copies of the given pattern";
+  let details = [
+    "Filling with patterns larger than 4 bytes may be less performant",
+    "The destination pointer and queue must be associated with the same device",
+    "The fill size must be a multiple of the pattern size",
+  ];
+  let params = [
+    Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
+    Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
+    Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
+    Param<"const void*", "PatternPtr", "pointer to the pattern to fill the destination with", PARAM_IN>,
+    Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
+  ];
+  let returns = [
+    Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]>
+  ];
+}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 1c9dfc69d445a..707bda9bf6f11 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -656,6 +656,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
   return Error::success();
 }
 
+Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
+                     const void *PatternPtr, size_t FillSize) {
+  return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
+                                         Queue->AsyncInfo);
+}
+
 Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
                            size_t ProgDataSize, ol_program_handle_t *Program) {
   // Make a copy of the program binary in case it is released by the caller.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 83280fe0a49c9..964d5e5c263a7 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2583,6 +2583,33 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
 
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    hsa_status_t Status;
+
+    // We can use hsa_amd_memory_fill for this size, but it's not async, so
+    // the queue needs to be synchronized first.
+    if (PatternSize == 4) {
+      if (AsyncInfoWrapper.hasQueue())
+        if (auto Err = synchronize(AsyncInfoWrapper))
+          return Err;
+      Status = hsa_amd_memory_fill(TgtPtr,
+                                   *static_cast<const uint32_t *>(PatternPtr),
+                                   Size / PatternSize);
+
+      if (auto Err =
+              Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
+        return Err;
+    } else {
+      // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
+      // memory and copying to the device in one go.
+      return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+    }
+
+    return Plugin::success();
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     // TODO: Implement this function.
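Note (not part of the patch): for orientation, this is roughly how the new entry point is driven from client code. A minimal sketch, assuming Device and Queue handles already exist and assuming the convention that liboffload calls return an ol_result_t that is null on success (the condition the tests' ASSERT_SUCCESS macro checks):

    // Fill a 1 KiB managed buffer with a repeating 4-byte pattern.
    void *Alloc;
    if (ol_result_t Res = olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 1024, &Alloc))
      return Res;
    uint32_t Pattern = 0xDEADBEEF;
    // FillSize must be a multiple of PatternSize, or OL_ERRC_INVALID_SIZE.
    if (ol_result_t Res = olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, 1024))
      return Res;
    olSyncQueue(Queue); // The fill is queued; synchronize before reading back.
    olMemFree(Alloc);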
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index a448721755a6f..b2145979ae599 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -957,6 +957,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                 void *DstPtr, int64_t Size,
                                 AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Fill data on the device with a pattern from the host
+  Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                 int64_t Size, __tgt_async_info *AsyncInfo);
+  virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
+                             int64_t PatternSize, int64_t Size,
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Run the kernel associated with \p EntryPtr
   Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
                      KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c06c35e1e6a5b..fb672c9782a8b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1540,6 +1540,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
   return Err;
 }
 
+Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
+                                int64_t PatternSize, int64_t Size,
+                                __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Err =
+      dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                     ptrdiff_t *ArgOffsets,
                                     KernelArgsTy &KernelArgs,
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 361a781e8f9b6..e8da25bc1d155 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
 DLWRAP(cuMemcpyHtoD, 3)
 DLWRAP(cuMemcpyHtoDAsync, 4)
 
+DLWRAP(cuMemsetD8Async, 4)
+DLWRAP(cuMemsetD16Async, 4)
+DLWRAP(cuMemsetD32Async, 4)
+DLWRAP(cuMemsetD2D8Async, 6)
+DLWRAP(cuMemsetD2D16Async, 6)
+DLWRAP(cuMemsetD2D32Async, 6)
+
 DLWRAP(cuMemFree, 1)
 DLWRAP(cuMemFreeHost, 1)
 DLWRAP(cuMemFreeAsync, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c8e7e8b..93496d95327f3 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -321,6 +321,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
 CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
 
+CUresult cuMemsetD8Async(CUdeviceptr, unsigned char, size_t, CUstream);
+CUresult cuMemsetD16Async(CUdeviceptr, unsigned short, size_t, CUstream);
+CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned char, size_t, size_t,
+                           CUstream);
+CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned short, size_t,
+                            size_t, CUstream);
+CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                            CUstream);
+
 CUresult cuMemFree(CUdeviceptr);
 CUresult cuMemFreeHost(void *);
 CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
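Note (not part of the patch): the 2D memset variants mirror the CUDA driver API shape, cuMemsetD2D{8,16,32}Async(dstDevice, dstPitch, value, Width, Height, stream). They write Width elements per row for Height rows, with consecutive rows starting dstPitch bytes apart. The fill path in the CUDA plugin below always passes Width == 1, one element per repetition of the pattern. For example (DevPtr and Stream stand in for an existing device allocation and stream):

    // Write the 32-bit value 0xAB at byte offsets 0, 64, 128, ..., 448:
    // one element per row, 8 rows, 64-byte pitch.
    cuMemsetD2D32Async(DevPtr, /*dstPitch=*/64, 0xAB, /*Width=*/1, /*Height=*/8,
                       Stream);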
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index a99357a3adeaa..c905cfa9e65d6 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -844,6 +844,64 @@ struct CUDADeviceTy : public GenericDeviceTy {
                     void *DstPtr, int64_t Size,
                     AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
+    CUresult Res;
+    size_t N = Size / PatternSize;
+    if (PatternSize == 1) {
+      Res = cuMemsetD8Async((CUdeviceptr)TgtPtr,
+                            *(static_cast<const uint8_t *>(PatternPtr)), N,
+                            Stream);
+    } else if (PatternSize == 2) {
+      Res = cuMemsetD16Async((CUdeviceptr)TgtPtr,
+                             *(static_cast<const uint16_t *>(PatternPtr)), N,
+                             Stream);
+    } else if (PatternSize == 4) {
+      Res = cuMemsetD32Async((CUdeviceptr)TgtPtr,
+                             *(static_cast<const uint32_t *>(PatternPtr)), N,
+                             Stream);
+    } else {
+      // For larger patterns we can do a series of strided fills to copy the
+      // pattern efficiently.
+      int64_t MemsetSize = PatternSize % 4u == 0u   ? 4u
+                           : PatternSize % 2u == 0u ? 2u
+                                                    : 1u;
+
+      int64_t NumberOfSteps = PatternSize / MemsetSize;
+      int64_t Pitch = NumberOfSteps * MemsetSize;
+      int64_t Height = Size / PatternSize;
+
+      for (int64_t Step = 0; Step < NumberOfSteps; ++Step) {
+        if (MemsetSize == 4) {
+          Res = cuMemsetD2D32Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint32_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        } else if (MemsetSize == 2) {
+          Res = cuMemsetD2D16Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint16_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        } else {
+          Res = cuMemsetD2D8Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint8_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        }
+      }
+    }
+
+    return Plugin::check(Res, "error in cuMemset: %s");
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     if (auto Err = setContext())
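Note (not part of the patch): the Pitch/Height arithmetic in the large-pattern branch is easiest to check with a host-side model. The self-contained sketch below reproduces the decomposition in plain C++: the pattern is split into NumberOfSteps chunks of MemsetSize bytes, and chunk Step is written at column Step * MemsetSize of every row, where a row is one repetition of the pattern. Each inner loop body corresponds to one cuMemsetD2D*Async call, and the final assert verifies the result matches a naive repeated copy.

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    int main() {
      const int64_t PatternSize = 17; // Odd, so MemsetSize degrades to 1.
      const int64_t Size = 17 * 64;
      uint8_t Pattern[17];
      for (int64_t I = 0; I < PatternSize; ++I)
        Pattern[I] = static_cast<uint8_t>(I);

      const int64_t MemsetSize = PatternSize % 4 == 0   ? 4
                                 : PatternSize % 2 == 0 ? 2
                                                        : 1;
      const int64_t NumberOfSteps = PatternSize / MemsetSize;
      const int64_t Pitch = NumberOfSteps * MemsetSize; // == PatternSize.
      const int64_t Height = Size / PatternSize;        // Pattern repetitions.

      std::vector<uint8_t> Dst(Size);
      // One "2D memset" per step: write the step's chunk into every row.
      for (int64_t Step = 0; Step < NumberOfSteps; ++Step)
        for (int64_t Row = 0; Row < Height; ++Row)
          std::memcpy(&Dst[Row * Pitch + Step * MemsetSize],
                      Pattern + Step * MemsetSize, MemsetSize);

      // Equivalent to filling with repeated copies of the whole pattern.
      for (int64_t I = 0; I < Size; ++I)
        assert(Dst[I] == Pattern[I % PatternSize]);
      return 0;
    }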
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 25443fd1ac0b3..35fd45458f893 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -302,6 +302,21 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (PatternSize == 1) {
+      std::memset(TgtPtr, *static_cast<const uint8_t *>(PatternPtr), Size);
+    } else {
+      for (int64_t Step = 0; Step < Size; Step += PatternSize) {
+        auto *Dst = static_cast<char *>(TgtPtr) + Step;
+        std::memcpy(Dst, PatternPtr, PatternSize);
+      }
+    }
+
+    return Plugin::success();
+  }
+
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
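Note (not part of the patch): the host path issues one std::memcpy per repetition, i.e. Size / PatternSize calls. If that ever matters, a standard alternative is an exponential doubling fill: seed one copy of the pattern, then repeatedly copy the already-filled prefix onto the remainder, which takes O(log(Size / PatternSize)) calls. A sketch of that variant, where doublingFill is a hypothetical helper rather than anything in the plugin:

    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    // Precondition: Size is a nonzero multiple of PatternSize.
    void doublingFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                      int64_t Size) {
      std::memcpy(TgtPtr, PatternPtr, PatternSize); // Seed with one copy.
      auto *Dst = static_cast<char *>(TgtPtr);
      for (int64_t Filled = PatternSize; Filled < Size;) {
        // Copy the already-filled prefix forward, doubling the filled region
        // until only the tail remains.
        int64_t Chunk = std::min(Filled, Size - Filled);
        std::memcpy(Dst + Filled, Dst, Chunk);
        Filled += Chunk;
      }
    }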
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index b25db7022e9d7..58c9b89d1ed0d 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -24,6 +24,7 @@ add_offload_unittest("kernel"
 
 add_offload_unittest("memory"
   memory/olMemAlloc.cpp
+  memory/olMemFill.cpp
   memory/olMemFree.cpp
   memory/olMemcpy.cpp)
diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
new file mode 100644
index 0000000000000..1b0bafa202080
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
@@ -0,0 +1,134 @@
+//===------- Offload API tests - olMemFill --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olMemFillTest = OffloadQueueTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);
+
+TEST_P(olMemFillTest, Success8) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint8_t Pattern = 0x42;
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    uint8_t *AllocPtr = reinterpret_cast<uint8_t *>(Alloc);
+    ASSERT_EQ(AllocPtr[i], Pattern);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, Success16) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint16_t Pattern = 0x4242;
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    uint16_t *AllocPtr = reinterpret_cast<uint16_t *>(Alloc);
+    ASSERT_EQ(AllocPtr[i], Pattern);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, Success32) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint32_t Pattern = 0xDEADBEEF;
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    uint32_t *AllocPtr = reinterpret_cast<uint32_t *>(Alloc);
+    ASSERT_EQ(AllocPtr[i], Pattern);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLarge) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct PatternT {
+    uint64_t A;
+    uint64_t B;
+  } Pattern{UINT64_MAX, UINT64_MAX};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+    ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+  constexpr size_t Size = 17 * 64;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct __attribute__((packed)) PatternT {
+    uint64_t A;
+    uint64_t B;
+    uint8_t C;
+  } Pattern{UINT64_MAX, UINT64_MAX, 255};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+    ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].C, 255);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, InvalidPatternSize) {
+  constexpr size_t Size = 1025;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint16_t Pattern = 0x4242;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+  olMemFree(Alloc);
+}
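Note (not part of the patch): the Success8/16/32 tests differ only in element type; if further widths are added later they could share a helper along these lines. checkFill is hypothetical and assumes the same fixture macros as above plus <cstring> for std::memcmp:

    template <typename PatternTy>
    void checkFill(ol_device_handle_t Device, ol_queue_handle_t Queue,
                   PatternTy Pattern, size_t Size) {
      void *Alloc;
      ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
      ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
      olSyncQueue(Queue);

      // Compare bytewise so the helper also works for padded/packed structs.
      auto *Elems = reinterpret_cast<PatternTy *>(Alloc);
      for (size_t I = 0; I < Size / sizeof(Pattern); I++)
        ASSERT_EQ(std::memcmp(&Elems[I], &Pattern, sizeof(Pattern)), 0);

      olMemFree(Alloc);
    }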