diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 8c150b6bfc2d4..4345276016e59 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -126,6 +126,15 @@ struct KernelLaunchParamsTy { /// Ptrs to the Data entries. Only strictly required for the host plugin. void **Ptrs = nullptr; }; + +/// Rectangular range for rect memcopies. Should be the same layout as +/// liboffload's `ol_memcpy_rect_t`. +struct MemcpyRectTy { + void *Base; + uint32_t Offset[3]; + size_t Pitch; + size_t Slice; +}; } #endif // OMPTARGET_SHARED_API_TYPES_H diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td index 79e8038330048..5569c044ddc1e 100644 --- a/offload/liboffload/API/Memory.td +++ b/offload/liboffload/API/Memory.td @@ -22,7 +22,8 @@ def ol_alloc_type_t : Enum { def olMemAlloc : Function { let desc = "Creates a memory allocation on the specified device."; let details = [ - "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory." + "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory.", + "The returned memory allocation will be aligned at least to a 4 byte boundary.", ]; let params = [ Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>, @@ -113,6 +114,54 @@ def olMemcpy : Function { let returns = []; } +def ol_memcpy_rect_t : Struct { + let desc = "A 3D view into a buffer for `olMemcpyRect`"; + let members = [ + StructMember<"void*", "buffer", "the buffer backing this range">, + StructMember<"ol_dimensions_t", "offset", "byte coordinate offset into the space">, + StructMember<"size_t", "pitch", "the pitch of the buffer in bytes (i.e. 
how large each `x` row is)">, + StructMember<"size_t", "slice", "the slice of the buffer in bytes (i.e. how large each `pitch * y` plane is)">, + ]; +} + +def olMemcpyRect : Function { + let desc = "Enqueue a 2D or 3D memcpy operation."; + let details = [ + "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.", + "If a queue is specified, at least one device must be a non-host device", + "For both the source and destination, the base pointer, pitch and slice must all be aligned to 4 bytes", + "For 2D copies (where `Size.z` is 1), the slice value is ignored", + "Either the source or destination (or both) must have a non-host device", + "If a queue is not specified, the memcpy happens synchronously", + ]; + let params = [ + Param<"ol_queue_handle_t", "Queue", "handle of the queue.", PARAM_IN_OPTIONAL>, + Param<"ol_memcpy_rect_t", "DstRect", "pointer to copy to", PARAM_IN>, + Param<"ol_device_handle_t", "DstDevice", "device that DstPtr belongs to", PARAM_IN>, + Param<"ol_memcpy_rect_t", "SrcRect", "pointer to copy from", PARAM_IN>, + Param<"ol_device_handle_t", "SrcDevice", "device that SrcPtr belongs to", PARAM_IN>, + Param<"ol_dimensions_t", "Size", "size in bytes of data to copy", PARAM_IN>, + ]; + let returns = [ + Return<"OL_ERRC_INVALID_SIZE", [ + "`DstRect.pitch % 4 > 0`", + "`DstRect.slice % 4 > 0`", + "`(uintptr_t)DstRect.buffer % 4 > 0`", + "`SrcRect.pitch % 4 > 0`", + "`SrcRect.slice % 4 > 0`", + "`(uintptr_t)SrcRect.buffer % 4 > 0`", + "`Size.x == 0 || Size.y == 0 || Size.z == 0`", + ]>, + Return<"OL_ERRC_INVALID_NULL_POINTER", [ + "`DstRect.buffer == NULL`", + "`SrcRect.buffer == NULL`", + ]>, + Return<"OL_ERRC_INVALID_ARGUMENT", [ + "Both arguments are the host device", + ]> + ]; +} + def olMemFill : Function { let desc = "Fill memory with copies of the given pattern"; let details = [ diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 6d22faeb0e57e..dd73344427d40 
100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -1001,6 +1001,36 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr, return Error::success(); } +Error olMemcpyRect_impl(ol_queue_handle_t Queue, ol_memcpy_rect_t DstRect, + ol_device_handle_t DstDevice, ol_memcpy_rect_t SrcRect, + ol_device_handle_t SrcDevice, ol_dimensions_t Size) { + auto Host = OffloadContext::get().HostDevice; + if (DstDevice == Host && SrcDevice == Host) { + return createOffloadError( + ErrorCode::INVALID_ARGUMENT, + "one of DstDevice and SrcDevice must be a non-host device"); + } + + // If no queue is given the memcpy will be synchronous + auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr; + + static_assert(sizeof(ol_memcpy_rect_t) == sizeof(MemcpyRectTy)); + auto AsPIDst = bit_cast<MemcpyRectTy>(DstRect); + auto AsPISrc = bit_cast<MemcpyRectTy>(SrcRect); + uint32_t AsPISize[3] = {Size.x, Size.y, Size.z}; + + if (DstDevice == Host) + return SrcDevice->Device->dataRetrieveRect(AsPIDst, AsPISrc, AsPISize, + QueueImpl); + + if (SrcDevice == Host) + return DstDevice->Device->dataSubmitRect(AsPIDst, AsPISrc, AsPISize, + QueueImpl); + + return DstDevice->Device->dataExchangeRect(AsPISrc, *DstDevice->Device, + AsPIDst, AsPISize, QueueImpl); +} + Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize, const void *PatternPtr, size_t FillSize) { return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize, diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp index bc92f4a46a5c0..471aee954b7ab 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp @@ -59,6 +59,7 @@ DLWRAP(hsa_amd_agent_iterate_memory_pools, 3) DLWRAP(hsa_amd_memory_pool_allocate, 4) DLWRAP(hsa_amd_memory_pool_free, 1) DLWRAP(hsa_amd_memory_async_copy, 8) +DLWRAP(hsa_amd_memory_async_copy_rect, 10) DLWRAP(hsa_amd_memory_pool_get_info, 3) 
DLWRAP(hsa_amd_agents_allow_access, 4) DLWRAP(hsa_amd_memory_lock, 5) diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 29cfe78082dbb..71bc8512f2f41 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -96,6 +96,26 @@ hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent, const hsa_signal_t *dep_signals, hsa_signal_t completion_signal); +enum hsa_amd_copy_direction_t { + hsaHostToHost = 0, + hsaHostToDevice = 1, + hsaDeviceToHost = 2, + hsaDeviceToDevice = 3, +}; + +typedef struct hsa_pitched_ptr_s { + void *base; + size_t pitch; + size_t slice; +} hsa_pitched_ptr_t; + +hsa_status_t hsa_amd_memory_async_copy_rect( + const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset, + const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset, + const hsa_dim3_t *range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, + const hsa_signal_t *dep_signals, hsa_signal_t completion_signal); + hsa_status_t hsa_amd_agent_memory_pool_get_info( hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, hsa_amd_agent_memory_pool_info_t attribute, void *value); diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index a7723b8598815..20d44937dd9dd 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -197,6 +197,23 @@ static Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, #endif } +/// Dispatches an asynchronous 3D/2D memory copy. 
+static Error asyncMemCopyRect(MemcpyRectTy Dst, MemcpyRectTy Src, + hsa_agent_t Agent, hsa_amd_copy_direction_t Dir, + uint32_t Size[3], uint32_t NumDepSignals, + const hsa_signal_t *DepSignals, + hsa_signal_t CompletionSignal) { + hsa_pitched_ptr_t SrcPitched{Src.Base, Src.Pitch, Src.Slice}; + hsa_pitched_ptr_t DstPitched{Dst.Base, Dst.Pitch, Dst.Slice}; + + hsa_status_t S = hsa_amd_memory_async_copy_rect( + &DstPitched, reinterpret_cast<const hsa_dim3_t *>(Dst.Offset), &SrcPitched, + reinterpret_cast<const hsa_dim3_t *>(Src.Offset), + reinterpret_cast<const hsa_dim3_t *>(Size), Agent, Dir, NumDepSignals, + DepSignals, CompletionSignal); + return Plugin::check(S, "error in hsa_amd_memory_async_copy_rect: %s"); +} + static Error getTargetTripleAndFeatures(hsa_agent_t Agent, SmallVector<SmallString<32>> &Targets) { auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { @@ -1366,6 +1383,33 @@ struct AMDGPUStreamTy { OutputSignal->get()); } + /// Push an asynchronous 2D or 3D memory copy between pinned memory buffers. + Error pushPinnedMemoryCopyRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src, + uint32_t CopySize[3], + hsa_amd_copy_direction_t Dir) { + // Retrieve an available signal for the operation's output. + AMDGPUSignalTy *OutputSignal = nullptr; + if (auto Err = SignalManager.getResource(OutputSignal)) + return Err; + OutputSignal->reset(); + OutputSignal->increaseUseCount(); + + std::lock_guard<std::mutex> Lock(Mutex); + + // Consume stream slot and compute dependencies. + auto [Curr, InputSignal] = consume(OutputSignal); + + // Issue the async memory copy. + if (InputSignal && InputSignal->load()) { + hsa_signal_t InputSignalRaw = InputSignal->get(); + return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 1, + &InputSignalRaw, OutputSignal->get()); + } + + return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 0, + nullptr, OutputSignal->get()); + } + /// Push an asynchronous memory copy device-to-host involving an unpinned /// memory buffer. 
The operation consists of a two-step copy from the /// device buffer to an intermediate pinned host buffer, and then, to a @@ -1540,6 +1584,37 @@ struct AMDGPUStreamTy { OutputSignal->get()); } + Error pushMemoryCopyD2DRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src, + hsa_agent_t Agent, uint32_t CopySize[3]) { + AMDGPUSignalTy *OutputSignal; + if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal)) + return Err; + OutputSignal->reset(); + OutputSignal->increaseUseCount(); + + std::lock_guard Lock(Mutex); + + // Consume stream slot and compute dependencies. + auto [Curr, InputSignal] = consume(OutputSignal); + + // The agents need to have access to the corresponding memory + // This is presently only true if the pointers were originally + // allocated by this runtime or the caller made the appropriate + // access calls. + + // TODO: Cross device transfers might not work + + if (InputSignal && InputSignal->load()) { + hsa_signal_t InputSignalRaw = InputSignal->get(); + return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice, + CopySize, 1, &InputSignalRaw, + OutputSignal->get()); + } + return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice, + CopySize, 0, nullptr, + OutputSignal->get()); + } + Error pushHostCallback(void (*Callback)(void *), void *UserData) { // Retrieve an available signal for the operation's output. 
AMDGPUSignalTy *OutputSignal = nullptr; @@ -2462,6 +2537,38 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return true; } +private: + template <typename FTy> + Error syncTransfer(AsyncInfoWrapperTy &AsyncInfoWrapper, void *HostPtr, + size_t Size, FTy Op) { + if (AsyncInfoWrapper.hasQueue()) + if (auto Err = synchronize(AsyncInfoWrapper)) + return Err; + + void *PinnedPtr = nullptr; + hsa_status_t Status; + Status = hsa_amd_memory_lock(HostPtr, Size, nullptr, 0, &PinnedPtr); + if (auto Err = Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n")) + return Err; + + AMDGPUSignalTy Signal; + if (auto Err = Signal.init()) + return Err; + + if (auto Err = Op(PinnedPtr, Signal)) + return Err; + + if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) + return Err; + + if (auto Err = Signal.deinit()) + return Err; + + Status = hsa_amd_memory_unlock(HostPtr); + return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n"); + } + +public: /// Submit data to the device (host to device transfer). Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { @@ -2478,34 +2585,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // For large transfers use synchronous behavior. 
if (Size >= OMPX_MaxAsyncCopyBytes) { - if (AsyncInfoWrapper.hasQueue()) - if (auto Err = synchronize(AsyncInfoWrapper)) - return Err; - - hsa_status_t Status; - Status = hsa_amd_memory_lock(const_cast(HstPtr), Size, nullptr, 0, - &PinnedPtr); - if (auto Err = - Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n")) - return Err; - - AMDGPUSignalTy Signal; - if (auto Err = Signal.init()) - return Err; - - if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, - Agent, PinnedPtr, Agent, Size, 0, - nullptr, Signal.get())) - return Err; - - if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) - return Err; - - if (auto Err = Signal.deinit()) - return Err; - - Status = hsa_amd_memory_unlock(const_cast(HstPtr)); - return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n"); + return syncTransfer(AsyncInfoWrapper, const_cast(HstPtr), Size, + [&](void *PinnedPtr, AMDGPUSignalTy &Signal) { + return hsa_utils::asyncMemCopy( + useMultipleSdmaEngines(), TgtPtr, Agent, + PinnedPtr, Agent, Size, 0, nullptr, + Signal.get()); + }); } // Otherwise, use two-step copy with an intermediate pinned host buffer. @@ -2521,6 +2607,34 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { PinnedMemoryManager); } + /// 2D/3D host to device transfer. + Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect, + uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + AMDGPUStreamTy *Stream = nullptr; + + // Use one-step asynchronous operation when host memory is already pinned. 
+ if (void *PinnedPtr = + PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) { + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + HstRect.Base = PinnedPtr; + return Stream->pushPinnedMemoryCopyRectAsync(TgtRect, HstRect, Size, + hsaHostToDevice); + } + + auto BufferSize = HstRect.Slice * (HstRect.Offset[2] + Size[2]); + return syncTransfer(AsyncInfoWrapper, const_cast(HstRect.Base), + BufferSize, + [&](void *PinnedPtr, AMDGPUSignalTy &Signal) { + HstRect.Base = PinnedPtr; + return hsa_utils::asyncMemCopyRect( + TgtRect, HstRect, Agent, hsaHostToDevice, Size, 0, + nullptr, Signal.get()); + }); + } + /// Retrieve data from the device (device to host transfer). Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { @@ -2538,34 +2652,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // For large transfers use synchronous behavior. if (Size >= OMPX_MaxAsyncCopyBytes) { - if (AsyncInfoWrapper.hasQueue()) - if (auto Err = synchronize(AsyncInfoWrapper)) - return Err; - - hsa_status_t Status; - Status = hsa_amd_memory_lock(const_cast(HstPtr), Size, nullptr, 0, - &PinnedPtr); - if (auto Err = - Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n")) - return Err; - - AMDGPUSignalTy Signal; - if (auto Err = Signal.init()) - return Err; - - if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), - PinnedPtr, Agent, TgtPtr, Agent, - Size, 0, nullptr, Signal.get())) - return Err; - - if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) - return Err; - - if (auto Err = Signal.deinit()) - return Err; - - Status = hsa_amd_memory_unlock(const_cast(HstPtr)); - return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n"); + return syncTransfer(AsyncInfoWrapper, const_cast(HstPtr), Size, + [&](void *PinnedPtr, AMDGPUSignalTy &Signal) { + return hsa_utils::asyncMemCopy( + useMultipleSdmaEngines(), PinnedPtr, Agent, + TgtPtr, Agent, 
Size, 0, nullptr, Signal.get()); + }); } // Otherwise, use two-step copy with an intermediate pinned host buffer. @@ -2581,6 +2673,33 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { PinnedMemoryManager); } + /// 2D/3D device to host transfer. + Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect, + uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + AMDGPUStreamTy *Stream = nullptr; + + if (void *PinnedPtr = + PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) { + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + HstRect.Base = PinnedPtr; + return Stream->pushPinnedMemoryCopyRectAsync(HstRect, TgtRect, Size, + hsaDeviceToHost); + } + + auto BufferSize = HstRect.Slice * (HstRect.Offset[2] + Size[2]); + return syncTransfer(AsyncInfoWrapper, const_cast<void *>(HstRect.Base), + BufferSize, + [&](void *PinnedPtr, AMDGPUSignalTy &Signal) { + HstRect.Base = PinnedPtr; + return hsa_utils::asyncMemCopyRect( + HstRect, TgtRect, Agent, hsaDeviceToHost, Size, 0, + nullptr, Signal.get()); + }); + } + /// Exchange data between two devices within the plugin. Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, void *DstPtr, int64_t Size, @@ -2618,6 +2737,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { getAgent(), (uint64_t)Size); } + /// 2D/3D device to device transfer. + Error dataExchangeRectImpl(MemcpyRectTy SrcRect, + GenericDeviceTy &DstGenericDevice, + MemcpyRectTy DstRect, uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice); + AMDGPUStreamTy *Stream = nullptr; + + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + return Stream->pushMemoryCopyD2DRectAsync(DstRect, SrcRect, + DstDevice.getAgent(), Size); + } + /// Insert a data fence between previous data operations and the following /// operations. 
This is a no-op for AMDGPU devices as operations inserted into /// a queue are in-order. diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 8c530bba3882c..d42aaf09875af 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -921,12 +921,25 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + Error dataSubmitRect(MemcpyRectTy TgtRect, const MemcpyRectTy HstRect, + uint32_t Size[3], __tgt_async_info *AsyncInfo); + virtual Error dataSubmitRectImpl(const MemcpyRectTy TgtRect, + MemcpyRectTy HstRect, uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Retrieve data from the device (device to host transfer). Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size, __tgt_async_info *AsyncInfo); virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + Error dataRetrieveRect(MemcpyRectTy HstRect, const MemcpyRectTy TgtRect, + uint32_t Size[3], __tgt_async_info *AsyncInfo); + virtual Error dataRetrieveRectImpl(MemcpyRectTy HstRect, + const MemcpyRectTy TgtRect, + uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Instert a data fence between previous data operations and the following /// operations if necessary for the device virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0; @@ -940,6 +953,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy { void *DstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + Error dataExchangeRect(MemcpyRectTy SrcRect, GenericDeviceTy &DstDev, + const MemcpyRectTy DstRect, uint32_t Size[3], + __tgt_async_info *AsyncInfo); + virtual Error dataExchangeRectImpl(MemcpyRectTy SrcRect, + GenericDeviceTy &DstDev, + const MemcpyRectTy 
DstRect, + uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Fill data on the device with a pattern from the host Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, int64_t Size, __tgt_async_info *AsyncInfo); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index db43cbe49cc2b..6eda231e99ef7 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -1437,6 +1437,17 @@ Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr, return Err; } +Error GenericDeviceTy::dataSubmitRect(MemcpyRectTy TgtRect, + const MemcpyRectTy HstRect, + uint32_t Size[3], + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + + auto Err = dataSubmitRectImpl(TgtRect, HstRect, Size, AsyncInfoWrapper); + AsyncInfoWrapper.finalize(Err); + return Err; +} + Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size, __tgt_async_info *AsyncInfo) { AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); @@ -1446,6 +1457,17 @@ Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr, return Err; } +Error GenericDeviceTy::dataRetrieveRect(MemcpyRectTy HstRect, + const MemcpyRectTy TgtRect, + uint32_t Size[3], + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + + auto Err = dataRetrieveRectImpl(HstRect, TgtRect, Size, AsyncInfoWrapper); + AsyncInfoWrapper.finalize(Err); + return Err; +} + Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr, int64_t Size, __tgt_async_info *AsyncInfo) { @@ -1456,6 +1478,19 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, return Err; } +Error GenericDeviceTy::dataExchangeRect(MemcpyRectTy SrcRect, + GenericDeviceTy &DstDev, + const MemcpyRectTy DstRect, + uint32_t Size[3], + 
__tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + + auto Err = + dataExchangeRectImpl(SrcRect, DstDev, DstRect, Size, AsyncInfoWrapper); + AsyncInfoWrapper.finalize(Err); + return Err; +} + Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, int64_t Size, __tgt_async_info *AsyncInfo) { diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index db94f7f2dd995..3ca4c453e424a 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -803,6 +803,13 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::check(Res, "error in cuMemcpyHtoDAsync: %s"); } + Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect, + uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "CUDA does not yet support 2D/3D copies"); + } + /// Retrieve data from the device (device to host transfer). Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { @@ -817,12 +824,26 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::check(Res, "error in cuMemcpyDtoHAsync: %s"); } + Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect, + uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "CUDA does not yet support 2D/3D copies"); + } + /// Exchange data between two devices directly. We may use peer access if /// the CUDA devices and driver allow them. 
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, void *DstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error dataExchangeRectImpl(MemcpyRectTy SrcRect, GenericDeviceTy &Dst, + MemcpyRectTy DstRect, uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "CUDA does not yet support 2D/3D copies"); + } + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index eb4ecac9907a1..101c58fcd093c 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -285,6 +285,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy { return Plugin::success(); } + Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect, + uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "Host does not yet support 2D/3D copies"); + } + /// Retrieve data from the device (device to host transfer). Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { @@ -292,6 +299,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy { return Plugin::success(); } + Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect, + uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "Host does not yet support 2D/3D copies"); + } + /// Exchange data between two devices within the plugin. This function is not /// supported in this plugin. 
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, @@ -303,6 +317,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy { "dataExchangeImpl not supported"); } + Error dataExchangeRectImpl(MemcpyRectTy SrcRect, GenericDeviceTy &Dst, + MemcpyRectTy DstRect, uint32_t Size[3], + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "Host does not yet support 2D/3D copies"); + } + /// Insert a data fence between previous data operations and the following /// operations. This is a no-op for Host devices as operations inserted into /// a queue are in-order. diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt index 50c99a5d5b639..2ebe2cb79dcd0 100644 --- a/offload/unittests/OffloadAPI/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/CMakeLists.txt @@ -28,6 +28,7 @@ add_offload_unittest("memory" memory/olMemFill.cpp memory/olMemFree.cpp memory/olMemcpy.cpp + memory/olMemcpyRect.cpp memory/olGetMemInfo.cpp memory/olGetMemInfoSize.cpp) diff --git a/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp b/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp new file mode 100644 index 0000000000000..24dfee20ca48a --- /dev/null +++ b/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp @@ -0,0 +1,446 @@ +//===------- Offload API tests - olMemcpyRect ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include +#include + +constexpr ol_dimensions_t FULL_SIZE = {16, 8, 4}; +constexpr size_t BYTES = FULL_SIZE.x * FULL_SIZE.y * FULL_SIZE.z; + +constexpr ol_dimensions_t COPY_SIZE = {4, 3, 2}; +constexpr ol_dimensions_t COPY_OFFSET = {8, 2, 1}; + +struct olMemcpyRectTest : OffloadQueueTest { + void SetUp() override { + RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp()); + + ol_platform_handle_t Platform; + ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, + sizeof(Platform), &Platform)); + ol_platform_backend_t Backend; + ASSERT_SUCCESS(olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, + sizeof(Backend), &Backend)); + if (Backend == OL_PLATFORM_BACKEND_CUDA) + GTEST_SKIP() << "CUDA does not yet support this entry point\n"; + + Buff.fill('h'); + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, BYTES, &HostPtr)); + ASSERT_SUCCESS( + olMemcpy(nullptr, HostPtr, Device, Buff.data(), Host, BYTES)); + + Buff.fill('d'); + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, BYTES, &DevicePtr)); + ASSERT_SUCCESS( + olMemcpy(nullptr, DevicePtr, Device, Buff.data(), Host, BYTES)); + + Buff.fill('D'); + ASSERT_SUCCESS( + olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, BYTES, &DevicePtr2)); + ASSERT_SUCCESS( + olMemcpy(nullptr, DevicePtr2, Device, Buff.data(), Host, BYTES)); + + Buff.fill('u'); + + SrcRect.offset = DstRect.offset = COPY_OFFSET; + SrcRect.pitch = DstRect.pitch = FULL_SIZE.x; + SrcRect.slice = DstRect.slice = FULL_SIZE.y * FULL_SIZE.x; + } + + void TearDown() override { + if (HostPtr) + ASSERT_SUCCESS(olMemFree(HostPtr)); + if (DevicePtr) + ASSERT_SUCCESS(olMemFree(DevicePtr)); + if (DevicePtr2) + ASSERT_SUCCESS(olMemFree(DevicePtr2)); + } + + void checkPattern(void *CheckBuffer, const char *Template) { + ASSERT_SUCCESS( + olMemcpy(nullptr, Buff.data(), Host, 
CheckBuffer, Device, BYTES)); + bool Failed = false; + + for (size_t I = 0; I < BYTES; I++) { + if (Buff[I] != Template[I]) { + ADD_FAILURE() << "Failure at location " << I << "\n"; + Failed = true; + break; + } + } + + if (Failed) { + std::cerr << "Expected:\n"; + printSlices([&](size_t I) -> char { return Template[I]; }); + std::cerr << "Got:\n"; + printSlices([&](size_t I) -> char { return Buff[I]; }); + std::cerr << "Delta:\n"; + printSlices( + [&](size_t I) -> char { return Buff[I] == Template[I] ? '.' : 'X'; }); + } + } + + template void printSlices(F Getter) { + for (size_t Y = 0; Y < FULL_SIZE.y; Y++) { + for (size_t Z = 0; Z < FULL_SIZE.z; Z++) { + for (size_t X = 0; X < FULL_SIZE.x; X++) { + std::cerr << Getter(X + (Y * FULL_SIZE.x) + + (Z * FULL_SIZE.y * FULL_SIZE.x)); + } + std::cerr << " "; + } + + std::cerr << "\n"; + } + } + + std::array Buff; + void *HostPtr; + void *DevicePtr; + void *DevicePtr2; + ol_memcpy_rect_t SrcRect; + ol_memcpy_rect_t DstRect; +}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemcpyRectTest); + +TEST_P(olMemcpyRectTest, SuccessHtoD) { + DstRect.buffer = DevicePtr; + SrcRect.buffer = HostPtr; + + ASSERT_SUCCESS( + olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE)); + ASSERT_SUCCESS(olSyncQueue(Queue)); + + // clang-format off + checkPattern(DevicePtr, + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "ddddddddhhhhdddd" + "ddddddddhhhhdddd" + "ddddddddhhhhdddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "ddddddddhhhhdddd" + "ddddddddhhhhdddd" + "ddddddddhhhhdddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + 
"dddddddddddddddd" + ); + // clang-format on +} + +TEST_P(olMemcpyRectTest, SuccessUtoD) { + DstRect.buffer = DevicePtr; + SrcRect.buffer = Buff.data(); + + ASSERT_SUCCESS( + olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE)); + ASSERT_SUCCESS(olSyncQueue(Queue)); + + // clang-format off + checkPattern(DevicePtr, + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddduuuudddd" + "dddddddduuuudddd" + "dddddddduuuudddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddduuuudddd" + "dddddddduuuudddd" + "dddddddduuuudddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + ); + // clang-format on +} + +TEST_P(olMemcpyRectTest, SuccessDtoH) { + DstRect.buffer = HostPtr; + SrcRect.buffer = DevicePtr; + + ASSERT_SUCCESS( + olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE)); + ASSERT_SUCCESS(olSyncQueue(Queue)); + + // clang-format off + checkPattern(HostPtr, + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhddddhhhh" + "hhhhhhhhddddhhhh" + "hhhhhhhhddddhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhddddhhhh" + "hhhhhhhhddddhhhh" + "hhhhhhhhddddhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + "hhhhhhhhhhhhhhhh" + ); + // 
clang-format on +} + +TEST_P(olMemcpyRectTest, SuccessDtoU) { + DstRect.buffer = Buff.data(); + SrcRect.buffer = DevicePtr; + + ASSERT_SUCCESS( + olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE)); + ASSERT_SUCCESS(olSyncQueue(Queue)); + + // clang-format off + checkPattern(Buff.data(), + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuudddduuuu" + "uuuuuuuudddduuuu" + "uuuuuuuudddduuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuudddduuuu" + "uuuuuuuudddduuuu" + "uuuuuuuudddduuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + "uuuuuuuuuuuuuuuu" + ); + // clang-format on +} + +TEST_P(olMemcpyRectTest, SuccessDtoD) { + DstRect.buffer = DevicePtr; + SrcRect.buffer = DevicePtr2; + + ASSERT_SUCCESS( + olMemcpyRect(Queue, DstRect, Device, SrcRect, Device, COPY_SIZE)); + ASSERT_SUCCESS(olSyncQueue(Queue)); + + // clang-format off + checkPattern(DevicePtr, + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "ddddddddDDDDdddd" + "ddddddddDDDDdddd" + "ddddddddDDDDdddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "ddddddddDDDDdddd" + "ddddddddDDDDdddd" + "ddddddddDDDDdddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + "dddddddddddddddd" + ); + // clang-format on +} + 
+// Null destination buffer is rejected.
+TEST_P(olMemcpyRectTest, InvalidDstPtr) {
+  DstRect.buffer = nullptr;
+  SrcRect.buffer = HostPtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+}
+
+// Null source buffer is rejected.
+TEST_P(olMemcpyRectTest, InvalidSrcPtr) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = nullptr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+}
+
+// Null destination device handle is rejected.
+TEST_P(olMemcpyRectTest, InvalidDstDevice) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olMemcpyRect(Queue, DstRect, nullptr, SrcRect, Host, COPY_SIZE));
+}
+
+// Null source device handle is rejected.
+TEST_P(olMemcpyRectTest, InvalidSrcDevice) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, nullptr, COPY_SIZE));
+}
+
+// A copy with a zero extent in any dimension is rejected.
+TEST_P(olMemcpyRectTest, InvalidSize) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, {0, 0, 0}));
+}
+
+// Source base pointers must be 4-byte aligned.
+// NOTE(review): restored the `<char *>` template argument whose angle
+// brackets were stripped in this copy of the patch.
+TEST_P(olMemcpyRectTest, InvalidSrcPtrAlign) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = &static_cast<char *>(DevicePtr)[2];
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+// Destination base pointers must be 4-byte aligned.
+// NOTE(review): restored the `<char *>` template argument whose angle
+// brackets were stripped in this copy of the patch.
+TEST_P(olMemcpyRectTest, InvalidDstPtrAlign) {
+  DstRect.buffer = &static_cast<char *>(HostPtr)[2];
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+// Destination pitch must be a multiple of 4 bytes.
+TEST_P(olMemcpyRectTest, InvalidDstPitchAlign) {
+  DstRect.buffer = HostPtr;
+  DstRect.pitch = 2;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+// Source pitch must be a multiple of 4 bytes.
+TEST_P(olMemcpyRectTest, InvalidSrcPitchAlign) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+  SrcRect.pitch = 2;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+// Destination slice must be a multiple of 4 bytes.
+TEST_P(olMemcpyRectTest, InvalidDstSliceAlign) {
+  DstRect.buffer = HostPtr;
+  DstRect.slice = 2;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+// Source slice must be a multiple of 4 bytes.
+TEST_P(olMemcpyRectTest, InvalidSrcSliceAlign) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+  SrcRect.slice = 2;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}