Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions offload/include/Shared/APITypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,15 @@ struct KernelLaunchParamsTy {
/// Ptrs to the Data entries. Only strictly required for the host plugin.
void **Ptrs = nullptr;
};

/// Rectangular range for rect memcopies. Should be the same layout as
/// liboffload's `ol_memcpy_rect_t`.
struct MemcpyRectTy {
void *Base;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initialize them, like those structs above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like that would make MemcpyRectTy non-trivially-copyable, which means it can't be used in bit_cast.

uint32_t Offset[3];
size_t Pitch;
size_t Slice;
};
}

#endif // OMPTARGET_SHARED_API_TYPES_H
51 changes: 50 additions & 1 deletion offload/liboffload/API/Memory.td
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def ol_alloc_type_t : Enum {
def olMemAlloc : Function {
let desc = "Creates a memory allocation on the specified device.";
let details = [
"All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory."
"All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory.",
"The returned memory allocation will be aligned at least to a 4 byte boundary.",
];
let params = [
Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
Expand Down Expand Up @@ -113,6 +114,54 @@ def olMemcpy : Function {
let returns = [];
}

// Describes one side (source or destination) of an `olMemcpyRect` copy: the
// backing buffer plus the offset, pitch and slice used to address a region
// inside it.
// The implementation bit_casts this struct to the plugin-side `MemcpyRectTy`,
// so the member order and member sizes must be kept in sync with that struct.
def ol_memcpy_rect_t : Struct {
let desc = "A 3D view into a buffer for `olMemcpyRect`";
let members = [
StructMember<"void*", "buffer", "the buffer backing this range">,
StructMember<"ol_dimensions_t", "offset", "byte coordinate offset into the space">,
StructMember<"size_t", "pitch", "the pitch of the buffer in bytes (i.e. how large each `x` row is)">,
StructMember<"size_t", "slice", "the slice of the buffer in bytes (i.e. how large each `pitch * y` plane is)">,
];
}

// Rectangular (2D/3D) copy between two `ol_memcpy_rect_t` views.
// Unlike a plain host-to-host `memcpy`, a copy where both devices are the host
// device is always rejected with OL_ERRC_INVALID_ARGUMENT, whether or not a
// queue is supplied, so the details below state that requirement
// unconditionally.
def olMemcpyRect : Function {
  let desc = "Enqueue a 2D or 3D memcpy operation.";
  let details = [
    "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.",
    "Either the source or destination (or both) must have a non-host device",
    "For both the source and destination, the base pointer, pitch and slice must all be aligned to 4 bytes",
    "For 2D copies (where `Size.z` is 1), the slice value is ignored",
    "If a queue is not specified, the memcpy happens synchronously",
  ];
  let params = [
    Param<"ol_queue_handle_t", "Queue", "handle of the queue.", PARAM_IN_OPTIONAL>,
    Param<"ol_memcpy_rect_t", "DstRect", "rectangular region to copy to", PARAM_IN>,
    Param<"ol_device_handle_t", "DstDevice", "device that DstRect belongs to", PARAM_IN>,
    Param<"ol_memcpy_rect_t", "SrcRect", "rectangular region to copy from", PARAM_IN>,
    Param<"ol_device_handle_t", "SrcDevice", "device that SrcRect belongs to", PARAM_IN>,
    Param<"ol_dimensions_t", "Size", "size in bytes of data to copy", PARAM_IN>,
  ];
  let returns = [
    Return<"OL_ERRC_INVALID_SIZE", [
      "`DstRect.pitch % 4 > 0`",
      "`DstRect.slice % 4 > 0`",
      "`(uintptr_t)DstRect.buffer % 4 > 0`",
      "`SrcRect.pitch % 4 > 0`",
      "`SrcRect.slice % 4 > 0`",
      "`(uintptr_t)SrcRect.buffer % 4 > 0`",
      "`Size.x == 0 || Size.y == 0 || Size.z == 0`",
    ]>,
    Return<"OL_ERRC_INVALID_NULL_POINTER", [
      "`DstRect.buffer == NULL`",
      "`SrcRect.buffer == NULL`",
    ]>,
    Return<"OL_ERRC_INVALID_ARGUMENT", [
      "Both arguments are the host device",
    ]>
  ];
}

def olMemFill : Function {
let desc = "Fill memory with copies of the given pattern";
let details = [
Expand Down
30 changes: 30 additions & 0 deletions offload/liboffload/src/OffloadImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,36 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
return Error::success();
}

/// Implementation of `olMemcpyRect`: copies a 2D/3D region described by
/// \p SrcRect on \p SrcDevice into the region described by \p DstRect on
/// \p DstDevice.
///
/// \param Queue     Optional queue; when null the copy is synchronous.
/// \param DstRect   Destination view (buffer, offset, pitch, slice).
/// \param DstDevice Device that owns the destination buffer.
/// \param SrcRect   Source view (buffer, offset, pitch, slice).
/// \param SrcDevice Device that owns the source buffer.
/// \param Size      Extent of the region to copy.
/// \returns INVALID_ARGUMENT if both devices are the host device, otherwise
///          whatever the selected plugin entry point returns.
Error olMemcpyRect_impl(ol_queue_handle_t Queue, ol_memcpy_rect_t DstRect,
                        ol_device_handle_t DstDevice, ol_memcpy_rect_t SrcRect,
                        ol_device_handle_t SrcDevice, ol_dimensions_t Size) {
  auto Host = OffloadContext::get().HostDevice;
  if (DstDevice == Host && SrcDevice == Host) {
    return createOffloadError(
        ErrorCode::INVALID_ARGUMENT,
        "one of DstDevice and SrcDevice must be a non-host device");
  }

  // If no queue is given the memcpy will be synchronous
  auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr;

  // The public and plugin-side rect structs are converted bitwise, so they
  // must at least agree in size.
  static_assert(sizeof(ol_memcpy_rect_t) == sizeof(MemcpyRectTy),
                "ol_memcpy_rect_t and MemcpyRectTy must share a layout");
  auto AsPIDst = bit_cast<MemcpyRectTy>(DstRect);
  auto AsPISrc = bit_cast<MemcpyRectTy>(SrcRect);
  uint32_t AsPISize[3] = {Size.x, Size.y, Size.z};

  // Device -> host: the (non-host) source device performs the retrieve.
  if (DstDevice == Host)
    return SrcDevice->Device->dataRetrieveRect(AsPIDst, AsPISrc, AsPISize,
                                               QueueImpl);

  // Host -> device: the (non-host) destination device performs the submit.
  if (SrcDevice == Host)
    return DstDevice->Device->dataSubmitRect(AsPIDst, AsPISrc, AsPISize,
                                             QueueImpl);

  // Device -> device: issue the exchange on the *source* device and hand it
  // the destination device. Calling it on DstDevice while also passing
  // DstDevice as the peer would ignore SrcDevice entirely.
  return SrcDevice->Device->dataExchangeRect(AsPISrc, *DstDevice->Device,
                                             AsPIDst, AsPISize, QueueImpl);
}

Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
const void *PatternPtr, size_t FillSize) {
return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
Expand Down
1 change: 1 addition & 0 deletions offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ DLWRAP(hsa_amd_agent_iterate_memory_pools, 3)
DLWRAP(hsa_amd_memory_pool_allocate, 4)
DLWRAP(hsa_amd_memory_pool_free, 1)
DLWRAP(hsa_amd_memory_async_copy, 8)
// NOTE(review): the second argument is presumably the wrapped function's
// parameter count; the declaration of hsa_amd_memory_async_copy_rect in
// hsa_ext_amd.h takes ten parameters, which matches the 10 here.
DLWRAP(hsa_amd_memory_async_copy_rect, 10)
DLWRAP(hsa_amd_memory_pool_get_info, 3)
DLWRAP(hsa_amd_agents_allow_access, 4)
DLWRAP(hsa_amd_memory_lock, 5)
Expand Down
20 changes: 20 additions & 0 deletions offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,26 @@ hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent,
const hsa_signal_t *dep_signals,
hsa_signal_t completion_signal);

// Direction of a copy; passed as the `dir` argument of
// hsa_amd_memory_async_copy_rect.
// NOTE(review): the enumerator values are presumably required to match the
// ROCm HSA runtime's own definition — confirm before changing them.
enum hsa_amd_copy_direction_t {
hsaHostToHost = 0,
hsaHostToDevice = 1,
hsaDeviceToHost = 2,
hsaDeviceToDevice = 3,
};

// Pointer plus pitch/slice sizes; used for the `dst` and `src` arguments of
// hsa_amd_memory_async_copy_rect.
typedef struct hsa_pitched_ptr_s {
// Start of the buffer being addressed.
void *base;
// NOTE(review): presumably the size in bytes of one row — confirm against
// the HSA runtime documentation.
size_t pitch;
// NOTE(review): presumably the size in bytes of one 2D plane — confirm
// against the HSA runtime documentation.
size_t slice;
} hsa_pitched_ptr_t;

// Declaration of the HSA AMD extension entry point for asynchronous
// rectangular copies.
//   dst / dst_offset  - destination pitched pointer and offset into it
//   src / src_offset  - source pitched pointer and offset into it
//   range             - extent of the region to copy
//   copy_agent        - agent that carries out the copy
//   dir               - copy direction (see hsa_amd_copy_direction_t)
//   num_dep_signals / dep_signals - signals the copy depends on
//   completion_signal - signal associated with completion of the copy
// NOTE(review): the parameter descriptions above are inferred from the
// parameter names and types; confirm the exact semantics (units of the
// offsets/range, signal behaviour) against the HSA runtime documentation.
hsa_status_t hsa_amd_memory_async_copy_rect(
const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
const hsa_dim3_t *range, hsa_agent_t copy_agent,
hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);

hsa_status_t hsa_amd_agent_memory_pool_get_info(
hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
hsa_amd_agent_memory_pool_info_t attribute, void *value);
Expand Down
Loading
Loading