Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions opal/mca/accelerator/accelerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,29 @@ typedef int (*opal_accelerator_base_module_memcpy_fn_t)(
int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
opal_accelerator_transfer_type_t type);

/**
* Copies a matrix of memory (height rows of width bytes) synchronously
* from src to dest. Memory of dest and src may not overlap. Optionally
* can specify the transfer type to avoid pointer detection for
* performance.
*
* @param[IN] dev_id Associated device to copy to/from
* @param[IN] dest Destination to copy memory to
* @param[IN] dpitch Pitch of destination memory
* @param[IN] src Source to copy memory from
* @param[IN] spitch Pitch of source memory
* @param[IN] width Width of matrix transfer (columns in bytes)
* @param[IN] height Height of matrix transfer (rows)
* @param[IN] type Transfer type field for performance
* Can be set to MCA_ACCELERATOR_TRANSFER_UNSPEC
* if caller is unsure of the transfer direction.
*
* @return OPAL_SUCCESS or error status on failure
*/
typedef int (*opal_accelerator_base_module_matrix_memcpy_fn_t)(
int dest_dev_id, int src_dev_id, void *dest, size_t dpitch, const void *src, size_t spitch,
size_t width, size_t height, opal_accelerator_transfer_type_t type);

/**
* Copies memory synchronously from src to dest. Memory of dest and src
* may overlap. Optionally can specify the transfer type to
Expand Down Expand Up @@ -373,6 +396,7 @@ typedef struct {

opal_accelerator_base_module_memcpy_async_fn_t memcpy_async;
opal_accelerator_base_module_memcpy_fn_t memcpy;
opal_accelerator_base_module_matrix_memcpy_fn_t matrix_memcpy;
opal_accelerator_base_module_memmove_fn_t memmove;

opal_accelerator_base_module_malloc_fn_t malloc;
Expand Down
111 changes: 111 additions & 0 deletions opal/mca/accelerator/cuda/accelerator_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
size_t size, opal_accelerator_transfer_type_t type);
static int accelerator_cuda_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height,
opal_accelerator_transfer_type_t type);
static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
opal_accelerator_transfer_type_t type);
static int accelerator_cuda_malloc(int dev_id, void **ptr, size_t size);
Expand All @@ -59,6 +63,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =

accelerator_cuda_memcpy_async,
accelerator_cuda_memcpy,
accelerator_cuda_matrix_memcpy,
accelerator_cuda_memmove,
accelerator_cuda_malloc,
accelerator_cuda_free,
Expand Down Expand Up @@ -381,6 +386,112 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
return OPAL_SUCCESS;
}

static int accelerator_cuda_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height,
opal_accelerator_transfer_type_t type)
{
CUDA_MEMCPY2D copy = {0};
CUmemorytype src_type, dest_type;
CUresult result;

if (NULL == dest || NULL == src || width <= 0 || height <= 0) {
return OPAL_ERR_BAD_PARAM;
}

switch (type) {
case MCA_ACCELERATOR_TRANSFER_HTOH:
{
src_type = CU_MEMORYTYPE_HOST;
dest_type = CU_MEMORYTYPE_HOST;
break;
}
case MCA_ACCELERATOR_TRANSFER_HTOD:
{
src_type = CU_MEMORYTYPE_HOST;
dest_type = CU_MEMORYTYPE_DEVICE;
break;
}
case MCA_ACCELERATOR_TRANSFER_DTOH:
{
src_type = CU_MEMORYTYPE_DEVICE;
dest_type = CU_MEMORYTYPE_HOST;
break;
}
case MCA_ACCELERATOR_TRANSFER_DTOD:
{
src_type = CU_MEMORYTYPE_DEVICE;
dest_type = CU_MEMORYTYPE_DEVICE;
break;
}
default:
result = opal_accelerator_cuda_func.cuPointerGetAttribute(&src_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) src);
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
return OPAL_ERROR;
}
result = opal_accelerator_cuda_func.cuPointerGetAttribute(&dest_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) dest);
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
return OPAL_ERROR;
}
}

switch (src_type) {
case (CU_MEMORYTYPE_HOST):
{
copy.srcHost = src;
break;
}
case (CU_MEMORYTYPE_DEVICE):
{
copy.srcDevice = (CUdeviceptr) src;
break;
}
case (CU_MEMORYTYPE_UNIFIED):
{
copy.srcDevice = (CUdeviceptr) src;
break;
}
default:
opal_output(0, "CUDA: cuMemcpy2D failed: Unhandled memory type");
return OPAL_ERROR;
}

switch (dest_type) {
case (CU_MEMORYTYPE_HOST):
{
copy.dstHost = dest;
break;
}
case (CU_MEMORYTYPE_DEVICE):
{
copy.dstDevice = (CUdeviceptr) dest;
break;
}
case (CU_MEMORYTYPE_UNIFIED):
{
copy.dstDevice = (CUdeviceptr) dest;
break;
}
default:
opal_output(0, "CUDA: cuMemcpy2D failed: Unhandled memory type");
return OPAL_ERROR;
}

copy.srcMemoryType = src_type;
copy.srcPitch = spitch;
copy.dstMemoryType = dest_type;
copy.dstPitch = dpitch;
copy.WidthInBytes = width;
copy.Height = height;
result = opal_accelerator_cuda_func.cuMemcpy2D(&copy);
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-accelerator-cuda.txt", "cuMemcpy2D failed", true,
OPAL_PROC_MY_HOSTNAME, result);
return result;
}
return OPAL_SUCCESS;
}

static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
opal_accelerator_transfer_type_t type)
{
Expand Down
13 changes: 13 additions & 0 deletions opal/mca/accelerator/null/accelerator_null_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ static int accelerator_null_memcpy_async(int dest_dev_id, int src_dev_id, void *
opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
size_t size, opal_accelerator_transfer_type_t type);
static int accelerator_null_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height,
opal_accelerator_transfer_type_t type);
static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
opal_accelerator_transfer_type_t type);

Expand Down Expand Up @@ -111,6 +115,7 @@ opal_accelerator_base_module_t opal_accelerator_null_module =

accelerator_null_memcpy_async,
accelerator_null_memcpy,
accelerator_null_matrix_memcpy,
accelerator_null_memmove,
accelerator_null_malloc,
accelerator_null_free,
Expand Down Expand Up @@ -191,6 +196,14 @@ static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest,
return OPAL_SUCCESS;
}

static int accelerator_null_matrix_memcpy(void *dest, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height,
opal_accelerator_transfer_type_t type)
{
return OPAL_ERR_NOT_IMPLEMENTED;
}

static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
opal_accelerator_transfer_type_t type)
{
Expand Down
45 changes: 45 additions & 0 deletions opal/mca/accelerator/rocm/accelerator_rocm_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ static int mca_accelerator_rocm_memcpy_async(int dest_dev_id, int src_dev_id, vo
opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
size_t size, opal_accelerator_transfer_type_t type);
static int mca_accelerator_rocm_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height,
opal_accelerator_transfer_type_t type);
static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
opal_accelerator_transfer_type_t type);
static int mca_accelerator_rocm_malloc(int dev_id, void **ptr, size_t size);
Expand All @@ -50,6 +54,7 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =

mca_accelerator_rocm_memcpy_async,
mca_accelerator_rocm_memcpy,
mca_accelerator_rocm_matrix_memcpy,
mca_accelerator_rocm_memmove,
mca_accelerator_rocm_malloc,
mca_accelerator_rocm_free,
Expand Down Expand Up @@ -298,6 +303,46 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de
return OPAL_SUCCESS;
}

static int mca_accelerator_rocm_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height,
opal_accelerator_transfer_type_t type)
{
hipError_t err;

if (NULL == dest || NULL == src || width <= 0 || height <= 0 ||
dpitch <= 0 || spitch <= 0) {
return OPAL_ERR_BAD_PARAM;
}

if (opal_accelerator_rocm_memcpy_async) {
err = HIP_FUNCS.hipMemcpy2DAsync(dest, dpitch, src, spitch, width, height,
hipMemcpyDefault, opal_accelerator_rocm_MemcpyStream);
if (hipSuccess != err ) {
opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
"error starting async 2Dcopy\n");
return OPAL_ERROR;
}

err = HIP_FUNCS.hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream);
if (hipSuccess != err ) {
opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
"error synchronizing stream after async 2Dcopy\n");
return OPAL_ERROR;
}
} else {
err = HIP_FUNCS.hipMemcpy2D(dest, dpitch, src, spitch, width, height,
hipMemcpyDefault);
if (hipSuccess != err ) {
opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
"error during synchronous 2Dcopy\n");
return OPAL_ERROR;
}
}

return OPAL_SUCCESS;
}

static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest,
const void *src, size_t size,
opal_accelerator_transfer_type_t type)
Expand Down