diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index 3ab6f198236..7f3a2c56437 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -241,6 +241,29 @@ typedef int (*opal_accelerator_base_module_memcpy_fn_t)( int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +/** + * Copies a matrix of memory (height rows of width bytes) synchronously + * from src to dest. Memory of dest and src may not overlap. Optionally + * can specify the transfer type to avoid pointer detection for + * performance. + * + * @param[IN] dev_id Associated device to copy to/from + * @param[IN] dest Destination to copy memory to + * @param[IN] dpitch Pitch of destination memory + * @param[IN] src Source to copy memory from + * @param[IN] spitch Pitch of source memory + * @param[IN] width Width of matrix transfer (columns in bytes) + * @param[IN] height Height of matrix transfer (rows) + * @param[IN] type Transfer type field for performance + * Can be set to MCA_ACCELERATOR_TRANSFER_UNSPEC + * if caller is unsure of the transfer direction. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_matrix_memcpy_fn_t)( + int dest_dev_id, int src_dev_id, void *dest, size_t dpitch, const void *src, size_t spitch, + size_t width, size_t height, opal_accelerator_transfer_type_t type); + /** * Copies memory synchronously from src to dest. Memory of dest and src * may overlap. Optionally can specify the transfer type to @@ -373,6 +396,7 @@ typedef struct { opal_accelerator_base_module_memcpy_async_fn_t memcpy_async; opal_accelerator_base_module_memcpy_fn_t memcpy; + opal_accelerator_base_module_matrix_memcpy_fn_t matrix_memcpy; opal_accelerator_base_module_memmove_fn_t memmove; opal_accelerator_base_module_malloc_fn_t malloc; diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 2cf617d267e..baad0aa3792 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -34,6 +34,10 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_cuda_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, + opal_accelerator_transfer_type_t type); static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_cuda_malloc(int dev_id, void **ptr, size_t size); @@ -59,6 +63,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_memcpy_async, accelerator_cuda_memcpy, + accelerator_cuda_matrix_memcpy, accelerator_cuda_memmove, accelerator_cuda_malloc, accelerator_cuda_free, @@ -381,6 +386,112 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } +static int accelerator_cuda_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, + opal_accelerator_transfer_type_t type) +{ + CUDA_MEMCPY2D copy = {0}; + CUmemorytype src_type, dest_type; + CUresult result; + + if (NULL == dest || NULL == src || width <= 0 || height <= 0) { + return OPAL_ERR_BAD_PARAM; + } + + switch (type) { + case MCA_ACCELERATOR_TRANSFER_HTOH: + { + src_type = CU_MEMORYTYPE_HOST; + dest_type = CU_MEMORYTYPE_HOST; + break; + } + case MCA_ACCELERATOR_TRANSFER_HTOD: + { + src_type = CU_MEMORYTYPE_HOST; + dest_type = CU_MEMORYTYPE_DEVICE; + break; + } + case MCA_ACCELERATOR_TRANSFER_DTOH: + { + src_type = CU_MEMORYTYPE_DEVICE; + dest_type = CU_MEMORYTYPE_HOST; + break; + } + case MCA_ACCELERATOR_TRANSFER_DTOD: + { + src_type = CU_MEMORYTYPE_DEVICE; + dest_type = CU_MEMORYTYPE_DEVICE; + break; + } + default: + result = opal_accelerator_cuda_func.cuPointerGetAttribute(&src_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) src); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return OPAL_ERROR; + } + result = opal_accelerator_cuda_func.cuPointerGetAttribute(&dest_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) dest); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return OPAL_ERROR; + } + } + + switch (src_type) { + case (CU_MEMORYTYPE_HOST): + { + copy.srcHost = src; + break; + } + case (CU_MEMORYTYPE_DEVICE): + { + copy.srcDevice = (CUdeviceptr) src; + break; + } + case (CU_MEMORYTYPE_UNIFIED): + { + copy.srcDevice = (CUdeviceptr) src; + break; + } + default: + opal_output(0, "CUDA: cuMemcpy2D failed: Unhandled memory type"); + return OPAL_ERROR; + } + + switch (dest_type) { + case (CU_MEMORYTYPE_HOST): + { + copy.dstHost = dest; + break; + } + case (CU_MEMORYTYPE_DEVICE): + { + copy.dstDevice = (CUdeviceptr) dest; + break; + } + case (CU_MEMORYTYPE_UNIFIED): + { + copy.dstDevice = (CUdeviceptr) dest; + break; + } + default: + opal_output(0, "CUDA: cuMemcpy2D failed: Unhandled memory type"); + return OPAL_ERROR; + } + + copy.srcMemoryType = src_type; + copy.srcPitch = spitch; + copy.dstMemoryType = dest_type; + copy.dstPitch = dpitch; + copy.WidthInBytes = width; + copy.Height = height; + result = opal_accelerator_cuda_func.cuMemcpy2D(©); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemcpy2D failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return result; + } + return OPAL_SUCCESS; +} + static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type) { diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 1f68ea0e6ff..229e19763e1 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -48,6 +48,10 @@ static int accelerator_null_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_null_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, + opal_accelerator_transfer_type_t type); static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); @@ -111,6 +115,7 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_memcpy_async, accelerator_null_memcpy, + accelerator_null_matrix_memcpy, accelerator_null_memmove, accelerator_null_malloc, accelerator_null_free, @@ -191,6 +196,14 @@ static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } +static int accelerator_null_matrix_memcpy(void *dest, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, + opal_accelerator_transfer_type_t type) +{ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type) { diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index e86e4a4939e..98986a6cc99 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -24,6 +24,10 @@ static int mca_accelerator_rocm_memcpy_async(int dest_dev_id, int src_dev_id, vo opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_rocm_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, + opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_malloc(int dev_id, void **ptr, size_t size); @@ -50,6 +54,7 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_memcpy_async, mca_accelerator_rocm_memcpy, + mca_accelerator_rocm_matrix_memcpy, mca_accelerator_rocm_memmove, mca_accelerator_rocm_malloc, mca_accelerator_rocm_free, @@ -298,6 +303,46 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de return OPAL_SUCCESS; } +static int mca_accelerator_rocm_matrix_memcpy(int dest_dev_id, int src_dev_id, void *dest, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, + opal_accelerator_transfer_type_t type) +{ + hipError_t err; + + if (NULL == dest || NULL == src || width <= 0 || height <= 0 || + dpitch <= 0 || spitch <= 0) { + return OPAL_ERR_BAD_PARAM; + } + + if (opal_accelerator_rocm_memcpy_async) { + err = HIP_FUNCS.hipMemcpy2DAsync(dest, dpitch, src, spitch, width, height, + hipMemcpyDefault, opal_accelerator_rocm_MemcpyStream); + if (hipSuccess != err ) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error starting async 2Dcopy\n"); + return OPAL_ERROR; + } + + err = HIP_FUNCS.hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + if (hipSuccess != err ) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error synchronizing stream after async 2Dcopy\n"); + return OPAL_ERROR; + } + } else { + err = HIP_FUNCS.hipMemcpy2D(dest, dpitch, src, spitch, width, height, + hipMemcpyDefault); + if (hipSuccess != err ) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error during synchronous 2Dcopy\n"); + return OPAL_ERROR; + } + } + + return OPAL_SUCCESS; +} + static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type)