
UCT/CUDA_IPC: Add VMM/mallocasync support over fabric handles #9787

Open · wants to merge 10 commits into master
Conversation

Akshay-Venkatesh (Contributor)

What

Allow VMM/mallocAsync allocations created by the user with FABRIC handles to take advantage of the cuda-ipc transport.
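
For context, a minimal sketch of the kind of user allocation this targets: VMM memory created through the CUDA driver API with CU_MEM_HANDLE_TYPE_FABRIC requested as the handle type. The device id, size, and lack of error handling below are illustrative assumptions, not part of this PR:

    #include <cuda.h>

    CUmemAllocationProp prop = {0};
    CUmemGenericAllocationHandle handle;
    size_t granularity, size = 1 << 20;                     /* illustrative size */

    prop.type                 = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type        = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id          = 0;                          /* assume device 0 */
    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;  /* exportable over fabric */

    cuMemGetAllocationGranularity(&granularity, &prop,
                                  CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    size = (size + granularity - 1) / granularity * granularity;
    cuMemCreate(&handle, size, &prop, 0);
    /* followed by cuMemAddressReserve + cuMemMap + cuMemSetAccess as usual for VMM */

Buffers allocated this way (or from a mallocAsync memory pool created with fabric handle support) become eligible for the cuda-ipc transport with this change.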

Resolved review threads (outdated): config/m4/cuda.m4, configure.ac, src/uct/cuda/cuda_copy/cuda_copy_md.c, src/uct/cuda/cuda_ipc/cuda_ipc_md.c
Akshay-Venkatesh (Contributor Author)

@rakhmets I had to squash the commits because the build was complaining about one of the commits' formatting. Sorry about that, but I've addressed your comments.

Akshay-Venkatesh (Contributor Author)

@yosefe / @brminich Are these actually wire compatibility issues here?

Akshay-Venkatesh (Contributor Author)

@yosefe / @brminich Do you think the errors here are coming from the PR?

Resolved review threads (outdated): src/uct/cuda/cuda_copy/cuda_copy_md.c, config/m4/cuda.m4, configure.ac
Review thread:

        (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) {
        *cuda_mem_type = CU_MEMORYTYPE_DEVICE;
    } else {
        status = UCS_ERR_INVALID_ADDR;
Collaborator:
Please add an error message, e.g.:

ucs_error("invalid memory location type %u for host memory type for address %p",
          prop.location.type, address);

Contributor Author:
This isn't actually an error. It's just that we don't want cuda_copy UCT to handle it. So I don't think we should print this error.

Collaborator:
Maybe ucs_debug in this case. The other code paths that return an error have a debug log in this method.
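
For illustration only, a sketch of the debug-level variant; the message wording is an assumption, not the PR's actual code:

    } else {
        ucs_debug("unsupported memory location type %d for address %p",
                  prop.location.type, address);
        status = UCS_ERR_INVALID_ADDR;
    }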

Resolved review thread (outdated): src/uct/cuda/cuda_copy/cuda_copy_md.c
Review thread:

        return status;
    }

    if (cuda_mem_type != CU_MEMORYTYPE_DEVICE) {
Collaborator:
Maybe it would be useful to add an error message here as well, e.g.:

ucs_error("invalid memory type %u for address %p", cuda_mem_type, address);

Contributor Author:
I can add a trace if needed but it shouldn't be a ucs_error.

Collaborator:
I see. How about ucs_debug?
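
Purely illustrative, a debug-level message for this check might look like the following (wording assumed):

    ucs_debug("memory type %u for address %p is not CU_MEMORYTYPE_DEVICE",
              cuda_mem_type, address);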

Resolved review threads: src/uct/cuda/cuda_copy/cuda_copy_md.c, src/uct/cuda/cuda_ipc/cuda_ipc_cache.c, src/uct/cuda/cuda_ipc/cuda_ipc_md.c
Akshay-Venkatesh modified the milestone: v1.12.1 (Apr 8, 2024)
Akshay-Venkatesh (Contributor Author)

@brminich @yosefe I've incorporated feedback from @rakhmets. I would appreciate reviews from you as well.

Review thread:

    if ((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) ||
        (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) ||
        (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) {
        *cuda_mem_type = CU_MEMORYTYPE_DEVICE;
Contributor:
why do we modify it to device here?

Contributor Author:
@brminich This ensures that cuda_ipc considers buffers of this memory type for transfer over NVLink, for instance.

Comment on lines 347 to 351
    cu_err = cuCtxSetFlags(CU_CTX_SYNC_MEMOPS);
    if (cu_err != CUDA_SUCCESS) {
        ucs_warn("cuCtxSetFlags(CU_CTX_SYNC_MEMOPS) for %p error: %s",
                 address, uct_cuda_base_cu_get_error_string(cu_err));
    }
Contributor:
maybe use UCT_CUDADRV_FUNC here?

Contributor:
Why do we set the sync flag on the context for VMM only?

Contributor Author:
@brminich VMM memory cannot use the SYNC_MEMOPS pointer attribute, and the mechanism exposed for it is the API above.

Contributor:
OK, what about using UCT_CUDADRV_FUNC here?
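
As a sketch of that suggestion, keeping the warning log level of the original code (not necessarily the final PR code):

    UCT_CUDADRV_FUNC(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS), UCS_LOG_LEVEL_WARN);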

Resolved review thread: src/uct/cuda/cuda_ipc/cuda_ipc_cache.c
Review thread on uct_cuda_ipc_open_memhandle_mempool:

    uct_cuda_ipc_open_memhandle_mempool(const uct_cuda_ipc_rkey_t *key,
                                        CUdeviceptr *mapped_addr)
    {
        CUmemoryPool *imported_mpool = 0;
Contributor:
Suggested change:
-    CUmemoryPool *imported_mpool = 0;
+    CUmemoryPool *imported_mpool = NULL;

Comment on lines 366 to 390
    ucs_status_t uct_cuda_ipc_close_memhandle(uct_cuda_ipc_cache_region_t *region)
    {
        ucs_status_t status;
    #if HAVE_CUDA_FABRIC
        if (region->key.ph.handle_type == UCT_CUDA_IPC_KEY_HANDLE_TYPE_VMM) {
            status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemUnmap(
                    (CUdeviceptr)region->mapped_addr, region->key.b_len));
            status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemAddressFree(
                    (CUdeviceptr)region->mapped_addr, region->key.b_len));
        } else if (region->key.ph.handle_type == UCT_CUDA_IPC_KEY_HANDLE_TYPE_LEGACY) {
            status = UCT_CUDADRV_FUNC_LOG_ERR(cuIpcCloseMemHandle(
                    (CUdeviceptr)region->mapped_addr));
        } else {
            /* Ideally we call cuMemFreeAsync on imported pointer region here but
             * handles can be closed after device context has been destroyed which
             * would cause cuMemFreeAsync to fail for a given stream */
            status = UCS_OK;
        }
    #else
        status = UCT_CUDADRV_FUNC_LOG_ERR(cuIpcCloseMemHandle(
                (CUdeviceptr)region->mapped_addr));
    #endif

        return status;
    }
Contributor:

Suggested change:

    ucs_status_t uct_cuda_ipc_close_memhandle(uct_cuda_ipc_cache_region_t *region)
    {
        ucs_status_t status;
    #if HAVE_CUDA_FABRIC
        if (region->key.ph.handle_type == UCT_CUDA_IPC_KEY_HANDLE_TYPE_VMM) {
            status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemUnmap(
                    (CUdeviceptr)region->mapped_addr, region->key.b_len));
            status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemAddressFree(
                    (CUdeviceptr)region->mapped_addr, region->key.b_len));
        } else if (region->key.ph.handle_type != UCT_CUDA_IPC_KEY_HANDLE_TYPE_LEGACY) {
            /* Ideally we call cuMemFreeAsync on imported pointer region here but
             * handles can be closed after device context has been destroyed which
             * would cause cuMemFreeAsync to fail for a given stream */
            status = UCS_OK;
        }
    #endif
        status = UCT_CUDADRV_FUNC_LOG_ERR(cuIpcCloseMemHandle(
                (CUdeviceptr)region->mapped_addr));
        return status;
    }

Comment on lines +565 to +569
#if HAVE_CUDA_FABRIC
uct_device_type_t dev_type = UCT_DEVICE_TYPE_NET;
#else
uct_device_type_t dev_type = UCT_DEVICE_TYPE_SHM;
#endif
Contributor:
what if we always define it as NET?

Contributor Author:
@brminich We could do that, but we should probably wait until rkey_unpack failures are handled gracefully, since cuda_ipc would then become eligible for all endpoints.

Resolved review thread: src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
Review thread:

        status = UCT_CUDADRV_FUNC(cuIpcGetMemHandle(&key->ph, (CUdeviceptr)addr),
                                  UCS_LOG_LEVEL_ERROR);
    -   if (UCS_OK != status) {
    +   if (status != UCS_OK) {
Contributor:
minor: no need to change

Review thread:

                                           alloc_handle->length, granularity,
                                           0, 0));
    if (status != UCS_OK) {
        return status;
Contributor:
We need to release the resources allocated earlier (here and in the other error flows of this function), i.e. use cuMemUnmap, cuMemAddressFree, etc.
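
For illustration, a minimal sketch of the goto-style cleanup pattern being asked for; the function name, labels, and device id 0 are hypothetical, not the PR's actual code:

    /* sketch: map a VMM allocation and unwind on any failure */
    static ucs_status_t map_vmm_handle_sketch(CUmemGenericAllocationHandle handle,
                                              size_t length, size_t granularity,
                                              CUdeviceptr *dptr_p)
    {
        CUmemAccessDesc access = {
            .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = 0}, /* assume device 0 */
            .flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE
        };
        CUdeviceptr dptr;
        ucs_status_t status;

        status = UCT_CUDADRV_FUNC_LOG_ERR(
                cuMemAddressReserve(&dptr, length, granularity, 0, 0));
        if (status != UCS_OK) {
            return status;
        }

        status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemMap(dptr, length, 0, handle, 0));
        if (status != UCS_OK) {
            goto err_free_addr;
        }

        status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemSetAccess(dptr, length, &access, 1));
        if (status != UCS_OK) {
            goto err_unmap;
        }

        *dptr_p = dptr;
        return UCS_OK;

    err_unmap:
        UCT_CUDADRV_FUNC_LOG_ERR(cuMemUnmap(dptr, length));
    err_free_addr:
        UCT_CUDADRV_FUNC_LOG_ERR(cuMemAddressFree(dptr, length));
        return status;
    }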


Comment on lines +22 to +27
    typedef enum {
        UCT_CUDA_ALLOC_TYPE_FABRIC,
        UCT_CUDA_ALLOC_TYPE_PINNED,
        UCT_CUDA_ALLOC_TYPE_MANAGED,
        UCT_CUDA_ALLOC_TYPE_LAST = UCT_CUDA_ALLOC_TYPE_MANAGED
    } uct_cuda_alloc_type_t;
Contributor:
Maybe move it to cuda_copy_md.c?

Contributor Author:
I prefer to leave it here as it is referenced/used in cuda_ipc transport and having it in the header file makes it convenient.

Comment on lines +372 to +373
    status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemUnmap(
            (CUdeviceptr)region->mapped_addr, region->key.b_len));
Contributor:
Maybe add an assert, or a check that status is UCS_OK here, before reusing it.
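
One possible reading of that suggestion, shown as an illustrative sketch rather than the PR's final code:

    status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemUnmap(
            (CUdeviceptr)region->mapped_addr, region->key.b_len));
    if (status != UCS_OK) {
        return status; /* avoid overwriting the failure with the next call */
    }

    status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemAddressFree(
            (CUdeviceptr)region->mapped_addr, region->key.b_len));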
