open-mpi · janjust · Sep 17, 2024 · Aug 28, 2024
diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4
@@ -1,5 +1,6 @@
 dnl -*- autoconf -*-
 dnl
+dnl Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
 dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 dnl                         University Research and Technology
 dnl                         Corporation.  All rights reserved.
@@ -118,6 +119,12 @@ AS_IF([test "$opal_check_cuda_happy" = "yes"],
         [#include <$opal_cuda_incdir/cuda.h>])],
     [])
 
+# If we have CUDA support, check to see if we have support for cuMemCreate memory on host NUMA.
+AS_IF([test "$opal_check_cuda_happy"="yes"],
+    [AC_CHECK_DECL([CU_MEM_LOCATION_TYPE_HOST_NUMA], [CUDA_VMM_SUPPORT=1], [CUDA_VMM_SUPPORT=0],
+        [#include <$opal_cuda_incdir/cuda.h>])],
+    [])
+
 # If we have CUDA support, check to see if we have support for SYNC_MEMOPS
 # which was first introduced in CUDA 6.0.
 AS_IF([test "$opal_check_cuda_happy" = "yes"],
@@ -160,6 +167,10 @@ AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
 AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT,
                    [Whether we want cuda device pointer support])
 
+AM_CONDITIONAL([OPAL_cuda_vmm_support], [test "x$CUDA_VMM_SUPPORT" = "x1"])
+AC_DEFINE_UNQUOTED([OPAL_CUDA_VMM_SUPPORT],$CUDA_VMM_SUPPORT,
+                   [Whether we have CU_MEM_LOCATION_TYPE_HOST_NUMA support available])
+
 AM_CONDITIONAL([OPAL_cuda_sync_memops], [test "x$CUDA_SYNC_MEMOPS" = "x1"])
 AC_DEFINE_UNQUOTED([OPAL_CUDA_SYNC_MEMOPS],$CUDA_SYNC_MEMOPS,
                    [Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available])

diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2014-2015 Intel, Inc.  All rights reserved.
  * Copyright (c) 2014      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
@@ -77,9 +78,93 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
     accelerator_cuda_get_buffer_id
 };
 
+static int accelerator_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type,
+                                      int *dev_id)
+{
+#if OPAL_CUDA_VMM_SUPPORT
+    static int device_count = -1;
+    CUmemAllocationProp prop;
+    CUmemLocation location;
+    CUresult result;
+    unsigned long long flags;
+    CUmemGenericAllocationHandle alloc_handle;
+
+    if (device_count == -1) {
+        result = cuDeviceGetCount(&device_count);
+        if (result != CUDA_SUCCESS) {
+            return 0;
+        }
+    }
+
+    result = cuMemRetainAllocationHandle(&alloc_handle, (void*)dbuf);
+    if (result != CUDA_SUCCESS) {
+        return 0;
+    }
+
+    result = cuMemGetAllocationPropertiesFromHandle(&prop, alloc_handle);
+    if (result != CUDA_SUCCESS) {
+        cuMemRelease(alloc_handle);
+        return 0;
+    }
+
+    if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) {
+        *mem_type = CU_MEMORYTYPE_DEVICE;
+        *dev_id  = prop.location.id;
+        cuMemRelease(alloc_handle);
+        return 1;
+    }
+
+    if (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
+        /* check if device has access */
+        for (int i = 0; i < device_count; i++) {
+            location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            location.id   = i;
+            result = cuMemGetAccess(&flags, &location, dbuf);
+            if ((CUDA_SUCCESS == result) &&
+                (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) {
+                *mem_type = CU_MEMORYTYPE_DEVICE;
+                *dev_id  = i;
+                cuMemRelease(alloc_handle);
+                return 1;
+            }
+        }
+    }
+
+    /* host must have access as device access possibility is exhausted */
+    *mem_type = CU_MEMORYTYPE_HOST;
+    *dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+    cuMemRelease(alloc_handle);
+    return 1;
+
+#endif
+
+    return 0;
+}
+
+static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
+    /* query the device from the context */
+    int dev_id = -1;
+    CUdevice ptr_dev;
+    cuCtxPushCurrent(mem_ctx);
+    cuCtxGetDevice(&ptr_dev);
+    for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
+        CUdevice dev;
+        cuDeviceGet(&dev, i);
+        if (dev == ptr_dev) {
+            dev_id = i;
+            break;
+        }
+    }
+    cuCtxPopCurrent(&mem_ctx);
+    return dev_id;
+}
+
 static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
 {
     CUresult result;
+    int is_vmm = 0;
+    int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+    CUmemorytype vmm_mem_type = 0;
     CUmemorytype mem_type = 0;
     CUdeviceptr dbuf = (CUdeviceptr) addr;
     CUcontext ctx = NULL, mem_ctx = NULL;
@@ -91,6 +176,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
 
     *flags = 0;
 
+    is_vmm = accelerator_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id);
+
 #if OPAL_CUDA_GET_ATTRIBUTES
     uint32_t is_managed = 0;
     /* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -120,14 +207,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
             return OPAL_ERROR;
         }
     } else if (CU_MEMORYTYPE_HOST == mem_type) {
-        /* Host memory, nothing to do here */
-        return 0;
+        if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
+            mem_type = CU_MEMORYTYPE_DEVICE;
+            *dev_id = vmm_dev_id;
+        } else {
+            /* Host memory, nothing to do here */
+            return 0;
+        }
     } else if (0 == mem_type) {
         /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
         return 0;
+    } else {
+        if (is_vmm) {
+            *dev_id = vmm_dev_id;
+        } else {
+            /* query the device from the context */
+            *dev_id = accelerator_cuda_get_device_id(mem_ctx);
+        }
     }
-    /* Must be a device pointer */
-    assert(CU_MEMORYTYPE_DEVICE == mem_type);
 #else /* OPAL_CUDA_GET_ATTRIBUTES */
     result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
     if (CUDA_SUCCESS != result) {
@@ -138,12 +235,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
             return OPAL_ERROR;
         }
     } else if (CU_MEMORYTYPE_HOST == mem_type) {
-        /* Host memory, nothing to do here */
-        return 0;
+        if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
+            mem_type = CU_MEMORYTYPE_DEVICE;
+            *dev_id = vmm_dev_id;
+        } else {
+            /* Host memory, nothing to do here */
+            return 0;
+        }
+    } else {
+        if (is_vmm) {
+            *dev_id = vmm_dev_id;
+        } else {
+            result = cuPointerGetAttribute(&mem_ctx,
+                                           CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
+            /* query the device from the context */
+            *dev_id = accelerator_cuda_get_device_id(mem_ctx);
+        }
     }
+#endif /* OPAL_CUDA_GET_ATTRIBUTES */
+
     /* Must be a device pointer */
     assert(CU_MEMORYTYPE_DEVICE == mem_type);
-#endif /* OPAL_CUDA_GET_ATTRIBUTES */
 
     /* This piece of code was added in to handle in a case involving
      * OMP threads.  The user had initialized CUDA and then spawned
@@ -166,6 +278,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
                 return OPAL_ERROR;
             }
 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
+            if (is_vmm) {
+                /* This function is expected to set context if pointer is device
+                 * accessible but VMM allocations have NULL context associated
+                 * which cannot be set against the calling thread */
+                opal_output(0,
+                        "CUDA: unable to set context with the given pointer"
+                        "ptr=%p aborting...", addr);
+                return OPAL_ERROR;
+            }
+
             result = cuCtxSetCurrent(mem_ctx);
             if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
                 opal_output(0,

diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h
@@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp
 
 OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
 
+OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices;
+
 OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);
 
 END_C_DECLS