Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions config/opal_check_cuda.m4
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
dnl -*- autoconf -*-
dnl
dnl Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
dnl University Research and Technology
dnl Corporation. All rights reserved.
Expand Down Expand Up @@ -118,6 +119,12 @@ AS_IF([test "$opal_check_cuda_happy" = "yes"],
[#include <$opal_cuda_incdir/cuda.h>])],
[])

# If we have CUDA support, check to see if we have support for cuMemCreate memory on host NUMA.
# Note the spaces around "=": without them the shell "test" sees a single
# non-empty word and always succeeds. When CUDA is unavailable the probe is
# skipped and the feature is recorded as absent so CUDA_VMM_SUPPORT is always set.
AS_IF([test "$opal_check_cuda_happy" = "yes"],
    [AC_CHECK_DECL([CU_MEM_LOCATION_TYPE_HOST_NUMA], [CUDA_VMM_SUPPORT=1], [CUDA_VMM_SUPPORT=0],
        [#include <$opal_cuda_incdir/cuda.h>])],
    [CUDA_VMM_SUPPORT=0])

# If we have CUDA support, check to see if we have support for SYNC_MEMOPS
# which was first introduced in CUDA 6.0.
AS_IF([test "$opal_check_cuda_happy" = "yes"],
Expand Down Expand Up @@ -160,6 +167,10 @@ AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT,
[Whether we want cuda device pointer support])

AM_CONDITIONAL([OPAL_cuda_vmm_support], [test "x$CUDA_VMM_SUPPORT" = "x1"])
AC_DEFINE_UNQUOTED([OPAL_CUDA_VMM_SUPPORT],$CUDA_VMM_SUPPORT,
[Whether we have CU_MEM_LOCATION_TYPE_HOST_NUMA support available])

AM_CONDITIONAL([OPAL_cuda_sync_memops], [test "x$CUDA_SYNC_MEMOPS" = "x1"])
AC_DEFINE_UNQUOTED([OPAL_CUDA_SYNC_MEMOPS],$CUDA_SYNC_MEMOPS,
[Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available])
Expand Down
136 changes: 129 additions & 7 deletions opal/mca/accelerator/cuda/accelerator_cuda.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/*
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
Expand Down Expand Up @@ -77,9 +78,93 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
accelerator_cuda_get_buffer_id
};

/*
 * Inspect dbuf through the CUDA virtual memory management API and report
 * where its backing allocation lives.
 *
 * dbuf      address to look up
 * mem_type  out: CU_MEMORYTYPE_DEVICE or CU_MEMORYTYPE_HOST
 * dev_id    out: device ordinal, or MCA_ACCELERATOR_NO_DEVICE_ID for host
 *
 * Returns 1 and fills both outputs when an allocation handle could be
 * retained for dbuf and its properties read. Returns 0, leaving the outputs
 * untouched, when any of those driver calls fails or when the build has no
 * OPAL_CUDA_VMM_SUPPORT.
 */
static int accelerator_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type,
                                      int *dev_id)
{
#if OPAL_CUDA_VMM_SUPPORT
    /* Number of devices, queried once and reused on later calls. */
    static int device_count = -1;
    CUmemGenericAllocationHandle handle;
    CUmemAllocationProp props;
    CUmemLocation probe;
    unsigned long long access;
    /* Fallback classification: host memory with no owning device. */
    CUmemorytype kind = CU_MEMORYTYPE_HOST;
    int owner = MCA_ACCELERATOR_NO_DEVICE_ID;
    int recognized = 0;

    if (-1 == device_count && CUDA_SUCCESS != cuDeviceGetCount(&device_count)) {
        return 0;
    }

    if (CUDA_SUCCESS != cuMemRetainAllocationHandle(&handle, (void *) dbuf)) {
        return 0;
    }

    /* From here on the retained handle must be released on every path. */
    if (CUDA_SUCCESS != cuMemGetAllocationPropertiesFromHandle(&props, handle)) {
        goto release;
    }

    switch (props.location.type) {
    case CU_MEM_LOCATION_TYPE_DEVICE:
        kind = CU_MEMORYTYPE_DEVICE;
        owner = props.location.id;
        break;

    case CU_MEM_LOCATION_TYPE_HOST_NUMA:
        /* Report the first device that has read/write access, if any. */
        for (int ordinal = 0; ordinal < device_count; ordinal++) {
            probe.type = CU_MEM_LOCATION_TYPE_DEVICE;
            probe.id = ordinal;
            if (CUDA_SUCCESS == cuMemGetAccess(&access, &probe, dbuf)
                && CU_MEM_ACCESS_FLAGS_PROT_READWRITE == access) {
                kind = CU_MEMORYTYPE_DEVICE;
                owner = ordinal;
                break;
            }
        }
        break;

    default:
        /* Any other location type is reported as host memory. */
        break;
    }

    *mem_type = kind;
    *dev_id = owner;
    recognized = 1;

release:
    cuMemRelease(handle);
    return recognized;

#endif

    return 0;
}

/*
 * Find the ordinal of the device that a CUDA context belongs to.
 *
 * mem_ctx is pushed as the current context just long enough to query its
 * device, then popped again so the caller's context stack is unchanged.
 *
 * Returns the index i in [0, opal_accelerator_cuda_num_devices) for which
 * cuDeviceGet(i) yields the context's device, or -1 when the context cannot
 * be made current, its device cannot be queried, or no ordinal matches.
 */
static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
    int dev_id = -1;
    CUdevice ptr_dev;
    CUcontext popped_ctx = NULL;

    /* If the push fails, nothing was added to the context stack: querying
     * the current device would describe an unrelated context, and popping
     * would remove a context that belongs to the caller. */
    if (CUDA_SUCCESS != cuCtxPushCurrent(mem_ctx)) {
        return dev_id;
    }

    /* Only compare against ptr_dev when the driver actually filled it in. */
    if (CUDA_SUCCESS == cuCtxGetDevice(&ptr_dev)) {
        for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
            CUdevice dev;
            if (CUDA_SUCCESS == cuDeviceGet(&dev, i) && dev == ptr_dev) {
                dev_id = i;
                break;
            }
        }
    }

    /* Balance the successful push above. */
    cuCtxPopCurrent(&popped_ctx);
    return dev_id;
}

static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
{
CUresult result;
int is_vmm = 0;
int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
CUmemorytype vmm_mem_type = 0;
CUmemorytype mem_type = 0;
CUdeviceptr dbuf = (CUdeviceptr) addr;
CUcontext ctx = NULL, mem_ctx = NULL;
Expand All @@ -91,6 +176,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *

*flags = 0;

is_vmm = accelerator_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id);

#if OPAL_CUDA_GET_ATTRIBUTES
uint32_t is_managed = 0;
/* With CUDA 7.0, we can get multiple attributes with a single call */
Expand Down Expand Up @@ -120,14 +207,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
return OPAL_ERROR;
}
} else if (CU_MEMORYTYPE_HOST == mem_type) {
/* Host memory, nothing to do here */
return 0;
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
mem_type = CU_MEMORYTYPE_DEVICE;
*dev_id = vmm_dev_id;
} else {
/* Host memory, nothing to do here */
return 0;
}
} else if (0 == mem_type) {
/* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
return 0;
} else {
if (is_vmm) {
*dev_id = vmm_dev_id;
} else {
/* query the device from the context */
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
}
}
/* Must be a device pointer */
assert(CU_MEMORYTYPE_DEVICE == mem_type);
#else /* OPAL_CUDA_GET_ATTRIBUTES */
result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
if (CUDA_SUCCESS != result) {
Expand All @@ -138,12 +235,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
return OPAL_ERROR;
}
} else if (CU_MEMORYTYPE_HOST == mem_type) {
/* Host memory, nothing to do here */
return 0;
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
mem_type = CU_MEMORYTYPE_DEVICE;
*dev_id = vmm_dev_id;
} else {
/* Host memory, nothing to do here */
return 0;
}
} else {
if (is_vmm) {
*dev_id = vmm_dev_id;
} else {
result = cuPointerGetAttribute(&mem_ctx,
CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
/* query the device from the context */
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
}
}
#endif /* OPAL_CUDA_GET_ATTRIBUTES */

/* Must be a device pointer */
assert(CU_MEMORYTYPE_DEVICE == mem_type);
#endif /* OPAL_CUDA_GET_ATTRIBUTES */

/* This piece of code was added in to handle in a case involving
* OMP threads. The user had initialized CUDA and then spawned
Expand All @@ -166,6 +278,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
return OPAL_ERROR;
}
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
if (is_vmm) {
/* This function is expected to set context if pointer is device
* accessible but VMM allocations have NULL context associated
* which cannot be set against the calling thread */
opal_output(0,
"CUDA: unable to set context with the given pointer"
"ptr=%p aborting...", addr);
return OPAL_ERROR;
}

result = cuCtxSetCurrent(mem_ctx);
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_output(0,
Expand Down
2 changes: 2 additions & 0 deletions opal/mca/accelerator/cuda/accelerator_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp

OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;

OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices;

OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);

END_C_DECLS
Expand Down