Skip to content

Commit

Permalink
Merge pull request #9857 from yosefe/topic/ucm-cuda-test-add-memory-h…
Browse files Browse the repository at this point in the history
…ooks-for

UCM/CUDA/TEST: Add memory hooks for cuMemMap/Unmap
  • Loading branch information
yosefe committed May 4, 2024
2 parents 6e66c9d + 1406867 commit e11197c
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 40 deletions.
94 changes: 55 additions & 39 deletions src/ucm/cuda/cudamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,17 @@

/* Create a body of CUDA memory allocation replacement function */
#define UCM_CUDA_ALLOC_FUNC(_name, _mem_type, _retval, _success, _size, \
_ptr_type, _args_fmt, ...) \
_retval ucm_##_name(_ptr_type *ptr_p, UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \
_ptr_type, _ref, _args_fmt, ...) \
_retval ucm_##_name(_ptr_type _ref ptr_arg, \
UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \
{ \
_ptr_type ptr; \
_retval ret; \
\
ucm_event_enter(); \
ret = ucm_orig_##_name(ptr_p, UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
ret = ucm_orig_##_name(ptr_arg, UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
if (ret == (_success)) { \
ptr = *ptr_p; \
ptr = _ref ptr_arg; \
ucm_trace("%s(" _args_fmt ") allocated %p", __FUNCTION__, \
UCM_FUNC_PASS_ARGS(__VA_ARGS__), (void*)ptr); \
ucm_cuda_dispatch_mem_alloc((CUdeviceptr)ptr, (_size), \
Expand All @@ -45,16 +46,16 @@
}

/* Create a body of CUDA memory release replacement function */
#define UCM_CUDA_FREE_FUNC(_name, _mem_type, _retval, _ptr_arg, _args_fmt, \
...) \
#define UCM_CUDA_FREE_FUNC(_name, _mem_type, _retval, _ptr_arg, _size, \
_args_fmt, ...) \
_retval ucm_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \
{ \
_retval ret; \
\
ucm_event_enter(); \
ucm_trace("%s(" _args_fmt ")", __FUNCTION__, \
UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
ucm_cuda_dispatch_mem_free((CUdeviceptr)(_ptr_arg), _mem_type, \
ucm_cuda_dispatch_mem_free((CUdeviceptr)(_ptr_arg), _size, _mem_type, \
#_name); \
ret = ucm_orig_##_name(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
ucm_event_leave(); \
Expand Down Expand Up @@ -84,6 +85,9 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocPitch, CUresult, -1, CUdeviceptr*,
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocPitch_v2, CUresult, -1,
CUdeviceptr*, size_t*, size_t, size_t,
unsigned int)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemMap, CUresult, -1, CUdeviceptr, size_t,
size_t, CUmemGenericAllocationHandle,
unsigned long long)
#if CUDA_VERSION >= 11020
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocAsync, CUresult, -1, CUdeviceptr*,
size_t, CUstream)
Expand All @@ -94,6 +98,7 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree, CUresult, -1, CUdeviceptr)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree_v2, CUresult, -1, CUdeviceptr)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost, CUresult, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost_v2, CUresult, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemUnmap, CUresult, -1, CUdeviceptr, size_t)
#if CUDA_VERSION >= 11020
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeAsync, CUresult, -1, CUdeviceptr,
CUstream)
Expand Down Expand Up @@ -131,28 +136,30 @@ static void ucm_cuda_dispatch_mem_alloc(CUdeviceptr ptr, size_t length,
ucm_event_dispatch(UCM_EVENT_MEM_TYPE_ALLOC, &event);
}

static void ucm_cuda_dispatch_mem_free(CUdeviceptr ptr,
static void ucm_cuda_dispatch_mem_free(CUdeviceptr ptr, size_t length,
ucs_memory_type_t mem_type,
const char *func_name)
{
ucm_event_t event;
CUdeviceptr pbase;
size_t length;
CUresult ret;

if (ptr == 0) {
return;
}

ret = cuMemGetAddressRange(&pbase, &length, ptr);
if (ret == CUDA_SUCCESS) {
if (ptr != pbase) {
ucm_warn("%s(%p) called with unexpected pointer (expected: %p)",
func_name, (void*)ptr, (void*)pbase);
if (length == 0) {
/* If length is unknown, try to detect it */
ret = cuMemGetAddressRange(&pbase, &length, ptr);
if (ret == CUDA_SUCCESS) {
if (ptr != pbase) {
ucm_warn("%s(%p) called with unexpected pointer (expected: %p)",
func_name, (void*)ptr, (void*)pbase);
}
} else {
ucm_debug("cuMemGetAddressRange(devPtr=%p) failed", (void*)ptr);
length = 1; /* set minimum length */
}
} else {
ucm_debug("cuMemGetAddressRange(devPtr=%p) failed", (void*)ptr);
length = 1; /* set minimum length */
}

event.mem_type.address = (void*)ptr;
Expand All @@ -163,40 +170,46 @@ static void ucm_cuda_dispatch_mem_free(CUdeviceptr ptr,

/* Driver API replacements */
UCM_CUDA_ALLOC_FUNC(cuMemAlloc, UCS_MEMORY_TYPE_CUDA, CUresult, CUDA_SUCCESS,
arg0, CUdeviceptr, "size=%zu", size_t)
arg0, CUdeviceptr, *, "size=%zu", size_t)
UCM_CUDA_ALLOC_FUNC(cuMemAlloc_v2, UCS_MEMORY_TYPE_CUDA, CUresult, CUDA_SUCCESS,
arg0, CUdeviceptr, "size=%zu", size_t)
arg0, CUdeviceptr, *, "size=%zu", size_t)
UCM_CUDA_ALLOC_FUNC(cuMemAllocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED, CUresult,
CUDA_SUCCESS, arg0, CUdeviceptr, "size=%zu flags=0x%x",
CUDA_SUCCESS, arg0, CUdeviceptr, *, "size=%zu flags=0x%x",
size_t, unsigned)
UCM_CUDA_ALLOC_FUNC(cuMemAllocPitch, UCS_MEMORY_TYPE_CUDA, CUresult,
CUDA_SUCCESS, (size_t)arg1 * arg2, CUdeviceptr,
CUDA_SUCCESS, ((size_t)arg1) * (arg2), CUdeviceptr, *,
"pitch=%p width=%zu height=%zu elem=%u", size_t*, size_t,
size_t, unsigned)
UCM_CUDA_ALLOC_FUNC(cuMemAllocPitch_v2, UCS_MEMORY_TYPE_CUDA, CUresult,
CUDA_SUCCESS, (size_t)arg1 * arg2, CUdeviceptr,
CUDA_SUCCESS, ((size_t)arg1) * (arg2), CUdeviceptr, *,
"pitch=%p width=%zu height=%zu elem=%u", size_t*, size_t,
size_t, unsigned)
/*
 * cuMemMap() replacement. Unlike the cuMemAlloc*() hooks, which receive a
 * pointer to the resulting address and instantiate the macro with '*' as the
 * dereference token, cuMemMap() takes the device address by value, so the
 * dereference token is left empty and the first argument is used as-is.
 * The mapped size (arg0) is reported as the length of the allocation event.
 * NOTE(review): the memory type is passed as UCS_MEMORY_TYPE_UNKNOWN -
 * presumably the actual type is resolved further down the dispatch path;
 * confirm against ucm_cuda_dispatch_mem_alloc().
 */
UCM_CUDA_ALLOC_FUNC(cuMemMap, UCS_MEMORY_TYPE_UNKNOWN, CUresult, CUDA_SUCCESS,
arg0, CUdeviceptr, ,
"size=%zu offset=%zu handle=0x%llx flags=0x%llx", size_t,
size_t, CUmemGenericAllocationHandle, unsigned long long)
#if CUDA_VERSION >= 11020
UCM_CUDA_ALLOC_FUNC(cuMemAllocAsync, UCS_MEMORY_TYPE_CUDA_MANAGED, CUresult,
CUDA_SUCCESS, arg0, CUdeviceptr, "size=%zu stream=%p",
CUDA_SUCCESS, arg0, CUdeviceptr, *, "size=%zu stream=%p",
size_t, CUstream)
UCM_CUDA_ALLOC_FUNC(cuMemAllocFromPoolAsync, UCS_MEMORY_TYPE_CUDA_MANAGED,
CUresult, CUDA_SUCCESS, arg0, CUdeviceptr,
CUresult, CUDA_SUCCESS, arg0, CUdeviceptr, *,
"size=%zu pool=%p stream=%p", size_t, CUmemoryPool,
CUstream)
#endif
UCM_CUDA_FREE_FUNC(cuMemFree, UCS_MEMORY_TYPE_CUDA, CUresult, arg0,
UCM_CUDA_FREE_FUNC(cuMemFree, UCS_MEMORY_TYPE_CUDA, CUresult, arg0, 0,
"ptr=0x%llx", CUdeviceptr)
UCM_CUDA_FREE_FUNC(cuMemFree_v2, UCS_MEMORY_TYPE_CUDA, CUresult, arg0,
UCM_CUDA_FREE_FUNC(cuMemFree_v2, UCS_MEMORY_TYPE_CUDA, CUresult, arg0, 0,
"ptr=0x%llx", CUdeviceptr)
UCM_CUDA_FREE_FUNC(cuMemFreeHost, UCS_MEMORY_TYPE_HOST, CUresult, arg0,
UCM_CUDA_FREE_FUNC(cuMemFreeHost, UCS_MEMORY_TYPE_HOST, CUresult, arg0, 0,
"ptr=%p", void*)
UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, UCS_MEMORY_TYPE_HOST, CUresult, arg0,
UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, UCS_MEMORY_TYPE_HOST, CUresult, arg0, 0,
"ptr=%p", void*)
/*
 * cuMemUnmap() replacement. The caller specifies the unmapped range
 * explicitly (address in arg0, size in arg1), so the size is forwarded to the
 * free event instead of passing 0, which would request length detection.
 * The pointer is traced with a "0x" prefix, same as the other free hooks.
 */
UCM_CUDA_FREE_FUNC(cuMemUnmap, UCS_MEMORY_TYPE_UNKNOWN, CUresult, arg0, arg1,
                   "ptr=0x%llx size=%zu", CUdeviceptr, size_t)
#if CUDA_VERSION >= 11020
UCM_CUDA_FREE_FUNC(cuMemFreeAsync, UCS_MEMORY_TYPE_CUDA_MANAGED, CUresult, arg0,
"ptr=0x%llx, stream=%p", CUdeviceptr, CUstream)
0, "ptr=0x%llx, stream=%p", CUdeviceptr, CUstream)
#endif

static ucm_cuda_func_t ucm_cuda_driver_funcs[] = {
Expand All @@ -205,6 +218,7 @@ static ucm_cuda_func_t ucm_cuda_driver_funcs[] = {
UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged),
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch),
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2),
UCM_CUDA_FUNC_ENTRY(cuMemMap),
#if CUDA_VERSION >= 11020
UCM_CUDA_FUNC_ENTRY(cuMemAllocAsync),
UCM_CUDA_FUNC_ENTRY(cuMemAllocFromPoolAsync),
Expand All @@ -213,6 +227,7 @@ static ucm_cuda_func_t ucm_cuda_driver_funcs[] = {
UCM_CUDA_FUNC_ENTRY(cuMemFree_v2),
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost),
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2),
UCM_CUDA_FUNC_ENTRY(cuMemUnmap),
#if CUDA_VERSION >= 11020
UCM_CUDA_FUNC_ENTRY(cuMemFreeAsync),
#endif
Expand All @@ -221,28 +236,29 @@ static ucm_cuda_func_t ucm_cuda_driver_funcs[] = {

/* Runtime API replacements */
UCM_CUDA_ALLOC_FUNC(cudaMalloc, UCS_MEMORY_TYPE_CUDA, cudaError_t, cudaSuccess,
arg0, void*, "size=%zu", size_t)
arg0, void*, *, "size=%zu", size_t)
UCM_CUDA_ALLOC_FUNC(cudaMallocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED,
cudaError_t, cudaSuccess, arg0, void*,
cudaError_t, cudaSuccess, arg0, void*, *,
"size=%zu flags=0x%x", size_t, unsigned)
UCM_CUDA_ALLOC_FUNC(cudaMallocPitch, UCS_MEMORY_TYPE_CUDA, cudaError_t,
cudaSuccess, (size_t)arg1 * arg2, void*,
cudaSuccess, ((size_t)arg1) * (arg2), void*, *,
"pitch=%p width=%zu height=%zu", size_t*, size_t, size_t)
#if CUDA_VERSION >= 11020
UCM_CUDA_ALLOC_FUNC(cudaMallocAsync, UCS_MEMORY_TYPE_CUDA_MANAGED, cudaError_t,
cudaSuccess, arg0, void*, "size=%zu stream=%p", size_t,
cudaSuccess, arg0, void*, *, "size=%zu stream=%p", size_t,
cudaStream_t)
UCM_CUDA_ALLOC_FUNC(cudaMallocFromPoolAsync, UCS_MEMORY_TYPE_CUDA_MANAGED,
cudaError_t, cudaSuccess, arg0, void*, *,
"size=%zu pool=%p stream=%p", size_t, cudaMemPool_t,
cudaStream_t)
UCM_CUDA_ALLOC_FUNC(cudaMallocFromPoolAsync, UCS_MEMORY_TYPE_CUDA_MANAGED, cudaError_t,
cudaSuccess, arg0, void*, "size=%zu pool=%p stream=%p", size_t,
cudaMemPool_t, cudaStream_t)
#endif
UCM_CUDA_FREE_FUNC(cudaFree, UCS_MEMORY_TYPE_CUDA, cudaError_t, arg0,
UCM_CUDA_FREE_FUNC(cudaFree, UCS_MEMORY_TYPE_CUDA, cudaError_t, arg0, 0,
"devPtr=%p", void*)
UCM_CUDA_FREE_FUNC(cudaFreeHost, UCS_MEMORY_TYPE_HOST, cudaError_t, arg0,
UCM_CUDA_FREE_FUNC(cudaFreeHost, UCS_MEMORY_TYPE_HOST, cudaError_t, arg0, 0,
"ptr=%p", void*)
#if CUDA_VERSION >= 11020
UCM_CUDA_FREE_FUNC(cudaFreeAsync, UCS_MEMORY_TYPE_CUDA_MANAGED, cudaError_t,
arg0, "devPtr=%p, stream=%p", void*, cudaStream_t)
arg0, 0, "devPtr=%p, stream=%p", void*, cudaStream_t)
#endif

static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = {
Expand Down
4 changes: 4 additions & 0 deletions src/ucm/cuda/cudamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ CUresult ucm_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
CUresult ucm_cuMemAllocPitch_v2(CUdeviceptr *dptr, size_t *pPitch,
size_t WidthInBytes, size_t Height,
unsigned int ElementSizeBytes);
/* Replacement for cuMemMap(); the body is generated in cudamem.c by
 * UCM_CUDA_ALLOC_FUNC and reports the mapped range as a memory allocation */
CUresult ucm_cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
CUmemGenericAllocationHandle handle,
unsigned long long flags);
#if CUDA_VERSION >= 11020
CUresult ucm_cuMemAllocAsync(CUdeviceptr *dptr, size_t size, CUstream hStream);
CUresult ucm_cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t size,
Expand All @@ -29,6 +32,7 @@ CUresult ucm_cuMemFree(CUdeviceptr dptr);
CUresult ucm_cuMemFree_v2(CUdeviceptr dptr);
CUresult ucm_cuMemFreeHost(void *p);
CUresult ucm_cuMemFreeHost_v2(void *p);
/* Replacement for cuMemUnmap(); the body is generated in cudamem.c by
 * UCM_CUDA_FREE_FUNC and reports the unmapped range as a memory release */
CUresult ucm_cuMemUnmap(CUdeviceptr ptr, size_t size);
CUresult ucm_cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);

cudaError_t ucm_cudaFree(void *devPtr);
Expand Down
45 changes: 44 additions & 1 deletion test/gtest/ucm/cuda_hooks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*/
#include <ucm/api/ucm.h>
#include <ucs/debug/assert.h>
#include <ucs/sys/ptr_arith.h>
#include <common/test.h>
#include <cuda.h>
#include <cuda_runtime.h>
Expand Down Expand Up @@ -88,6 +89,11 @@ class cuda_hooks : public ucs::test {
check_event_present(m_free_events, "free", ptr, size);
}

/* Read-only accessor for the m_device member of the fixture */
CUdevice device() const { return m_device; }

private:
struct mem_event {
void *address;
Expand Down Expand Up @@ -115,7 +121,7 @@ class cuda_hooks : public ucs::test {
}
}

FAIL() << "Could not file memory " << name << " event for " << ptr
FAIL() << "Could not find memory " << name << " event for " << ptr
<< ".." << UCS_PTR_BYTE_OFFSET(ptr, size) << " type "
<< ucs_memory_type_names[mem_type];
}
Expand Down Expand Up @@ -257,6 +263,43 @@ UCS_TEST_F(cuda_hooks, test_cuMemAllocPitch) {
check_mem_free_events((void*)dptr, width * height);
}

/*
 * Verify that mapping physical memory into a reserved virtual address range
 * with cuMemMap() produces a memory allocation event, and that cuMemUnmap()
 * produces the matching memory free event.
 */
UCS_TEST_F(cuda_hooks, test_cuMemMapUnmap) {
    CUmemAllocationProp prop = {};
    CUmemGenericAllocationHandle handle;
    size_t size, granularity;
    CUdeviceptr ptr;
    CUresult ret;

    /* Describe the allocation before anything else: the granularity query
     * below is made for this specific allocation type and location, so the
     * properties must not be left zero-initialized at that point */
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = device();

    ret = cuMemGetAllocationGranularity(&granularity, &prop,
                                        CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    ASSERT_EQ(ret, CUDA_SUCCESS);

    /* Round the size up to a multiple of the reported granularity */
    size = ucs_align_up(256 * UCS_KBYTE, granularity);

    /* Create the physical allocation */
    ret = cuMemCreate(&handle, size, &prop, 0);
    ASSERT_EQ(ret, CUDA_SUCCESS);

    /* Reserve a virtual address range to map it into */
    ret = cuMemAddressReserve(&ptr, size, 0, 0, 0);
    ASSERT_EQ(ret, CUDA_SUCCESS);

    ret = cuMemMap(ptr, size, 0, handle, 0);
    ASSERT_EQ(ret, CUDA_SUCCESS);
    check_mem_alloc_events((void*)ptr, size, UCS_MEMORY_TYPE_CUDA);

    ret = cuMemUnmap(ptr, size);
    ASSERT_EQ(ret, CUDA_SUCCESS);
    check_mem_free_events((void*)ptr, size);

    /* Release the virtual address range */
    ret = cuMemAddressFree(ptr, size);
    ASSERT_EQ(ret, CUDA_SUCCESS);

    // Free the memory resources
    ret = cuMemRelease(handle);
    ASSERT_EQ(ret, CUDA_SUCCESS);
}

#if CUDA_VERSION >= 11020
UCS_TEST_F(cuda_hooks, test_cuMemAllocAsync) {
CUresult ret;
Expand Down

0 comments on commit e11197c

Please sign in to comment.