Skip to content

Commit

Permalink
[Libomptarget] Fix RPC-based malloc on NVPTX (#72440)
Browse files Browse the repository at this point in the history
Summary:
The device allocator on NVPTX architectures is enqueued to a stream that
the kernel is potentially executing on. This can lead to deadlocks as
the kernel will not proceed until the allocation is complete and the
allocation will not proceed until the kernel is complete. CUDA 11.2
introduced async allocations that we can manually place on separate
streams to combat this. This patch makes a new allocation type that's
guaranteed to be non-blocking so it will actually make progress. Only
Nvidia needs to care about this, as the other vendors' allocators are not
blocking in this way by default.

I had originally tried to make the `alloc` and `free` methods take a
`__tgt_async_info`. However, I observed that with the large volume of
streams being created by a parallel test it quickly locked up the system
as presumably too many streams were being created. This implementation
now just creates a new stream and immediately destroys it. This
obviously isn't very fast, but it at least gets the cases to stop
deadlocking for now.
  • Loading branch information
jhuber6 committed Jan 2, 2024
1 parent 41a07e6 commit fb32977
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 6 deletions.
4 changes: 3 additions & 1 deletion openmp/libomptarget/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,9 @@ enum TargetAllocTy : int32_t {
TARGET_ALLOC_DEVICE = 0,
TARGET_ALLOC_HOST,
TARGET_ALLOC_SHARED,
TARGET_ALLOC_DEFAULT
TARGET_ALLOC_DEFAULT,
/// The allocation will not block on other streams.
TARGET_ALLOC_DEVICE_NON_BLOCKING,
};

inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
Expand Down
2 changes: 2 additions & 0 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2112,6 +2112,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
Expand Down Expand Up @@ -3315,6 +3316,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
Expand Down
7 changes: 3 additions & 4 deletions openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
"Failed to initialize RPC server for device %d: %d", DeviceId, Err);

// Register a custom opcode handler to perform plugin specific allocation.
// FIXME: We need to make sure this uses asynchronous allocations on CUDA.
auto MallocHandler = [](rpc_port_t Port, void *Data) {
rpc_recv_and_send(
Port,
[](rpc_buffer_t *Buffer, void *Data) {
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
Buffer->data[0] = reinterpret_cast<uintptr_t>(
Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
},
Data);
};
Expand All @@ -88,7 +87,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
Device.free(reinterpret_cast<void *>(Buffer->data[0]),
TARGET_ALLOC_DEVICE);
TARGET_ALLOC_DEVICE_NON_BLOCKING);
},
Data);
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ DLWRAP(cuLaunchKernel, 11)
DLWRAP(cuMemAlloc, 2)
DLWRAP(cuMemAllocHost, 2)
DLWRAP(cuMemAllocManaged, 3)
DLWRAP(cuMemAllocAsync, 3)

DLWRAP(cuMemcpyDtoDAsync, 4)
DLWRAP(cuMemcpyDtoH, 3)
Expand All @@ -52,6 +53,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4)

DLWRAP(cuMemFree, 1)
DLWRAP(cuMemFreeHost, 1)
DLWRAP(cuMemFreeAsync, 2)

DLWRAP(cuModuleGetFunction, 3)
DLWRAP(cuModuleGetGlobal, 4)

Expand Down
2 changes: 2 additions & 0 deletions openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
CUresult cuMemAlloc(CUdeviceptr *, size_t);
CUresult cuMemAllocHost(void **, size_t);
CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);

CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
Expand All @@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);

CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
CUresult cuMemFreeAsync(CUdeviceptr, CUstream);

CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
Expand Down
27 changes: 27 additions & 0 deletions openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option) {}
#endif

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
// Stub definitions of the asynchronous memory management entry points for
// CUDA versions that predate their introduction in CUDA 11.2. They fail with
// CUDA_ERROR_NOT_SUPPORTED so callers can detect the missing feature instead
// of invoking undefined behavior (the originals fell off the end of a
// non-void function and assigned nullptr to an integer handle).
CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) {
  // CUdeviceptr is an integer handle type; 0 is the null device pointer.
  *ptr = 0;
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemFreeAsync(CUdeviceptr, CUstream) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif

/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
/// Create the CUDA image with the id and the target image pointer.
Expand Down Expand Up @@ -488,6 +496,16 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
MemAlloc = (void *)DevicePtr;
break;
case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
CUstream Stream;
if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
break;
if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
break;
cuStreamSynchronize(Stream);
Res = cuStreamDestroy(Stream);
MemAlloc = (void *)DevicePtr;
}
}

if (auto Err =
Expand Down Expand Up @@ -518,6 +536,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_HOST:
Res = cuMemFreeHost(TgtPtr);
break;
case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
CUstream Stream;
if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
break;
cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
cuStreamSynchronize(Stream);
if ((Res = cuStreamDestroy(Stream)))
break;
}
}

if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemAlloc = std::malloc(Size);
break;
}
Expand Down
10 changes: 9 additions & 1 deletion openmp/libomptarget/test/libc/malloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ int main() {
unsigned *d_x;
#pragma omp target map(from : d_x)
{
d_x = malloc(sizeof(unsigned));
d_x = (unsigned *)malloc(sizeof(unsigned));
*d_x = 1;
}

Expand All @@ -23,6 +23,14 @@ int main() {
#pragma omp target is_device_ptr(d_x)
{ free(d_x); }

#pragma omp target teams num_teams(64)
#pragma omp parallel num_threads(32)
{
int *ptr = (int *)malloc(sizeof(int));
*ptr = 42;
free(ptr);
}

// CHECK: PASS
if (h_x == 1)
fputs("PASS\n", stdout);
Expand Down

0 comments on commit fb32977

Please sign in to comment.