diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index 476a158019d3c..d5602eec0d07c 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -103,7 +103,9 @@ enum TargetAllocTy : int32_t { TARGET_ALLOC_DEVICE = 0, TARGET_ALLOC_HOST, TARGET_ALLOC_SHARED, - TARGET_ALLOC_DEFAULT + TARGET_ALLOC_DEFAULT, + /// The allocation will not block on other streams. + TARGET_ALLOC_DEVICE_NON_BLOCKING, }; inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index fe435a3f55855..0411c67013342 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2112,6 +2112,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemoryPool = CoarseGrainedMemoryPools[0]; break; case TARGET_ALLOC_HOST: @@ -3315,6 +3316,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemoryPool = CoarseGrainedMemoryPools[0]; break; case TARGET_ALLOC_HOST: diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp index 60e0540e96022..54aced11b31c3 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp @@ -62,15 +62,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, "Failed to initialize RPC server for device %d: %d", DeviceId, Err); // Register a custom opcode handler to perform plugin specific allocation. - // FIXME: We need to make sure this uses asynchronous allocations on CUDA. 
auto MallocHandler = [](rpc_port_t Port, void *Data) { rpc_recv_and_send( Port, [](rpc_buffer_t *Buffer, void *Data) { plugin::GenericDeviceTy &Device = *reinterpret_cast<plugin::GenericDeviceTy *>(Data); - Buffer->data[0] = reinterpret_cast<uintptr_t>( - Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE)); + Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate( + Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING)); }, Data); }; @@ -88,7 +87,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, plugin::GenericDeviceTy &Device = *reinterpret_cast<plugin::GenericDeviceTy *>(Data); Device.free(reinterpret_cast<void *>(Buffer->data[0]), - TARGET_ALLOC_DEVICE); + TARGET_ALLOC_DEVICE_NON_BLOCKING); }, Data); }; diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index 56c4404ac2d5c..5ec3adb9e4e3a 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -43,6 +43,7 @@ DLWRAP(cuLaunchKernel, 11) DLWRAP(cuMemAlloc, 2) DLWRAP(cuMemAllocHost, 2) DLWRAP(cuMemAllocManaged, 3) +DLWRAP(cuMemAllocAsync, 3) DLWRAP(cuMemcpyDtoDAsync, 4) DLWRAP(cuMemcpyDtoH, 3) @@ -52,6 +53,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4) DLWRAP(cuMemFree, 1) DLWRAP(cuMemFreeHost, 1) +DLWRAP(cuMemFreeAsync, 2) + DLWRAP(cuModuleGetFunction, 3) DLWRAP(cuModuleGetGlobal, 4) diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h index 3e0307759924b..32031c28f8797 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, CUresult cuMemAlloc(CUdeviceptr *, size_t); CUresult cuMemAllocHost(void **, size_t); CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int); +CUresult cuMemAllocAsync(CUdeviceptr *, size_t, 
CUstream); CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream); CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t); @@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); CUresult cuMemFree(CUdeviceptr); CUresult cuMemFreeHost(void *); +CUresult cuMemFreeAsync(CUdeviceptr, CUstream); CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *); CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *); diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index b0dff917dd0be..0005bff7a8035 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -63,6 +63,14 @@ cuMemGetAllocationGranularity(size_t *granularity, CUmemAllocationGranularity_flags option) {} #endif +#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020)) +// Stub implementations of the asynchronous memory management functions for +// CUDA versions older than 11.2, which lack stream-ordered allocation. +CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = 0; return CUDA_ERROR_NOT_SUPPORTED; } + +CUresult cuMemFreeAsync(CUdeviceptr, CUstream) { return CUDA_ERROR_NOT_SUPPORTED; } +#endif + /// Class implementing the CUDA device images properties. struct CUDADeviceImageTy : public DeviceImageTy { /// Create the CUDA image with the id and the target image pointer. 
@@ -488,6 +496,16 @@ struct CUDADeviceTy : public GenericDeviceTy { Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); MemAlloc = (void *)DevicePtr; break; + case TARGET_ALLOC_DEVICE_NON_BLOCKING: { + CUstream Stream; + if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) + break; + if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream))) { + cuStreamDestroy(Stream); break; } + cuStreamSynchronize(Stream); + Res = cuStreamDestroy(Stream); + MemAlloc = (void *)DevicePtr; + } } if (auto Err = @@ -518,6 +536,15 @@ struct CUDADeviceTy : public GenericDeviceTy { case TARGET_ALLOC_HOST: Res = cuMemFreeHost(TgtPtr); break; + case TARGET_ALLOC_DEVICE_NON_BLOCKING: { + CUstream Stream; + if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) + break; + cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream); + cuStreamSynchronize(Stream); + if ((Res = cuStreamDestroy(Stream))) + break; + } } if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) { diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp index 88b5236d31f48..43569f2505559 100644 --- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -215,6 +215,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy { case TARGET_ALLOC_DEVICE: case TARGET_ALLOC_HOST: case TARGET_ALLOC_SHARED: + case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemAlloc = std::malloc(Size); break; } diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c index c18a724930f41..b587b618472e4 100644 --- a/openmp/libomptarget/test/libc/malloc.c +++ b/openmp/libomptarget/test/libc/malloc.c @@ -13,7 +13,7 @@ int main() { unsigned *d_x; #pragma omp target map(from : d_x) { - d_x = (unsigned *)malloc(sizeof(unsigned)); + d_x = (unsigned *)malloc(sizeof(unsigned)); *d_x = 1; } @@ -23,6 +23,14 @@ int main() { #pragma omp target is_device_ptr(d_x) { free(d_x); } 
+#pragma omp target teams num_teams(64) +#pragma omp parallel num_threads(32) + { + int *ptr = (int *)malloc(sizeof(int)); + *ptr = 42; + free(ptr); + } + // CHECK: PASS if (h_x == 1) fputs("PASS\n", stdout);