From ac40b32f7f138f38278ff1f32f9c9a1780de10ef Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Tue, 25 Nov 2025 21:55:23 -0800 Subject: [PATCH] [flang][cuda][rt] Add entry point to get the allocation stream --- flang-rt/lib/cuda/allocator.cpp | 11 +++- .../unittests/Runtime/CUDA/Allocatable.cpp | 51 +++++++++++++++++++ flang/include/flang/Runtime/CUDA/allocator.h | 3 ++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp index 5436051002265..d2aa832883e65 100644 --- a/flang-rt/lib/cuda/allocator.cpp +++ b/flang-rt/lib/cuda/allocator.cpp @@ -19,8 +19,6 @@ #include "flang/Runtime/CUDA/common.h" #include "flang/Support/Fortran.h" -#include "cuda_runtime.h" - namespace Fortran::runtime::cuda { struct DeviceAllocation { @@ -133,6 +131,15 @@ void RTDEF(CUFRegisterAllocator)() { allocatorRegistry.Register( kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified}); } + +cudaStream_t RTDECL(CUFAssociatedGetStream)(void *p) { + int pos = findAllocation(p); + if (pos >= 0) { + cudaStream_t stream = deviceAllocations[pos].stream; + return stream; + } + return nullptr; +} } void *CUFAllocPinned( diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp index 9935ae0eaac2f..4e65326b31a62 100644 --- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp +++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp @@ -121,3 +121,54 @@ TEST(AllocatableCUFTest, StreamDeviceAllocatable) { cudaDeviceSynchronize(); EXPECT_EQ(cudaSuccess, cudaGetLastError()); } + +TEST(AllocatableAsyncTest, StreamDeviceAllocatable) { + using Fortran::common::TypeCategory; + RTNAME(CUFRegisterAllocator)(); + // REAL(4), DEVICE, ALLOCATABLE :: a(:) + auto a{createAllocatable(TypeCategory::Real, 4)}; + a->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); + EXPECT_FALSE(a->HasAddendum()); + RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); + + cudaStream_t stream; + cudaStreamCreate(&stream); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableAllocate) + (*a, /*asyncObject=*/(int64_t *)&stream, /*hasStat=*/false, + /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + cudaStream_t s = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr); + EXPECT_EQ(s, stream); + RTNAME(AllocatableDeallocate) + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(a->IsAllocated()); + cudaDeviceSynchronize(); + + cudaStream_t defaultStream = 0; + RTNAME(AllocatableAllocate) + (*a, /*asyncObject=*/(int64_t *)&defaultStream, /*hasStat=*/false, + /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + cudaStream_t d = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr); + EXPECT_EQ(d, defaultStream); + RTNAME(AllocatableDeallocate) + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(a->IsAllocated()); + cudaDeviceSynchronize(); + + RTNAME(AllocatableAllocate) + (*a, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, + __LINE__); + EXPECT_TRUE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + cudaStream_t empty = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr); + EXPECT_EQ(empty, nullptr); +} diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index 59fdb22b6e663..4e38482a7de30 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -13,11 +13,14 @@ #include "flang/Runtime/descriptor-consts.h" #include "flang/Runtime/entry-names.h" +#include "cuda_runtime.h" + namespace Fortran::runtime::cuda { extern "C" { void RTDECL(CUFRegisterAllocator)(); +cudaStream_t RTDECL(CUFAssociatedGetStream)(void *); } void *CUFAllocPinned(std::size_t, std::int64_t *);