From ac40b32f7f138f38278ff1f32f9c9a1780de10ef Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval@gmail.com>
Date: Tue, 25 Nov 2025 21:55:23 -0800
Subject: [PATCH] [flang][cuda][rt] Add entry point to get the allocation
 stream

---
 flang-rt/lib/cuda/allocator.cpp               | 11 +++-
 .../unittests/Runtime/CUDA/Allocatable.cpp    | 51 +++++++++++++++++++
 flang/include/flang/Runtime/CUDA/allocator.h  |  3 ++
 3 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 5436051002265..d2aa832883e65 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -19,8 +19,6 @@
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Support/Fortran.h"
 
-#include "cuda_runtime.h"
-
 namespace Fortran::runtime::cuda {
 
 struct DeviceAllocation {
@@ -133,6 +131,15 @@ void RTDEF(CUFRegisterAllocator)() {
   allocatorRegistry.Register(
       kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
 }
+
+cudaStream_t RTDECL(CUFAssociatedGetStream)(void *p) {
+  int pos = findAllocation(p);
+  if (pos >= 0) {
+    cudaStream_t stream = deviceAllocations[pos].stream;
+    return stream;
+  }
+  return nullptr;
+}
 }
 
 void *CUFAllocPinned(
diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index 9935ae0eaac2f..4e65326b31a62 100644
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -121,3 +121,54 @@ TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
   cudaDeviceSynchronize();
   EXPECT_EQ(cudaSuccess, cudaGetLastError());
 }
+
+TEST(AllocatableAsyncTest, StreamDeviceAllocatable) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableAllocate)
+  (*a, /*asyncObject=*/(int64_t *)&stream, /*hasStat=*/false,
+      /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+  cudaStream_t s = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr);
+  EXPECT_EQ(s, stream);
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+  cudaDeviceSynchronize();
+
+  cudaStream_t defaultStream = 0;
+  RTNAME(AllocatableAllocate)
+  (*a, /*asyncObject=*/(int64_t *)&defaultStream, /*hasStat=*/false,
+      /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+  cudaStream_t d = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr);
+  EXPECT_EQ(d, defaultStream);
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+  cudaDeviceSynchronize();
+
+  RTNAME(AllocatableAllocate)
+  (*a, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+  cudaStream_t empty = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr);
+  EXPECT_EQ(empty, nullptr);
+}
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 59fdb22b6e663..4e38482a7de30 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -13,11 +13,14 @@
 #include "flang/Runtime/descriptor-consts.h"
 #include "flang/Runtime/entry-names.h"
 
+#include "cuda_runtime.h"
+
 namespace Fortran::runtime::cuda {
 
 extern "C" {
 
 void RTDECL(CUFRegisterAllocator)();
+cudaStream_t RTDECL(CUFAssociatedGetStream)(void *);
 }
 
 void *CUFAllocPinned(std::size_t, std::int64_t *);