From d400fe59a5b9beddea3bc4b2d4568ccd7cbcfa66 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Sat, 11 Oct 2025 12:09:27 -0700 Subject: [PATCH 1/2] [flang][cuda] Add interface and lowering for fence_proxy_async --- .../flang/Optimizer/Builder/IntrinsicCall.h | 1 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 15 +++++++++++++++ flang/module/cudadevice.f90 | 5 +++++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 7 +++++++ 4 files changed, 28 insertions(+) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index ca02693c53aeb..d0a96a512c2e7 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -274,6 +274,7 @@ struct IntrinsicLibrary { llvm::ArrayRef); template mlir::Value genExtremum(mlir::Type, llvm::ArrayRef); + void genFenceProxyAsync(llvm::ArrayRef); mlir::Value genFloor(mlir::Type, llvm::ArrayRef); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef args); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index c9cf6c23a81a5..4890225db452f 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -502,6 +502,10 @@ static constexpr IntrinsicHandler handlers[]{ &I::genExtendsTypeOf, {{{"a", asBox}, {"mold", asBox}}}, /*isElemental=*/false}, + {"fence_proxy_async", + &I::genFenceProxyAsync, + {}, + /*isElemental=*/false}, {"findloc", &I::genFindloc, {{{"array", asBox}, @@ -4354,6 +4358,17 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType, fir::getBase(args[1]))); } +// FENCE_PROXY_ASYNC (CUDA) +void IntrinsicLibrary::genFenceProxyAsync( + llvm::ArrayRef args) { + assert(args.size() == 0); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + // FINDLOC fir::ExtendedValue IntrinsicLibrary::genFindloc(mlir::Type resultType, diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index e6c9e958af365..548298ef854c9 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -2008,6 +2008,11 @@ attributes(device) function barrier_arrive_cnt(barrier, count) result(token) end function end interface + interface + attributes(device) subroutine fence_proxy_async() + end subroutine + end interface + contains attributes(device) subroutine syncthreads() diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 1bf714010f5d3..378d8ddf65ad9 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -419,3 +419,10 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> ! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 + +attributes(global) subroutine test_fence() + call fence_proxy_async() +end subroutine + +! CHECK-LABEL: func.func @_QPtest_fence() +! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind, space = #nvvm.shared_space} From 070d11ed9047d968c6b77692b9f5d443744f506f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Sat, 11 Oct 2025 12:27:03 -0700 Subject: [PATCH 2/2] Update flang/test/Lower/CUDA/cuda-device-proc.cuf --- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 49cd026c8fbef..50c0938a09cab 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -435,4 +435,4 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma() ! CHECK: nvvm.cp.async.bulk.commit.group -! CHECK: nvvm.cp.async.bulk.wait_group 0 \ No newline at end of file +! CHECK: nvvm.cp.async.bulk.wait_group 0