From e6b50060a32af276b0d79ba608d8c44e9e34137e Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Mon, 3 Nov 2025 15:19:00 -0800 Subject: [PATCH] [flang][cuda] Switch to inline ptx for barrier_arrive --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 13 ++++++------- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 15ea84565dd75..e52b752e1a8bb 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -3345,13 +3345,12 @@ IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, assert(args.size() == 2); mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); - // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and - // currently just the sink symbol `_`. - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], - {}); - return fir::LoadOp::create(builder, loc, token); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); } // BARRIER_INIT (CUDA) diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 09b4302446ee7..eba1bf95e88fd 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -440,7 +440,7 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64 attributes(global) subroutine test_fence()