From ec8dfb57c1a611af07824a957e555f2a8ef204b2 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Mon, 3 Nov 2025 14:34:25 -0800 Subject: [PATCH] [flang][cuda] Use local scope to avoid duplicate definition --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 23 +++++++++++-------- flang/test/Lower/CUDA/cuda-device-proc.cuf | 4 ++-- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 15ea84565dd75..08ea965173fca 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -3392,13 +3392,15 @@ IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, builder.setInsertionPointToStart(afterBlock); auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - mlir::Value ret = - mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], ns}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); + mlir::Value ret = mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); mlir::scf::YieldOp::create(builder, loc, ret); builder.setInsertionPointAfter(whileOp); return whileOp.getResult(0); @@ -3413,8 +3415,11 @@ IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); return mlir::NVVM::InlinePtxOp::create( builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", {}) .getResult(0); } diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 09b4302446ee7..70c057d3b9143 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -515,7 +515,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() ! CHECK: scf.while -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_barrier_try_wait_sleep() integer :: istat @@ -526,7 +526,7 @@ attributes(global) subroutine test_barrier_try_wait_sleep() end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_tma_bulk_load_c4(a, n) integer(8), shared :: barrier1