diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 8a3d07043013e..9c8895272f661 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -889,10 +889,7 @@ def NVVM_MBarrierArriveDropNocompleteOp : NVVM_Op<"mbarrier.arrive_drop.nocomple
   }];
 }
 
-def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
-  Arguments<(ins
-    AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
-    I32:$txcount, PtxPredicate:$predicate)> {
+def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx"> {
   let summary = "MBarrier Arrive with Expected Transaction Count";
   let description = [{
     The `nvvm.mbarrier.arrive.expect_tx` operation performs an expect-tx operation
@@ -903,11 +900,11 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
     threads within the CTA. When other threads perform corresponding acquire
     operations (like 'mbarrier.test.wait'), they synchronize with this release
     pattern.
-    This operation first performs an expect-tx operation with the specified transaction
-    count, then performs an arrive-on operation with an implicit count of 1. The
-    expect-tx operation increases the tx-count of the *mbarrier object* by the specified
-    expectCount value, setting the current phase to expect and tracks the completion
-    of additional asynchronous transactions.
+    This operation first performs an expect-tx operation with the specified transaction
+    count, then performs an arrive-on operation with an implicit count of 1. The
+    expect-tx operation increases the tx-count of the *mbarrier object* by the
+    specified value (i.e. `txcount`), setting the current phase to expect and track
+    the completion of additional asynchronous transactions.
 
     The operation takes the following operands:
     - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
@@ -915,11 +912,86 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
     - `txcount`: An unsigned integer specifying the expected transaction count for
       the expect-tx operation. This represents the number of asynchronous transactions
       expected to complete before the barrier phase completes.
-    - `predicate`: Optional predicate for conditional execution.
+    - `scope`: This specifies the set of threads that directly observe the memory
+      synchronizing effect of this operation.
+    - `relaxed`: When set to true, the `arrive` operation has relaxed memory semantics
+      and does not provide any ordering or visibility guarantees.
+    - `predicate`: Optional predicate for conditional execution, used only when
+      lowering to inline PTX.
 
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+  }];
+
+  let results = (outs Optional<I64>:$res);
+  let arguments = (ins
+    AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
+    I32:$txcount,
+    DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
+    DefaultValuedAttr<BoolAttr, "false">:$relaxed,
+    PtxPredicate:$predicate);
+
+  let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? 
attr-dict `:` type(operands) (`->` type($res)^)?";
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    bool hasIntrinsic() { return !getPredicate(); }
+
+    bool getAsmValues(RewriterBase &rewriter,
+                      llvm::SmallVectorImpl<std::pair<mlir::Value,
+                          mlir::NVVM::PTXRegisterMod>> &asmValues);
+
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase &builder);
+  }];
+
+  string llvmBuilder = [{
+    auto [id, args] = NVVM::MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+
+    int addrSpace =
+        llvm::cast<LLVM::LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
+    if (addrSpace != static_cast<int>(NVVM::NVVMMemorySpace::SharedCluster))
+      $res = createIntrinsicCall(builder, id, args);
+    else
+      createIntrinsicCall(builder, id, args);
+  }];
+}
+
+def NVVM_MBarrierArriveDropExpectTxOp : NVVM_Op<"mbarrier.arrive_drop.expect_tx"> {
+  let summary = "MBarrier arrive_drop with expected transaction count";
+  let description = [{
+    The `nvvm.mbarrier.arrive_drop.expect_tx` operation is similar to the
+    `nvvm.mbarrier.arrive.expect_tx` operation except that it performs an
+    `arrive_drop` operation instead of an `arrive` operation.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
+  }];
+
+  let results = (outs Optional<I64>:$res);
+  let arguments = (ins
+    AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
+    I32:$txcount,
+    DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
+    DefaultValuedAttr<BoolAttr, "false">:$relaxed);
+
+  let assemblyFormat = "$addr `,` $txcount attr-dict `:` type(operands) (`->` type($res)^)?";
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase &builder);
+  }];
+
+  string llvmBuilder = [{
+    auto [id, args] = NVVM::MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+
+    int addrSpace =
+        llvm::cast<LLVM::LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
+    if (addrSpace != static_cast<int>(NVVM::NVVMMemorySpace::SharedCluster))
+      $res = createIntrinsicCall(builder, id, args);
+    else
+      createIntrinsicCall(builder, id, args);
   }];
-  let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? 
attr-dict `:` type(operands)";
 }
 
 def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index 3a70f787da124..64a7f562af0e5 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -923,7 +923,11 @@ struct NVGPUMBarrierArriveExpectTxLowering
                                 adaptor.getMbarId(), rewriter);
     Value txcount = truncToI32(b, adaptor.getTxcount());
     rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxOp>(
-        op, barrier, txcount, adaptor.getPredicate());
+        op, Type{},              // return-value is optional and is void by default
+        barrier, txcount,        // barrier and txcount
+        NVVM::MemScopeKind::CTA, // default scope is CTA
+        false,                   // relaxed-semantics is false
+        adaptor.getPredicate());
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index d3c305555fde8..1dcd4244c014c 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -274,6 +274,34 @@ LogicalResult MBarrierArriveDropOp::verify() {
                                       getRes());
 }
 
+LogicalResult MBarrierArriveExpectTxOp::verify() {
+  // The inline-PTX version of this Op does not support all features.
+  // With a predicate, this Op lowers to inline PTX. So, verify and
+  // error out if there are unsupported features.
+  if (getPredicate()) {
+    if (getScope() != NVVM::MemScopeKind::CTA)
+      return emitError("mbarrier scope must be CTA when using predicate");
+
+    if (isPtrInSharedClusterSpace(getAddr()))
+      return emitError("mbarrier in shared_cluster space is not supported when "
+                       "using predicate");
+
+    if (getRes())
+      return emitError("return-value is not supported when using predicate");
+
+    if (getRelaxed())
+      return emitError("mbarrier with relaxed semantics is not supported when "
+                       "using predicate");
+  }
+  return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
+                                    getRes());
+}
+
+LogicalResult MBarrierArriveDropExpectTxOp::verify() {
+  return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
+                                    getRes());
+}
+
 LogicalResult MBarrierExpectTxOp::verify() {
   return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope());
 }
@@ -2576,6 +2604,87 @@ mlir::NVVM::IDArgPair MBarrierArriveDropOp::getIntrinsicIDAndArgs(
   return {id, {mbar, count}};
 }
 
+bool MBarrierArriveExpectTxOp::getAsmValues(
+    RewriterBase &rewriter,
+    llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>>
+        &asmValues) {
+  // Add all the operands but not the attrs to the asmValues list.
+  // The attrs here are used to generate the right variants for
+  // intrinsics-lowering. So, we ignore them while generating inline-PTX.
+  for (auto val : getOperands())
+    asmValues.push_back({val, mlir::NVVM::PTXRegisterMod::Read});
+
+  return false;
+}
+
+mlir::NVVM::IDArgPair MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::MBarrierArriveExpectTxOp>(op);
+
+  bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
+  bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
+  // bit-0: Space
+  // bit-1: Scope
+  size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 
1 : 0);
+
+  // clang-format off
+  static constexpr llvm::Intrinsic::ID IDs[] = {
+    llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cta_space_cta,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cta_space_cluster,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cluster_space_cta,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cluster_space_cluster};
+  static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
+    llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cta_space_cta,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cta_space_cluster,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cluster_space_cta,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cluster_space_cluster};
+  // clang-format on
+  auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
+
+  // Set up the intrinsic args.
+  llvm::Value *txcount = mt.lookupValue(thisOp.getTxcount());
+  llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
+  bool needCast = isPtrInGenericSpace(thisOp.getAddr());
+  if (needCast)
+    mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
+
+  return {id, {mbar, txcount}};
+}
+
+mlir::NVVM::IDArgPair MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::MBarrierArriveDropExpectTxOp>(op);
+
+  bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
+  bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
+  // bit-0: Space
+  // bit-1: Scope
+  size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
+
+  // clang-format off
+  static constexpr llvm::Intrinsic::ID IDs[] = {
+    llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cta_space_cta,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cta_space_cluster,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cluster_space_cta,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cluster_space_cluster};
+  static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
+    llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cta_space_cta,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cta_space_cluster,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cluster_space_cta,
+    llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cluster_space_cluster};
+  // clang-format on
+  auto id = thisOp.getRelaxed() ? 
relaxedIDs[index] : IDs[index];
+
+  // Set up the intrinsic args.
+  llvm::Value *txcount = mt.lookupValue(thisOp.getTxcount());
+  llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
+  bool needCast = isPtrInGenericSpace(thisOp.getAddr());
+  if (needCast)
+    mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
+
+  return {id, {mbar, txcount}};
+}
+
 mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
     Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
   auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op);
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index a94fcb4856db4..fbf8d9efb3bc7 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -16,8 +16,6 @@ llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %cou
 
 // CHECK-LABEL: @init_mbarrier_arrive_expect_tx
 llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
-  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32
   //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
   nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1
   llvm.return
@@ -25,8 +23,6 @@ llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i3
 
 // CHECK-LABEL: @init_mbarrier_arrive_expect_tx_generic
 llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32, %pred : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r"
-  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32
   // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r,b"
   nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr, i32, i1
   llvm.return
@@ -544,8 +540,8 @@ func.func @elect_one_leader_sync() {
 
 // -----
 
-// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
-llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
+// CHECK-LABEL: @test_nvvm_prefetch
+llvm.func @test_nvvm_prefetch(%desc : !llvm.ptr, %pred : i1) {
   //CHECK: nvvm.prefetch tensormap, %{{.*}}
   nvvm.prefetch tensormap, %desc : !llvm.ptr
   //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
diff --git a/mlir/test/Target/LLVMIR/nvvm/mbar_arr_drop_expect_tx.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_arr_drop_expect_tx.mlir new file mode 100644 index 0000000000000..4b3cafec08a39 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_arr_drop_expect_tx.mlir @@ -0,0 +1,68 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @mbarrier_arrive_drop_expect_tx_generic(%barrier: !llvm.ptr, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_generic(ptr %0, i32 %1) {
+  // CHECK-NEXT: %3 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %3, i32 %1)
+  // CHECK-NEXT: %5 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %5, i32 %1)
+  // CHECK-NEXT: %7 = addrspacecast ptr %0 to ptr 
addrspace(3)
+  // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %7, i32 %1)
+  // CHECK-NEXT: %9 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %9, i32 %1)
+  // CHECK-NEXT: %11 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %12 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %11, i32 %1)
+  // CHECK-NEXT: %13 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %14 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %13, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr, i32 -> i64
+  %1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr, i32 -> i64
+  %2 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr, i32 -> i64
+
+  %3 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr, i32 -> i64
+  %4 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr, i32 -> i64
+  %5 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr, i32 -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_drop_expect_tx_shared(%barrier: !llvm.ptr<3>, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_shared(ptr addrspace(3) %0, i32 %1) {
+  // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %5 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %7 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32 -> i64
+  %1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3>, i32 -> i64
+  %2 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64
+
+  %3 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  %4 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  %5 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_drop_expect_tx_shared_cluster(%barrier: !llvm.ptr<7>, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_shared_cluster(ptr addrspace(7) %0, i32 %1) {
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void 
@llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32
+
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<7>, i32
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/mbar_arr_expect_tx.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_arr_expect_tx.mlir new file mode 100644 index 0000000000000..b5389bdd30267 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_arr_expect_tx.mlir @@ -0,0 +1,68 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @mbarrier_arrive_expect_tx_generic(%barrier: !llvm.ptr, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_expect_tx_generic(ptr %0, i32 %1) {
+  // CHECK-NEXT: %3 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cta(ptr addrspace(3) %3, i32 %1)
+  // CHECK-NEXT: %5 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cta(ptr addrspace(3) %5, i32 %1)
+  // CHECK-NEXT: %7 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %7, i32 %1)
+  // CHECK-NEXT: %9 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %9, i32 %1)
+  // CHECK-NEXT: %11 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %12 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %11, i32 %1)
+  // CHECK-NEXT: %13 = addrspacecast ptr %0 to ptr addrspace(3)
+  // CHECK-NEXT: %14 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %13, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32 -> i64
+  %1 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr, i32 -> i64
+  %2 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr, i32 -> i64
+
+  %3 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr, i32 -> i64
+  %4 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr, i32 -> i64
+  %5 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = 
#nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr, i32 -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_expect_tx_shared(%barrier: !llvm.ptr<3>, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_expect_tx_shared(ptr addrspace(3) %0, i32 %1) {
+  // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %5 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %7 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32 -> i64
+  %1 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3>, i32 -> i64
+  %2 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64
+
+  %3 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  %4 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  %5 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_expect_tx_shared_cluster(%barrier: !llvm.ptr<7>, %txcount : i32) {
+  // CHECK-LABEL: define void @mbarrier_arrive_expect_tx_shared_cluster(ptr addrspace(7) %0, i32 %1) {
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32
+
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<7>, i32
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<7>, i32
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/mbar_invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_invalid.mlir index 4ad76248b7e25..2bb90943d4ce1 100644 --- a/mlir/test/Target/LLVMIR/nvvm/mbar_invalid.mlir +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_invalid.mlir 
@@ -47,3 +47,68 @@ llvm.func @mbarrier_complete_tx_scope(%barrier: !llvm.ptr<3>, %tx_count: i32) {
   nvvm.mbarrier.complete_tx %barrier, %tx_count {scope = #nvvm.mem_scope<sys>} : !llvm.ptr<3>, i32
   llvm.return
 }
+
+// -----
+
+llvm.func @mbarrier_arr_expect_tx(%barrier: !llvm.ptr<3>, %tx_count: i32) {
+  // expected-error @below {{mbarrier scope must be either CTA or Cluster}}
+  %1 = nvvm.mbarrier.arrive.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<sys>} : !llvm.ptr<3>, i32 -> i64
+  llvm.return
+}
+
+// -----
+
+llvm.func @mbarrier_arr_expect_tx_cluster(%barrier: !llvm.ptr<7>, %tx_count: i32) {
+  // expected-error @below {{mbarrier in shared_cluster space cannot return any value}}
+  %1 = nvvm.mbarrier.arrive.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32 -> i64
+  llvm.return
+}
+
+// -----
+
+llvm.func @init_mbarrier_arrive_expect_tx_asm_ret(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
+  // expected-error @below {{return-value is not supported when using predicate}}
+  %1 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1 -> i64
+  llvm.return
+}
+
+// -----
+
+llvm.func @init_mbarrier_arrive_expect_tx_asm_relaxed(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
+  // expected-error @below {{mbarrier with relaxed semantics is not supported when using predicate}}
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred {relaxed = true} : !llvm.ptr<3>, i32, i1
+  llvm.return
+}
+
+// -----
+
+llvm.func @init_mbarrier_arrive_expect_tx_asm_cta(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
+  // expected-error @below {{mbarrier scope must be CTA when using predicate}}
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32, i1
+  llvm.return
+}
+
+// -----
+
+llvm.func @init_mbarrier_arrive_expect_tx_asm_cluster(%barrier : !llvm.ptr<7>, %txcount : i32, %pred : i1) {
+  // expected-error @below {{mbarrier in shared_cluster space is not supported when using predicate}}
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<7>, i32, i1
+  llvm.return
+}
+
+// -----
+
+llvm.func @mbarrier_arr_drop_expect_tx(%barrier: !llvm.ptr<3>, %tx_count: i32) {
+  // expected-error @below {{mbarrier scope must be either CTA or Cluster}}
+  %1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<sys>} : !llvm.ptr<3>, i32 -> i64
+  llvm.return
+}
+
+// -----
+
+llvm.func @mbarrier_arr_drop_expect_tx_cluster(%barrier: !llvm.ptr<7>, %tx_count: i32) {
+  // expected-error @below {{mbarrier in shared_cluster space cannot return any value}}
+  %1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32 -> i64
+  llvm.return
+}
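
Reviewer note: below is a minimal usage sketch of the updated `nvvm.mbarrier.arrive.expect_tx` op, assembled from the tests in this patch (the function and value names are illustrative). The optional `i64` result is the mbarrier state token; it is unavailable both for `shared_cluster` (addrspace 7) pointers, where the underlying intrinsic returns void, and for the predicated form, which lowers to inline PTX:

llvm.func @arrive_expect_tx_example(%bar : !llvm.ptr<3>, %tx : i32, %pred : i1) {
  // Defaults: CTA scope, non-relaxed; returns the mbarrier state token.
  %s0 = nvvm.mbarrier.arrive.expect_tx %bar, %tx : !llvm.ptr<3>, i32 -> i64
  // Cluster scope with relaxed semantics; selects the *.relaxed.scope.cluster.* intrinsic.
  %s1 = nvvm.mbarrier.arrive.expect_tx %bar, %tx {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
  // Predicated form: lowers to inline PTX; CTA scope only, no result, no relaxed semantics.
  nvvm.mbarrier.arrive.expect_tx %bar, %tx, predicate = %pred : !llvm.ptr<3>, i32, i1
  llvm.return
}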
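
Likewise, a sketch for the new `nvvm.mbarrier.arrive_drop.expect_tx` op (names again illustrative), which follows the same scope/relaxed/result rules but has no predicated inline-PTX form:

llvm.func @arrive_drop_expect_tx_example(%bar : !llvm.ptr<3>, %bar_cluster : !llvm.ptr<7>, %tx : i32) {
  // shared::cta pointer: may return the i64 state token.
  %s0 = nvvm.mbarrier.arrive_drop.expect_tx %bar, %tx {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64
  // shared::cluster pointer: must not produce a result (the intrinsic returns void).
  nvvm.mbarrier.arrive_drop.expect_tx %bar_cluster, %tx {relaxed = true} : !llvm.ptr<7>, i32
  llvm.return
}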