From bd90caae947c8a17e6aedb15f2faa55cf63b65f1 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Thu, 20 Nov 2025 00:38:21 +0530 Subject: [PATCH] [MLIR][NVVM] Update mbarrier.arrive.* Op This patch updates the mbarrier.arrive.* family of Ops to include all features added up-to Blackwell. * Update the `mbarrier.arrive` Op to include shared_cluster memory space, cta/cluster scope and an option to lower using relaxed semantics. * An `arrive_drop` variant is added for both the `arrive` and `arrive.nocomplete` operations. * Verifier checks are added wherever appropriate. * lit tests are added to verify the lowering to the intrinsics. TODO: * Updates for the remaining mbarrier family will be done in subsequent PRs (mainly arrive.expect-tx and {test/try}waits). Signed-off-by: Durgadoss R --- .../Optimizer/Builder/CUDAIntrinsicCall.cpp | 2 +- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 188 +++++++++++++++-- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 191 +++++++++++++++++- mlir/test/Target/LLVMIR/nvvm/mbar_arrive.mlir | 103 ++++++++++ .../Target/LLVMIR/nvvm/mbar_arrive_drop.mlir | 103 ++++++++++ .../Target/LLVMIR/nvvm/mbar_complete_tx.mlir | 29 +++ .../Target/LLVMIR/nvvm/mbar_expect_tx.mlir | 29 +++ .../nvvm/{mbarriers.mlir => mbar_init.mlir} | 40 ---- .../test/Target/LLVMIR/nvvm/mbar_invalid.mlir | 49 +++++ 9 files changed, 676 insertions(+), 58 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/nvvm/mbar_arrive.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/mbar_arrive_drop.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/mbar_complete_tx.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/mbar_expect_tx.mlir rename mlir/test/Target/LLVMIR/nvvm/{mbarriers.mlir => mbar_init.mlir} (66%) create mode 100644 mlir/test/Target/LLVMIR/nvvm/mbar_invalid.mlir diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index f67129dfa6730..acb6c645b9c72 100644 --- 
a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -892,7 +892,7 @@ CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) - .getResult(); + .getResult(0); } // BARRIER_ARRIBVE_CNT diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 524b9f820f290..57cb9b139111d 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -638,9 +638,76 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, }]; } -def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, - Results<(outs I64:$res)>, - Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> { +def NVVM_MBarrierExpectTxOp : NVVM_Op<"mbarrier.expect_tx"> { + let summary = "MBarrier expect-tx Operation"; + let description = [{ + The `nvvm.mbarrier.expect_tx` operation increases the transaction count + of the mbarrier located at `addr` by `txcount` amount. The `scope` + specifies the set of threads that can directly observe the memory + synchronizing effect of the `mbarrier.expect_tx` operation. `CTA` + and `CLUSTER` are the only allowed values for `scope`. 
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx) + }]; + + let arguments = (ins + AnyTypeOf<[LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr, + I32:$txcount, + DefaultValuedAttr:$scope); + + let assemblyFormat = "$addr `,` $txcount attr-dict `:` type(operands)"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); + }]; + + string llvmBuilder = [{ + auto [id, args] = NVVM::MBarrierExpectTxOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); + }]; +} + +def NVVM_MBarrierCompleteTxOp : NVVM_Op<"mbarrier.complete_tx"> { + let summary = "MBarrier complete-tx Operation"; + let description = [{ + The `nvvm.mbarrier.complete_tx` operation decrements the transaction + count of the *mbarrier object* at `addr` by `txcount`. It also signals + the completion of asynchronous transactions that were tracked by the + current phase. The `scope` specifies the set of threads that can directly + observe the memory synchronizing effect of the `mbarrier.complete_tx` + operation. `CTA` and `CLUSTER` are the only allowed values for `scope`. 
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx) + }]; + + let arguments = (ins + AnyTypeOf<[LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr, + I32:$txcount, + DefaultValuedAttr:$scope); + + let assemblyFormat = "$addr `,` $txcount attr-dict `:` type(operands)"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); + }]; + + string llvmBuilder = [{ + auto [id, args] = NVVM::MBarrierCompleteTxOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); + }]; +} + +def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive"> { let summary = "MBarrier Arrive Operation"; let description = [{ The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the @@ -652,19 +719,40 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, with this release pattern. This operation causes the executing thread to signal its arrival at the barrier. - The operation returns an opaque value that captures the phase of the - *mbarrier object* prior to the arrive-on operation. The contents of this state - value are implementation-specific. - The operation takes the following operand: + - `res`: When the `space` is not shared_cluster, this operation returns an + opaque 64-bit value capturing the phase of the *mbarrier object* prior to + the arrive-on operation. The contents of this return value are + implementation-specific. An *mbarrier object* located in the shared_cluster + space cannot return a value. + + The operation takes the following operands: - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` - must be a pointer to generic or shared::cta memory. 
When it is generic, the - underlying address must be within the shared::cta memory space; otherwise - the behavior is undefined. + must be a pointer to generic or shared_cta or shared_cluster memory. When it + is generic, the underlying address must be within the shared_cta memory space; + otherwise the behavior is undefined. + - `count`: This specifies the amount by which the pending arrival count is + decremented. If the `count` argument is not specified, the pending arrival + count is decremented by 1. + - `scope`: This specifies the set of threads that directly observe the memory + synchronizing effect of the `mbarrier.arrive` operation. + - `space`: This indicates the memory space where the mbarrier object resides. + - `relaxed`: When set to true, the `arrive` operation has relaxed memory semantics + and does not provide any ordering or visibility guarantees. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)"; + + let results = (outs Optional:$res); + let arguments = (ins + AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr, + Optional:$count, + DefaultValuedAttr:$scope, + DefaultValuedAttr:$relaxed); + + let assemblyFormat = "$addr (`,` $count^)? 
attr-dict `:` type($addr) (`->` type($res)^)?"; + + let hasVerifier = 1; let extraClassDeclaration = [{ static mlir::NVVM::IDArgPair @@ -675,7 +763,54 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, string llvmBuilder = [{ auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs( *op, moduleTranslation, builder); - $res = createIntrinsicCall(builder, id, args); + + int addrSpace = llvm::cast(op.getAddr().getType()).getAddressSpace(); + if (addrSpace != static_cast(NVVM::NVVMMemorySpace::SharedCluster)) + $res = createIntrinsicCall(builder, id, args); + else + createIntrinsicCall(builder, id, args); + }]; +} + +def NVVM_MBarrierArriveDropOp : NVVM_Op<"mbarrier.arrive_drop"> { + let summary = "MBarrier Arrive-Drop Operation"; + let description = [{ + The `nvvm.mbarrier.arrive_drop` operation decrements the expected arrival + count of the *mbarrier object* by `count` and then performs an arrive-on + operation. When `count` is not specified, it defaults to 1. The decrement + of the expected arrival count applies to all the subsequent phases of the + *mbarrier object*. The remaining semantics are identical to those of the + `nvvm.mbarrier.arrive` operation. + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop) + }]; + + let results = (outs Optional:$res); + let arguments = (ins + AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr, + Optional:$count, + DefaultValuedAttr:$scope, + DefaultValuedAttr:$relaxed); + + let assemblyFormat = "$addr (`,` $count^)? 
attr-dict `:` type($addr) (`->` type($res)^)?"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); + }]; + + string llvmBuilder = [{ + auto [id, args] = NVVM::MBarrierArriveDropOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + + int addrSpace = llvm::cast(op.getAddr().getType()).getAddressSpace(); + if (addrSpace != static_cast(NVVM::NVVMMemorySpace::SharedCluster)) + $res = createIntrinsicCall(builder, id, args); + else + createIntrinsicCall(builder, id, args); }]; } @@ -725,6 +860,35 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, }]; } +def NVVM_MBarrierArriveDropNocompleteOp : NVVM_Op<"mbarrier.arrive_drop.nocomplete">, + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I32:$count)> { + let summary = "MBarrier Arrive-Drop No-Complete Operation"; + let description = [{ + The `nvvm.mbarrier.arrive_drop.nocomplete` operation decrements the expected + arrival count of the *mbarrier object* by the amount `count` and then performs + an arrive-on operation on the *mbarrier object* with the guarantee that it + will not cause the barrier to complete its current phase. 
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop) + }]; + + let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); + }]; + + string llvmBuilder = [{ + auto [id, args] = NVVM::MBarrierArriveDropNocompleteOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); + }]; +} + def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">, Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 0e620737109b8..e9949547aaea4 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -57,10 +57,26 @@ static bool isPtrInAddrSpace(mlir::Value ptr, NVVMMemorySpace targetAS) { return ptrTy.getAddressSpace() == static_cast(targetAS); } +static bool isPtrInGenericSpace(mlir::Value ptr) { + return isPtrInAddrSpace(ptr, NVVMMemorySpace::Generic); +} + static bool isPtrInSharedCTASpace(mlir::Value ptr) { return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared); } +static bool isPtrInSharedClusterSpace(mlir::Value ptr) { + return isPtrInAddrSpace(ptr, NVVMMemorySpace::SharedCluster); +} + +static llvm::Value *castPtrToAddrSpace(llvm::IRBuilderBase &builder, + llvm::Value *ptr, + NVVMMemorySpace targetAS) { + unsigned AS = static_cast(targetAS); + return builder.CreateAddrSpaceCast( + ptr, llvm::PointerType::get(builder.getContext(), AS)); +} + // Helper method to convert CtaGroupKind in NVVM Dialect to CtaGroupKind in LLVM static llvm::nvvm::CTAGroupKind getNVVMCtaGroupKind(NVVM::CTAGroupKind ctaGroup) { @@ -233,6 
+249,39 @@ LogicalResult CpAsyncBulkGlobalToSharedClusterOp::verify() { return success(); } +static LogicalResult verifyMBarrierArriveLikeOp(Operation *op, Value addr, + NVVM::MemScopeKind scope, + Value retVal = nullptr) { + bool isSharedCluster = isPtrInSharedClusterSpace(addr); + if (scope != NVVM::MemScopeKind::CTA && scope != NVVM::MemScopeKind::CLUSTER) + return op->emitError("mbarrier scope must be either CTA or Cluster"); + + bool hasRetValue = static_cast(retVal); + if (isSharedCluster && hasRetValue) + return op->emitError( + "mbarrier in shared_cluster space cannot return any value"); + + return success(); +} + +LogicalResult MBarrierArriveOp::verify() { + return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(), + getRes()); +} + +LogicalResult MBarrierArriveDropOp::verify() { + return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(), + getRes()); +} + +LogicalResult MBarrierExpectTxOp::verify() { + return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope()); +} + +LogicalResult MBarrierCompleteTxOp::verify() { + return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope()); +} + LogicalResult ConvertFloatToTF32Op::verify() { using RndMode = NVVM::FPRoundingMode; switch (getRnd()) { @@ -1874,15 +1923,132 @@ mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs( return {id, {mt.lookupValue(thisOp.getAddr())}}; } +mlir::NVVM::IDArgPair MBarrierExpectTxOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + + bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr()); + bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER; + // bit-0: Space + // bit-1: Scope + size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 
1 : 0); + + static constexpr llvm::Intrinsic::ID IDs[] = { + llvm::Intrinsic::nvvm_mbarrier_expect_tx_scope_cta_space_cta, + llvm::Intrinsic::nvvm_mbarrier_expect_tx_scope_cta_space_cluster, + llvm::Intrinsic::nvvm_mbarrier_expect_tx_scope_cluster_space_cta, + llvm::Intrinsic::nvvm_mbarrier_expect_tx_scope_cluster_space_cluster}; + + // Fill the Intrinsic Args + llvm::SmallVector args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getTxcount())); + + return {IDs[index], std::move(args)}; +} + +mlir::NVVM::IDArgPair MBarrierCompleteTxOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + + bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr()); + bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER; + // bit-0: Space + // bit-1: Scope + size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0); + + static constexpr llvm::Intrinsic::ID IDs[] = { + llvm::Intrinsic::nvvm_mbarrier_complete_tx_scope_cta_space_cta, + llvm::Intrinsic::nvvm_mbarrier_complete_tx_scope_cta_space_cluster, + llvm::Intrinsic::nvvm_mbarrier_complete_tx_scope_cluster_space_cta, + llvm::Intrinsic::nvvm_mbarrier_complete_tx_scope_cluster_space_cluster}; + + // Fill the Intrinsic Args + llvm::SmallVector args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getTxcount())); + + return {IDs[index], std::move(args)}; +} + mlir::NVVM::IDArgPair MBarrierArriveOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast(op); - bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); - llvm::Intrinsic::ID id = isShared - ? 
llvm::Intrinsic::nvvm_mbarrier_arrive_shared - : llvm::Intrinsic::nvvm_mbarrier_arrive; - return {id, {mt.lookupValue(thisOp.getAddr())}}; + bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr()); + bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER; + // bit-0: Space + // bit-1: Scope + size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0); + + static constexpr llvm::Intrinsic::ID IDs[] = { + llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cta_space_cta, + llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cta_space_cluster, + llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cluster_space_cta, + llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cluster_space_cluster}; + static constexpr llvm::Intrinsic::ID relaxedIDs[] = { + llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cta_space_cta, + llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cta_space_cluster, + llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cluster_space_cta, + llvm::Intrinsic:: + nvvm_mbarrier_arrive_relaxed_scope_cluster_space_cluster}; + auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index]; + + // Tidy-up the Intrinsic Args + bool needCast = isPtrInGenericSpace(thisOp.getAddr()); + llvm::Value *mbar = mt.lookupValue(thisOp.getAddr()); + if (needCast) + mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared); + + // When count is not explicitly specified, the default is 1. + llvm::LLVMContext &ctx = mt.getLLVMContext(); + bool hasCount = static_cast(thisOp.getCount()); + llvm::Value *count = + hasCount ? 
mt.lookupValue(thisOp.getCount()) + : llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 1); + + return {id, {mbar, count}}; +} + +mlir::NVVM::IDArgPair MBarrierArriveDropOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + + bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr()); + bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER; + // bit-0: Space + // bit-1: Scope + size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0); + + static constexpr llvm::Intrinsic::ID IDs[] = { + llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cta_space_cta, + llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cta_space_cluster, + llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cluster_space_cta, + llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cluster_space_cluster}; + static constexpr llvm::Intrinsic::ID relaxedIDs[] = { + llvm::Intrinsic::nvvm_mbarrier_arrive_drop_relaxed_scope_cta_space_cta, + llvm::Intrinsic:: + nvvm_mbarrier_arrive_drop_relaxed_scope_cta_space_cluster, + llvm::Intrinsic:: + nvvm_mbarrier_arrive_drop_relaxed_scope_cluster_space_cta, + llvm::Intrinsic:: + nvvm_mbarrier_arrive_drop_relaxed_scope_cluster_space_cluster}; + auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index]; + + // Tidy-up the Intrinsic Args + bool needCast = isPtrInGenericSpace(thisOp.getAddr()); + llvm::Value *mbar = mt.lookupValue(thisOp.getAddr()); + if (needCast) + mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared); + + // When count is not explicitly specified, the default is 1. + llvm::LLVMContext &ctx = mt.getLLVMContext(); + bool hasCount = static_cast(thisOp.getCount()); + llvm::Value *count = + hasCount ? 
mt.lookupValue(thisOp.getCount()) + : llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 1); + + return {id, {mbar, count}}; } mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( @@ -1900,6 +2066,21 @@ mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( return {id, std::move(args)}; } +mlir::NVVM::IDArgPair MBarrierArriveDropNocompleteOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = + isShared ? llvm::Intrinsic::nvvm_mbarrier_arrive_drop_noComplete_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive_drop_noComplete; + // Fill the Intrinsic Args + llvm::SmallVector args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getCount())); + + return {id, std::move(args)}; +} + mlir::NVVM::IDArgPair MBarrierTestWaitOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast(op); diff --git a/mlir/test/Target/LLVMIR/nvvm/mbar_arrive.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_arrive.mlir new file mode 100644 index 0000000000000..6e7e1636c1de5 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_arrive.mlir @@ -0,0 +1,103 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @mbarrier_arrive_generic(%barrier: !llvm.ptr, %count : i32) { + // CHECK-LABEL: define void @mbarrier_arrive_generic(ptr %0, i32 %1) { + // CHECK-NEXT: %3 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.scope.cta.space.cta(ptr addrspace(3) %3, i32 1) + // CHECK-NEXT: %5 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.scope.cta.space.cta(ptr addrspace(3) %5, i32 %1) + // CHECK-NEXT: %7 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %8 = call i64 
@llvm.nvvm.mbarrier.arrive.scope.cta.space.cta(ptr addrspace(3) %7, i32 %1) + // CHECK-NEXT: %9 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.scope.cluster.space.cta(ptr addrspace(3) %9, i32 %1) + // CHECK-NEXT: %11 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %12 = call i64 @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cta(ptr addrspace(3) %11, i32 1) + // CHECK-NEXT: %13 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %14 = call i64 @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cta(ptr addrspace(3) %13, i32 %1) + // CHECK-NEXT: %15 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %16 = call i64 @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cta(ptr addrspace(3) %15, i32 %1) + // CHECK-NEXT: %17 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %18 = call i64 @llvm.nvvm.mbarrier.arrive.relaxed.scope.cluster.space.cta(ptr addrspace(3) %17, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr -> i64 + %1 = nvvm.mbarrier.arrive %barrier, %count : !llvm.ptr -> i64 + %2 = nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope} : !llvm.ptr -> i64 + %3 = nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope} : !llvm.ptr -> i64 + + %4 = nvvm.mbarrier.arrive %barrier {relaxed = true} : !llvm.ptr -> i64 + %5 = nvvm.mbarrier.arrive %barrier, %count {relaxed = true} : !llvm.ptr -> i64 + %6 = nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope, relaxed = true} : !llvm.ptr -> i64 + %7 = nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope, relaxed = true} : !llvm.ptr -> i64 + llvm.return +} + +llvm.func @mbarrier_arrive_shared(%barrier: !llvm.ptr<3>, %count : i32) { + // CHECK-LABEL: define void @mbarrier_arrive_shared(ptr addrspace(3) %0, i32 %1) { + // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.scope.cta.space.cta(ptr addrspace(3) %0, i32 1) + // CHECK-NEXT: %4 = 
call i64 @llvm.nvvm.mbarrier.arrive.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %5 = call i64 @llvm.nvvm.mbarrier.arrive.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %7 = call i64 @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 1) + // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %9 = call i64 @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.relaxed.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64 + %1 = nvvm.mbarrier.arrive %barrier, %count : !llvm.ptr<3> -> i64 + %2 = nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope} : !llvm.ptr<3> -> i64 + %3 = nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope} : !llvm.ptr<3> -> i64 + + %4 = nvvm.mbarrier.arrive %barrier {relaxed = true} : !llvm.ptr<3> -> i64 + %5 = nvvm.mbarrier.arrive %barrier, %count {relaxed = true} : !llvm.ptr<3> -> i64 + %6 = nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope, relaxed = true} : !llvm.ptr<3> -> i64 + %7 = nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope, relaxed = true} : !llvm.ptr<3> -> i64 + llvm.return +} + +llvm.func @mbarrier_arrive_shared_cluster(%barrier: !llvm.ptr<7>, %count : i32) { + // CHECK-LABEL: define void @mbarrier_arrive_shared_cluster(ptr addrspace(7) %0, i32 %1) { + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.scope.cta.space.cluster(ptr addrspace(7) %0, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.scope.cta.space.cluster(ptr 
addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.relaxed.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.mbarrier.arrive %barrier : !llvm.ptr<7> + nvvm.mbarrier.arrive %barrier, %count : !llvm.ptr<7> + nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope} : !llvm.ptr<7> + nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope} : !llvm.ptr<7> + + nvvm.mbarrier.arrive %barrier {relaxed = true} : !llvm.ptr<7> + nvvm.mbarrier.arrive %barrier, %count {relaxed = true} : !llvm.ptr<7> + nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope, relaxed = true} : !llvm.ptr<7> + nvvm.mbarrier.arrive %barrier, %count {scope = #nvvm.mem_scope, relaxed = true} : !llvm.ptr<7> + llvm.return +} + +llvm.func @mbarrier_arrive_nocomplete(%barrier: !llvm.ptr) { + // CHECK-LABEL: define void @mbarrier_arrive_nocomplete(ptr %0) { + // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.noComplete(ptr %0, i32 %2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + %count = nvvm.read.ptx.sreg.ntid.x : i32 + %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr, i32 -> i64 + llvm.return +} + +llvm.func @mbarrier_arrive_nocomplete_shared(%barrier: !llvm.ptr<3>) { + // CHECK-LABEL: define void @mbarrier_arrive_nocomplete_shared(ptr addrspace(3) %0) { + // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-NEXT: %3 = call i64 
@llvm.nvvm.mbarrier.arrive.noComplete.shared(ptr addrspace(3) %0, i32 %2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + %count = nvvm.read.ptx.sreg.ntid.x : i32 + %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/mbar_arrive_drop.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_arrive_drop.mlir new file mode 100644 index 0000000000000..c345c5d69edad --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_arrive_drop.mlir @@ -0,0 +1,103 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @mbarrier_arrive_drop_generic(%barrier: !llvm.ptr, %count : i32) { + // CHECK-LABEL: define void @mbarrier_arrive_drop_generic(ptr %0, i32 %1) { + // CHECK-NEXT: %3 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cta(ptr addrspace(3) %3, i32 1) + // CHECK-NEXT: %5 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cta(ptr addrspace(3) %5, i32 %1) + // CHECK-NEXT: %7 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cta(ptr addrspace(3) %7, i32 %1) + // CHECK-NEXT: %9 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.drop.scope.cluster.space.cta(ptr addrspace(3) %9, i32 %1) + // CHECK-NEXT: %11 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %12 = call i64 @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cta(ptr addrspace(3) %11, i32 1) + // CHECK-NEXT: %13 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %14 = call i64 @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cta(ptr addrspace(3) %13, i32 %1) + // CHECK-NEXT: %15 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %16 = call i64 @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cta(ptr addrspace(3) %15, i32 
%1) + // CHECK-NEXT: %17 = addrspacecast ptr %0 to ptr addrspace(3) + // CHECK-NEXT: %18 = call i64 @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cluster.space.cta(ptr addrspace(3) %17, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + %0 = nvvm.mbarrier.arrive_drop %barrier : !llvm.ptr -> i64 + %1 = nvvm.mbarrier.arrive_drop %barrier, %count : !llvm.ptr -> i64 + %2 = nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope} : !llvm.ptr -> i64 + %3 = nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope} : !llvm.ptr -> i64 + + %4 = nvvm.mbarrier.arrive_drop %barrier {relaxed = true} : !llvm.ptr -> i64 + %5 = nvvm.mbarrier.arrive_drop %barrier, %count {relaxed = true} : !llvm.ptr -> i64 + %6 = nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope, relaxed = true} : !llvm.ptr -> i64 + %7 = nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope, relaxed = true} : !llvm.ptr -> i64 + llvm.return +} + +llvm.func @mbarrier_arrive_drop_shared(%barrier: !llvm.ptr<3>, %count : i32) { + // CHECK-LABEL: define void @mbarrier_arrive_drop_shared(ptr addrspace(3) %0, i32 %1) { + // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cta(ptr addrspace(3) %0, i32 1) + // CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %5 = call i64 @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %7 = call i64 @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 1) + // CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %9 = call i64 @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: %10 = call i64 
@llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + %0 = nvvm.mbarrier.arrive_drop %barrier : !llvm.ptr<3> -> i64 + %1 = nvvm.mbarrier.arrive_drop %barrier, %count : !llvm.ptr<3> -> i64 + %2 = nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3> -> i64 + %3 = nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3> -> i64 + + %4 = nvvm.mbarrier.arrive_drop %barrier {relaxed = true} : !llvm.ptr<3> -> i64 + %5 = nvvm.mbarrier.arrive_drop %barrier, %count {relaxed = true} : !llvm.ptr<3> -> i64 + %6 = nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<3> -> i64 + %7 = nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3> -> i64 + llvm.return +} + +llvm.func @mbarrier_arrive_drop_shared_cluster(%barrier: !llvm.ptr<7>, %count : i32) { + // CHECK-LABEL: define void @mbarrier_arrive_drop_shared_cluster(ptr addrspace(7) %0, i32 %1) { + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cluster(ptr addrspace(7) %0, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.relaxed.scope.cluster.space.cluster(ptr addrspace(7) 
%0, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.mbarrier.arrive_drop %barrier : !llvm.ptr<7> + nvvm.mbarrier.arrive_drop %barrier, %count : !llvm.ptr<7> + nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7> + nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7> + + nvvm.mbarrier.arrive_drop %barrier {relaxed = true} : !llvm.ptr<7> + nvvm.mbarrier.arrive_drop %barrier, %count {relaxed = true} : !llvm.ptr<7> + nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<7> + nvvm.mbarrier.arrive_drop %barrier, %count {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<7> + llvm.return +} + +llvm.func @mbarrier_arrive_drop_nocomplete(%barrier: !llvm.ptr) { + // CHECK-LABEL: define void @mbarrier_arrive_drop_nocomplete(ptr %0) { + // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(ptr %0, i32 %2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + %count = nvvm.read.ptx.sreg.ntid.x : i32 + %0 = nvvm.mbarrier.arrive_drop.nocomplete %barrier, %count : !llvm.ptr, i32 -> i64 + llvm.return +} + +llvm.func @mbarrier_arrive_drop_nocomplete_shared(%barrier: !llvm.ptr<3>) { + // CHECK-LABEL: define void @mbarrier_arrive_drop_nocomplete_shared(ptr addrspace(3) %0) { + // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(ptr addrspace(3) %0, i32 %2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + %count = nvvm.read.ptx.sreg.ntid.x : i32 + %0 = nvvm.mbarrier.arrive_drop.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/mbar_complete_tx.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_complete_tx.mlir new file mode 100644 index 0000000000000..99289fa03b22c --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_complete_tx.mlir @@ -0,0 +1,29 @@ 
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @mbarrier_complete_tx_shared(%barrier: !llvm.ptr<3>, %tx_count : i32) { + // CHECK-LABEL: define void @mbarrier_complete_tx_shared(ptr addrspace(3) %0, i32 %1) { + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.complete.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.complete.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.complete.tx.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.mbarrier.complete_tx %barrier, %tx_count : !llvm.ptr<3>, i32 + nvvm.mbarrier.complete_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3>, i32 + nvvm.mbarrier.complete_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 + + llvm.return +} + +llvm.func @mbarrier_complete_tx_shared_cluster(%barrier: !llvm.ptr<7>, %tx_count : i32) { + // CHECK-LABEL: define void @mbarrier_complete_tx_shared_cluster(ptr addrspace(7) %0, i32 %1) { + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.complete.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.complete.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.complete.tx.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.mbarrier.complete_tx %barrier, %tx_count : !llvm.ptr<7>, i32 + nvvm.mbarrier.complete_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32 + nvvm.mbarrier.complete_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32 + + llvm.return +} \ No newline at end of file diff --git a/mlir/test/Target/LLVMIR/nvvm/mbar_expect_tx.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_expect_tx.mlir new file mode 100644 index 0000000000000..dad7237e2f4cc --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_expect_tx.mlir 
@@ -0,0 +1,29 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @mbarrier_expect_tx_shared(%barrier: !llvm.ptr<3>, %tx_count : i32) { + // CHECK-LABEL: define void @mbarrier_expect_tx_shared(ptr addrspace(3) %0, i32 %1) { + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.mbarrier.expect_tx %barrier, %tx_count : !llvm.ptr<3>, i32 + nvvm.mbarrier.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3>, i32 + nvvm.mbarrier.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 + + llvm.return +} + +llvm.func @mbarrier_expect_tx_shared_cluster(%barrier: !llvm.ptr<7>, %tx_count : i32) { + // CHECK-LABEL: define void @mbarrier_expect_tx_shared_cluster(ptr addrspace(7) %0, i32 %1) { + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: call void @llvm.nvvm.mbarrier.expect.tx.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.mbarrier.expect_tx %barrier, %tx_count : !llvm.ptr<7>, i32 + nvvm.mbarrier.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32 + nvvm.mbarrier.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32 + + llvm.return +} \ No newline at end of file diff --git a/mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_init.mlir similarity index 66% rename from mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir rename to mlir/test/Target/LLVMIR/nvvm/mbar_init.mlir index 
9bb3b082777fd..ae9c7f29bc7a5 100644 --- a/mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_init.mlir @@ -55,46 +55,6 @@ llvm.func @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) { llvm.return } -llvm.func @mbarrier_arrive(%barrier: !llvm.ptr) { - // CHECK-LABEL: define void @mbarrier_arrive(ptr %0) { - // CHECK-NEXT: %2 = call i64 @llvm.nvvm.mbarrier.arrive(ptr %0) - // CHECK-NEXT: ret void - // CHECK-NEXT: } - %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr -> i64 - llvm.return -} - -llvm.func @mbarrier_arrive_shared(%barrier: !llvm.ptr<3>) { - // CHECK-LABEL: define void @mbarrier_arrive_shared(ptr addrspace(3) %0) { - // CHECK-NEXT: %2 = call i64 @llvm.nvvm.mbarrier.arrive.shared(ptr addrspace(3) %0) - // CHECK-NEXT: ret void - // CHECK-NEXT: } - %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64 - llvm.return -} - -llvm.func @mbarrier_arrive_nocomplete(%barrier: !llvm.ptr) { - // CHECK-LABEL: define void @mbarrier_arrive_nocomplete(ptr %0) { - // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.noComplete(ptr %0, i32 %2) - // CHECK-NEXT: ret void - // CHECK-NEXT: } - %count = nvvm.read.ptx.sreg.ntid.x : i32 - %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr, i32 -> i64 - llvm.return -} - -llvm.func @mbarrier_arrive_nocomplete_shared(%barrier: !llvm.ptr<3>) { - // CHECK-LABEL: define void @mbarrier_arrive_nocomplete_shared(ptr addrspace(3) %0) { - // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(ptr addrspace(3) %0, i32 %2) - // CHECK-NEXT: ret void - // CHECK-NEXT: } - %count = nvvm.read.ptx.sreg.ntid.x : i32 - %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64 - llvm.return -} - llvm.func @mbarrier_test_wait(%barrier: !llvm.ptr, %token : i64) -> i1 { // CHECK-LABEL: define i1 @mbarrier_test_wait(ptr %0, i64 %1) { // 
CHECK-NEXT: %3 = call i1 @llvm.nvvm.mbarrier.test.wait(ptr %0, i64 %1) diff --git a/mlir/test/Target/LLVMIR/nvvm/mbar_invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/mbar_invalid.mlir new file mode 100644 index 0000000000000..4ad76248b7e25 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/mbar_invalid.mlir @@ -0,0 +1,49 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +llvm.func @mbarrier_arrive_ret_check(%barrier: !llvm.ptr<7>) { + // expected-error @below {{mbarrier in shared_cluster space cannot return any value}} + %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<7> -> i64 + llvm.return +} + +// ----- + +llvm.func @mbarrier_arrive_invalid_scope(%barrier: !llvm.ptr<7>) { + // expected-error @below {{mbarrier scope must be either CTA or Cluster}} + %0 = nvvm.mbarrier.arrive %barrier {scope = #nvvm.mem_scope<sys>} : !llvm.ptr<7> -> i64 + llvm.return +} + +// ----- + +llvm.func @mbarrier_arrive_drop_ret_check(%barrier: !llvm.ptr<7>) { + // expected-error @below {{mbarrier in shared_cluster space cannot return any value}} + %0 = nvvm.mbarrier.arrive_drop %barrier : !llvm.ptr<7> -> i64 + llvm.return +} + +// ----- + +llvm.func @mbarrier_arrive_drop_invalid_scope(%barrier: !llvm.ptr<7>) { + // expected-error @below {{mbarrier scope must be either CTA or Cluster}} + %0 = nvvm.mbarrier.arrive_drop %barrier {scope = #nvvm.mem_scope<sys>} : !llvm.ptr<7> -> i64 + llvm.return +} + +// ----- + +llvm.func @mbarrier_expect_tx_scope(%barrier: !llvm.ptr<7>, %tx_count: i32) { + // expected-error @below {{mbarrier scope must be either CTA or Cluster}} + nvvm.mbarrier.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<sys>} : !llvm.ptr<7>, i32 + llvm.return +} + +// ----- + +llvm.func @mbarrier_complete_tx_scope(%barrier: !llvm.ptr<3>, %tx_count: i32) { + // expected-error @below {{mbarrier scope must be either CTA or Cluster}} + nvvm.mbarrier.complete_tx %barrier, %tx_count {scope = #nvvm.mem_scope<sys>} : !llvm.ptr<3>, i32 + llvm.return +}