96 changes: 84 additions & 12 deletions mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -889,10 +889,7 @@ def NVVM_MBarrierArriveDropNocompleteOp : NVVM_Op<"mbarrier.arrive_drop.nocomple
}];
}

-def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
-Arguments<(ins
-AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
-I32:$txcount, PtxPredicate:$predicate)> {
+def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx"> {
let summary = "MBarrier Arrive with Expected Transaction Count";
let description = [{
The `nvvm.mbarrier.arrive.expect_tx` operation performs an expect-tx operation
@@ -903,23 +900,98 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
threads within the CTA. When other threads perform corresponding acquire operations
(like 'mbarrier.test.wait'), they synchronize with this release pattern.

-This operation first performs an expect-tx operation with the specified transaction
-count, then performs an arrive-on operation with an implicit count of 1. The
-expect-tx operation increases the tx-count of the *mbarrier object* by the specified
-expectCount value, setting the current phase to expect and tracks the completion
-of additional asynchronous transactions.
+This operation first performs an expect-tx operation with the specified transaction
+count, then performs an arrive-on operation with an implicit count of 1. The
+expect-tx operation increases the expect-count of the *mbarrier object* by the
+specified value (i.e. `txcount`), setting the current phase to expect and tracking
+the completion of additional asynchronous transactions.

The operation takes the following operands:
- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
addressing, but the address must still be in the shared memory space.
- `txcount`: An unsigned integer specifying the expected transaction count
for the expect-tx operation. This represents the number of asynchronous transactions
expected to complete before the barrier phase completes.
-- `predicate`: Optional predicate for conditional execution.
+- `scope`: This specifies the set of threads that directly observe the memory
+  synchronizing effect of the `mbarrier.test.wait` operation.
+- `relaxed`: When set to true, the `arrive` operation has relaxed memory semantics
+  and does not provide any ordering or visibility guarantees.
+- `predicate`: Optional predicate for conditional execution; used only when
+  lowering to inline PTX.

-[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
+[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
}];

let results = (outs Optional<I64>:$res);
let arguments = (ins
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
I32:$txcount,
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
DefaultValuedAttr<BoolAttr, "false">:$relaxed,
PtxPredicate:$predicate);

-let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands) (`->` type($res)^)?";
let hasVerifier = 1;

let extraClassDeclaration = [{
bool hasIntrinsic() { return !getPredicate(); }

bool getAsmValues(RewriterBase &rewriter,
llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues);

static mlir::NVVM::IDArgPair
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
llvm::IRBuilderBase& builder);
}];

string llvmBuilder = [{
auto [id, args] = NVVM::MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
*op, moduleTranslation, builder);

int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
if (addrSpace != NVVM::NVVMMemorySpace::SharedCluster)
$res = createIntrinsicCall(builder, id, args);
else
createIntrinsicCall(builder, id, args);
}];
}
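A minimal usage sketch (value names such as %mbar and %pred are hypothetical): the default form targets a CTA-scope mbarrier, attributes select the cluster-scope and relaxed variants, and the predicated form forces the inline-PTX lowering.

  // Default: CTA scope; the optional i64 mbarrier state is returned.
  %r0 = nvvm.mbarrier.arrive.expect_tx %mbar, %txcount : !llvm.ptr<3>, i32 -> i64
  // Cluster scope with relaxed semantics; no result requested.
  nvvm.mbarrier.arrive.expect_tx %mbar, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32
  // Predicated: lowers to inline PTX of the form "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;".
  nvvm.mbarrier.arrive.expect_tx %mbar, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1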

def NVVM_MBarrierArriveDropExpectTxOp : NVVM_Op<"mbarrier.arrive_drop.expect_tx"> {
let summary = "MBarrier arrive_drop with expected transaction count";
let description = [{
The `nvvm.mbarrier.arrive_drop.expect_tx` operation is similar to the
`nvvm.mbarrier.arrive.expect_tx` operation except that it performs an
`arrive_drop` operation instead of only an `arrive` operation.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
}];

let results = (outs Optional<I64>:$res);
let arguments = (ins
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
I32:$txcount,
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
DefaultValuedAttr<BoolAttr, "false">:$relaxed);

let assemblyFormat = "$addr `,` $txcount attr-dict `:` type(operands) (`->` type($res)^)?";
let hasVerifier = 1;

let extraClassDeclaration = [{
static mlir::NVVM::IDArgPair
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
llvm::IRBuilderBase& builder);
}];

string llvmBuilder = [{
auto [id, args] = NVVM::MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
*op, moduleTranslation, builder);

int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
if (addrSpace != NVVM::NVVMMemorySpace::SharedCluster)
$res = createIntrinsicCall(builder, id, args);
else
createIntrinsicCall(builder, id, args);
}];
}
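Analogous usage for the drop variant, mirroring the tests added below (value names hypothetical):

  %state = nvvm.mbarrier.arrive_drop.expect_tx %mbar, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64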

def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,
6 changes: 5 additions & 1 deletion mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -923,7 +923,11 @@ struct NVGPUMBarrierArriveExpectTxLowering
adaptor.getMbarId(), rewriter);
Value txcount = truncToI32(b, adaptor.getTxcount());
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxOp>(
-op, barrier, txcount, adaptor.getPredicate());
+op, Type{}, // return-value is optional and is void by default
+barrier, txcount, // barrier and txcount
+NVVM::MemScopeKind::CTA, // default scope is CTA
+false, // relaxed-semantics is false
+adaptor.getPredicate());
return success();
}
};
109 changes: 109 additions & 0 deletions mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -274,6 +274,34 @@ LogicalResult MBarrierArriveDropOp::verify() {
getRes());
}

LogicalResult MBarrierArriveExpectTxOp::verify() {
// The inline-PTX version of this op does not support all features. With a
// predicate, the op lowers to inline PTX, so verify here and error out on
// any unsupported feature.
if (getPredicate()) {
if (getScope() != NVVM::MemScopeKind::CTA)
return emitError("mbarrier scope must be CTA when using predicate");

if (isPtrInSharedClusterSpace(getAddr()))
return emitError("mbarrier in shared_cluster space is not supported when "
"using predicate");

if (getRes())
return emitError("return-value is not supported when using predicate");

if (getRelaxed())
return emitError("mbarrier with relaxed semantics is not supported when "
"using predicate");
}
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
getRes());
}
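As a sketch of IR the verifier rejects (value names hypothetical), a predicated op with a non-CTA scope trips the first check above:

  // error: mbarrier scope must be CTA when using predicate
  nvvm.mbarrier.arrive.expect_tx %mbar, %txcount, predicate = %pred {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32, i1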

LogicalResult MBarrierArriveDropExpectTxOp::verify() {
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
getRes());
}

LogicalResult MBarrierExpectTxOp::verify() {
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope());
}
@@ -2576,6 +2604,87 @@ mlir::NVVM::IDArgPair MBarrierArriveDropOp::getIntrinsicIDAndArgs(
return {id, {mbar, count}};
}

bool MBarrierArriveExpectTxOp::getAsmValues(
RewriterBase &rewriter,
llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>>
&asmValues) {
// Add all the operands, but not the attrs, to the asmValues list.
// The attrs are only used to select the right variant during intrinsic
// lowering, so they are ignored when generating inline PTX.
for (auto val : getOperands())
asmValues.push_back({val, mlir::NVVM::PTXRegisterMod::Read});

return false;
}

mlir::NVVM::IDArgPair MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierArriveExpectTxOp>(op);

bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
// bit-0: Space
// bit-1: Scope
size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
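// index -> intrinsic variant:
//   0b00 -> scope_cta,     space_cta
//   0b01 -> scope_cta,     space_cluster
//   0b10 -> scope_cluster, space_cta
//   0b11 -> scope_cluster, space_cluster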

// clang-format off
static constexpr llvm::Intrinsic::ID IDs[] = {
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cta_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cta_space_cluster,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cluster_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cluster_space_cluster};
static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cta_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cta_space_cluster,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cluster_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cluster_space_cluster};
// clang-format on
auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];

// Tidy-up the Intrinsic Args
llvm::Value *txcount = mt.lookupValue(thisOp.getTxcount());
llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
bool needCast = isPtrInGenericSpace(thisOp.getAddr());
if (needCast)
mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);

return {id, {mbar, txcount}};
}
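For a generic-space barrier at cluster scope, the translation above emits LLVM IR along these lines (a sketch; %barrier and %txcount are hypothetical):

  %m = addrspacecast ptr %barrier to ptr addrspace(3)
  %r = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %m, i32 %txcount)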

mlir::NVVM::IDArgPair MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierArriveDropExpectTxOp>(op);

bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
// bit-0: Space
// bit-1: Scope
size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);

// clang-format off
static constexpr llvm::Intrinsic::ID IDs[] = {
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cta_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cta_space_cluster,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cluster_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cluster_space_cluster};
static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cta_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cta_space_cluster,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cluster_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cluster_space_cluster};
// clang-format on
auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];

// Tidy-up the Intrinsic Args
llvm::Value *txcount = mt.lookupValue(thisOp.getTxcount());
llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
bool needCast = isPtrInGenericSpace(thisOp.getAddr());
if (needCast)
mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);

return {id, {mbar, txcount}};
}

mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op);
8 changes: 2 additions & 6 deletions mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -16,17 +16,13 @@ llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %cou

// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
-//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
-nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1
llvm.return
}

// CHECK-LABEL: @init_mbarrier_arrive_expect_tx_generic
llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32, %pred : i1) {
-// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r"
-nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r,b"
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr, i32, i1
llvm.return
@@ -544,8 +540,8 @@ func.func @elect_one_leader_sync() {

// -----

-// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
-llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
+// CHECK-LABEL: @test_nvvm_prefetch
+llvm.func @test_nvvm_prefetch(%desc : !llvm.ptr, %pred : i1) {
//CHECK: nvvm.prefetch tensormap, %{{.*}}
nvvm.prefetch tensormap, %desc : !llvm.ptr
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
68 changes: 68 additions & 0 deletions mlir/test/Target/LLVMIR/nvvm/mbar_arr_drop_expect_tx.mlir
@@ -0,0 +1,68 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

llvm.func @mbarrier_arrive_drop_expect_tx_generic(%barrier: !llvm.ptr, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_generic(ptr %0, i32 %1) {
// CHECK-NEXT: %3 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %3, i32 %1)
// CHECK-NEXT: %5 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %5, i32 %1)
// CHECK-NEXT: %7 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %7, i32 %1)
// CHECK-NEXT: %9 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %9, i32 %1)
// CHECK-NEXT: %11 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %12 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %11, i32 %1)
// CHECK-NEXT: %13 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %14 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %13, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
%0 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr, i32 -> i64
%1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr, i32 -> i64
%2 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr, i32 -> i64

%3 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr, i32 -> i64
%4 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr, i32 -> i64
%5 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr, i32 -> i64
llvm.return
}

llvm.func @mbarrier_arrive_drop_expect_tx_shared(%barrier: !llvm.ptr<3>, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_shared(ptr addrspace(3) %0, i32 %1) {
// CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %5 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %7 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
%0 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32 -> i64
%1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3>, i32 -> i64
%2 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64

%3 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<3>, i32 -> i64
%4 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
%5 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
llvm.return
}

llvm.func @mbarrier_arrive_drop_expect_tx_shared_cluster(%barrier: !llvm.ptr<7>, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_shared_cluster(ptr addrspace(7) %0, i32 %1) {
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32

nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<7>, i32
llvm.return
}
