Skip to content

Commit 2d3e3b5

Browse files
committed
[MLIR][NVVM] Update mbarrier.arrive.* Op
This patch updates the mbarrier.arrive.* family of Ops to include all features added up to Blackwell. * Update the `mbarrier.arrive` Op to include shared_cluster memory space, cta/cluster scope and an option to lower using relaxed semantics. * An `arrive_drop` variant is added for both the `arrive` and `arrive.nocomplete` operations. * Verifier checks are added wherever appropriate. * lit tests are added to verify the lowering to the intrinsics. TODO: * Updates for the remaining mbarrier family will be done in subsequent PRs (mainly expect/complete-tx, arrive.expect-tx, and {test/try}waits). Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
1 parent d2c7c60 commit 2d3e3b5

File tree

6 files changed

+477
-57
lines changed

6 files changed

+477
-57
lines changed

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -638,9 +638,7 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
638638
}];
639639
}
640640

641-
def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
642-
Results<(outs I64:$res)>,
643-
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> {
641+
def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive"> {
644642
let summary = "MBarrier Arrive Operation";
645643
let description = [{
646644
The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the
@@ -652,19 +650,40 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
652650
with this release pattern.
653651

654652
This operation causes the executing thread to signal its arrival at the barrier.
655-
The operation returns an opaque value that captures the phase of the
656-
*mbarrier object* prior to the arrive-on operation. The contents of this state
657-
value are implementation-specific.
658653

659-
The operation takes the following operand:
654+
- `res`: When the `space` is not shared_cluster, this operation returns an
655+
opaque 64-bit value capturing the phase of the *mbarrier object* prior to
656+
the arrive-on operation. The contents of this return value are
657+
implementation-specific. An *mbarrier object* located in the shared_cluster
658+
space cannot return a value.
659+
660+
The operation takes the following operands:
660661
- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
661-
must be a pointer to generic or shared::cta memory. When it is generic, the
662-
underlying address must be within the shared::cta memory space; otherwise
663-
the behavior is undefined.
662+
must be a pointer to generic or shared_cta or shared_cluster memory. When it
663+
is generic, the underlying address must be within the shared_cta memory space;
664+
otherwise the behavior is undefined.
665+
- `count`: This specifies the amount by which the pending arrival count is
666+
decremented. If the `count` argument is not specified, the pending arrival
667+
count is decremented by 1.
668+
- `scope`: This specifies the set of threads that directly observe the memory
669+
synchronizing effect of the `mbarrier.arrive` operation.
670+
- `space`: This indicates the memory space where the mbarrier object resides.
671+
- `relaxed`: When set to true, the `arrive` operation has relaxed memory semantics
672+
and does not provide any ordering or visibility guarantees.
664673

665674
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
666675
}];
667-
let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)";
676+
677+
let results = (outs Optional<I64>:$res);
678+
let arguments = (ins
679+
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
680+
Optional<I32>:$count,
681+
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
682+
DefaultValuedAttr<BoolAttr, "false">:$relaxed);
683+
684+
let assemblyFormat = "$addr (`,` $count^)? attr-dict `:` type($addr) (`->` type($res)^)?";
685+
686+
let hasVerifier = 1;
668687

669688
let extraClassDeclaration = [{
670689
static mlir::NVVM::IDArgPair
@@ -675,7 +694,54 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
675694
string llvmBuilder = [{
676695
auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs(
677696
*op, moduleTranslation, builder);
678-
$res = createIntrinsicCall(builder, id, args);
697+
698+
int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
699+
if (addrSpace != static_cast<unsigned>(NVVM::NVVMMemorySpace::SharedCluster))
700+
$res = createIntrinsicCall(builder, id, args);
701+
else
702+
createIntrinsicCall(builder, id, args);
703+
}];
704+
}
705+
706+
def NVVM_MBarrierArriveDropOp : NVVM_Op<"mbarrier.arrive_drop"> {
707+
let summary = "MBarrier Arrive-Drop Operation";
708+
let description = [{
709+
The `nvvm.mbarrier.arrive_drop` operation decrements the expected arrival
710+
count of the *mbarrier object* by `count` and then performs an arrive-on
711+
operation. When `count` is not specified, it defaults to 1. The decrement
712+
of the expected arrival count applies to all the subsequent phases of the
713+
*mbarrier object*. The remaining semantics are identical to those of the
714+
`nvvm.mbarrier.arrive` operation.
715+
716+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
717+
}];
718+
719+
let results = (outs Optional<I64>:$res);
720+
let arguments = (ins
721+
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
722+
Optional<I32>:$count,
723+
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
724+
DefaultValuedAttr<BoolAttr, "false">:$relaxed);
725+
726+
let assemblyFormat = "$addr (`,` $count^)? attr-dict `:` type($addr) (`->` type($res)^)?";
727+
728+
let hasVerifier = 1;
729+
730+
let extraClassDeclaration = [{
731+
static mlir::NVVM::IDArgPair
732+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
733+
llvm::IRBuilderBase& builder);
734+
}];
735+
736+
string llvmBuilder = [{
737+
auto [id, args] = NVVM::MBarrierArriveDropOp::getIntrinsicIDAndArgs(
738+
*op, moduleTranslation, builder);
739+
740+
int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
741+
if (addrSpace != static_cast<unsigned>(NVVM::NVVMMemorySpace::SharedCluster))
742+
$res = createIntrinsicCall(builder, id, args);
743+
else
744+
createIntrinsicCall(builder, id, args);
679745
}];
680746
}
681747

@@ -725,6 +791,35 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
725791
}];
726792
}
727793

794+
def NVVM_MBarrierArriveDropNocompleteOp : NVVM_Op<"mbarrier.arrive_drop.nocomplete">,
795+
Results<(outs I64:$res)>,
796+
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
797+
I32:$count)> {
798+
let summary = "MBarrier Arrive-Drop No-Complete Operation";
799+
let description = [{
800+
The `nvvm.mbarrier.arrive_drop.nocomplete` operation decrements the expected
801+
arrival count of the *mbarrier object* by the amount `count` and then performs
802+
an arrive-on operation on the *mbarrier object* with the guarantee that it
803+
will not cause the barrier to complete its current phase.
804+
805+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
806+
}];
807+
808+
let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";
809+
810+
let extraClassDeclaration = [{
811+
static mlir::NVVM::IDArgPair
812+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
813+
llvm::IRBuilderBase& builder);
814+
}];
815+
816+
string llvmBuilder = [{
817+
auto [id, args] = NVVM::MBarrierArriveDropNocompleteOp::getIntrinsicIDAndArgs(
818+
*op, moduleTranslation, builder);
819+
$res = createIntrinsicCall(builder, id, args);
820+
}];
821+
}
822+
728823
def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
729824
Arguments<(ins
730825
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,26 @@ static bool isPtrInAddrSpace(mlir::Value ptr, NVVMMemorySpace targetAS) {
5757
return ptrTy.getAddressSpace() == static_cast<unsigned>(targetAS);
5858
}
5959

60+
static bool isPtrInGenericSpace(mlir::Value ptr) {
61+
return isPtrInAddrSpace(ptr, NVVMMemorySpace::Generic);
62+
}
63+
6064
static bool isPtrInSharedCTASpace(mlir::Value ptr) {
6165
return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared);
6266
}
6367

68+
static bool isPtrInSharedClusterSpace(mlir::Value ptr) {
69+
return isPtrInAddrSpace(ptr, NVVMMemorySpace::SharedCluster);
70+
}
71+
72+
static llvm::Value *castPtrToAddrSpace(llvm::IRBuilderBase &builder,
73+
llvm::Value *ptr,
74+
NVVMMemorySpace targetAS) {
75+
unsigned AS = static_cast<unsigned>(targetAS);
76+
return builder.CreateAddrSpaceCast(
77+
ptr, llvm::PointerType::get(builder.getContext(), AS));
78+
}
79+
6480
// Helper method to convert CtaGroupKind in NVVM Dialect to CtaGroupKind in LLVM
6581
static llvm::nvvm::CTAGroupKind
6682
getNVVMCtaGroupKind(NVVM::CTAGroupKind ctaGroup) {
@@ -233,6 +249,32 @@ LogicalResult CpAsyncBulkGlobalToSharedClusterOp::verify() {
233249
return success();
234250
}
235251

252+
static LogicalResult verifyMBarrierArriveLikeOp(Operation *op, Value addr,
253+
NVVM::MemScopeKind scope,
254+
Value retVal) {
255+
bool isSharedCluster = isPtrInSharedClusterSpace(addr);
256+
bool hasRetValue = static_cast<bool>(retVal);
257+
258+
if (scope != NVVM::MemScopeKind::CTA && scope != NVVM::MemScopeKind::CLUSTER)
259+
return op->emitError("mbarrier scope must be either CTA or Cluster");
260+
261+
if (isSharedCluster && hasRetValue)
262+
return op->emitError(
263+
"mbarrier in shared_cluster space cannot return any value");
264+
265+
return success();
266+
}
267+
268+
LogicalResult MBarrierArriveOp::verify() {
269+
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
270+
getRes());
271+
}
272+
273+
LogicalResult MBarrierArriveDropOp::verify() {
274+
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
275+
getRes());
276+
}
277+
236278
LogicalResult ConvertFloatToTF32Op::verify() {
237279
using RndMode = NVVM::FPRoundingMode;
238280
switch (getRnd()) {
@@ -1877,12 +1919,81 @@ mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs(
18771919
mlir::NVVM::IDArgPair MBarrierArriveOp::getIntrinsicIDAndArgs(
18781920
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
18791921
auto thisOp = cast<NVVM::MBarrierArriveOp>(op);
1880-
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
1881-
llvm::Intrinsic::ID id = isShared
1882-
? llvm::Intrinsic::nvvm_mbarrier_arrive_shared
1883-
: llvm::Intrinsic::nvvm_mbarrier_arrive;
18841922

1885-
return {id, {mt.lookupValue(thisOp.getAddr())}};
1923+
bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
1924+
bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
1925+
// bit-0: Space
1926+
// bit-1: Scope
1927+
size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
1928+
1929+
static constexpr llvm::Intrinsic::ID IDs[] = {
1930+
llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cta_space_cta,
1931+
llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cta_space_cluster,
1932+
llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cluster_space_cta,
1933+
llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cluster_space_cluster};
1934+
static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
1935+
llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cta_space_cta,
1936+
llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cta_space_cluster,
1937+
llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cluster_space_cta,
1938+
llvm::Intrinsic::
1939+
nvvm_mbarrier_arrive_relaxed_scope_cluster_space_cluster};
1940+
auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
1941+
1942+
// Tidy-up the Intrinsic Args
1943+
bool needCast = isPtrInGenericSpace(thisOp.getAddr());
1944+
llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
1945+
if (needCast)
1946+
mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
1947+
1948+
// When count is not explicitly specified, the default is 1.
1949+
llvm::LLVMContext &ctx = mt.getLLVMContext();
1950+
bool hasCount = static_cast<bool>(thisOp.getCount());
1951+
llvm::Value *count =
1952+
hasCount ? mt.lookupValue(thisOp.getCount())
1953+
: llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 1);
1954+
1955+
return {id, {mbar, count}};
1956+
}
1957+
1958+
mlir::NVVM::IDArgPair MBarrierArriveDropOp::getIntrinsicIDAndArgs(
1959+
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
1960+
auto thisOp = cast<NVVM::MBarrierArriveDropOp>(op);
1961+
1962+
bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
1963+
bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
1964+
// bit-0: Space
1965+
// bit-1: Scope
1966+
size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
1967+
1968+
static constexpr llvm::Intrinsic::ID IDs[] = {
1969+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cta_space_cta,
1970+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cta_space_cluster,
1971+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cluster_space_cta,
1972+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cluster_space_cluster};
1973+
static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
1974+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_relaxed_scope_cta_space_cta,
1975+
llvm::Intrinsic::
1976+
nvvm_mbarrier_arrive_drop_relaxed_scope_cta_space_cluster,
1977+
llvm::Intrinsic::
1978+
nvvm_mbarrier_arrive_drop_relaxed_scope_cluster_space_cta,
1979+
llvm::Intrinsic::
1980+
nvvm_mbarrier_arrive_drop_relaxed_scope_cluster_space_cluster};
1981+
auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
1982+
1983+
// Tidy-up the Intrinsic Args
1984+
bool needCast = isPtrInGenericSpace(thisOp.getAddr());
1985+
llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
1986+
if (needCast)
1987+
mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
1988+
1989+
// When count is not explicitly specified, the default is 1.
1990+
llvm::LLVMContext &ctx = mt.getLLVMContext();
1991+
bool hasCount = static_cast<bool>(thisOp.getCount());
1992+
llvm::Value *count =
1993+
hasCount ? mt.lookupValue(thisOp.getCount())
1994+
: llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 1);
1995+
1996+
return {id, {mbar, count}};
18861997
}
18871998

18881999
mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
@@ -1900,6 +2011,21 @@ mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
19002011
return {id, std::move(args)};
19012012
}
19022013

2014+
mlir::NVVM::IDArgPair MBarrierArriveDropNocompleteOp::getIntrinsicIDAndArgs(
2015+
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
2016+
auto thisOp = cast<NVVM::MBarrierArriveDropNocompleteOp>(op);
2017+
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
2018+
llvm::Intrinsic::ID id =
2019+
isShared ? llvm::Intrinsic::nvvm_mbarrier_arrive_drop_noComplete_shared
2020+
: llvm::Intrinsic::nvvm_mbarrier_arrive_drop_noComplete;
2021+
// Fill the Intrinsic Args
2022+
llvm::SmallVector<llvm::Value *> args;
2023+
args.push_back(mt.lookupValue(thisOp.getAddr()));
2024+
args.push_back(mt.lookupValue(thisOp.getCount()));
2025+
2026+
return {id, std::move(args)};
2027+
}
2028+
19032029
mlir::NVVM::IDArgPair MBarrierTestWaitOp::getIntrinsicIDAndArgs(
19042030
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
19052031
auto thisOp = cast<NVVM::MBarrierTestWaitOp>(op);

0 commit comments

Comments
 (0)