Skip to content

Commit d992856

Browse files
committed
[MLIR][NVVM] Update mbarrier.arrive.* Op
This patch updates the mbarrier.arrive.* family of Ops to include all features added up to Blackwell. * Update the `mbarrier.arrive` Op to include shared_cluster memory space, cta/cluster scope and an option to lower using relaxed semantics. * An `arrive_drop` variant is added for both the `arrive` and `arrive.nocomplete` operations. * Verifier checks are added wherever appropriate. * lit tests are added to verify the lowering to the intrinsics. TODO: * Updates for the remaining mbarrier family will be done in subsequent PRs (mainly expect/complete-tx, arrive.expect-tx, and {test/try}waits). Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
1 parent 0c84643 commit d992856

File tree

6 files changed

+477
-57
lines changed

6 files changed

+477
-57
lines changed

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -716,9 +716,7 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
716716
}];
717717
}
718718

719-
def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
720-
Results<(outs I64:$res)>,
721-
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> {
719+
def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive"> {
722720
let summary = "MBarrier Arrive Operation";
723721
let description = [{
724722
The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the
@@ -730,19 +728,40 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
730728
with this release pattern.
731729

732730
This operation causes the executing thread to signal its arrival at the barrier.
733-
The operation returns an opaque value that captures the phase of the
734-
*mbarrier object* prior to the arrive-on operation. The contents of this state
735-
value are implementation-specific.
736731

737-
The operation takes the following operand:
732+
- `res`: When the `space` is not shared_cluster, this operation returns an
733+
opaque 64-bit value capturing the phase of the *mbarrier object* prior to
734+
the arrive-on operation. The contents of this return value are
735+
implementation-specific. An *mbarrier object* located in the shared_cluster
736+
space cannot return a value.
737+
738+
The operation takes the following operands:
738739
- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
739-
must be a pointer to generic or shared::cta memory. When it is generic, the
740-
underlying address must be within the shared::cta memory space; otherwise
741-
the behavior is undefined.
740+
must be a pointer to generic or shared_cta or shared_cluster memory. When it
741+
is generic, the underlying address must be within the shared_cta memory space;
742+
otherwise the behavior is undefined.
743+
- `count`: This specifies the amount by which the pending arrival count is
744+
decremented. If the `count` argument is not specified, the pending arrival
745+
count is decremented by 1.
746+
- `scope`: This specifies the set of threads that directly observe the memory
747+
synchronizing effect of the `mbarrier.arrive` operation.
748+
- `space`: This indicates the memory space where the mbarrier object resides.
749+
- `relaxed`: When set to true, the `arrive` operation has relaxed memory semantics
750+
and does not provide any ordering or visibility guarantees.
742751

743752
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
744753
}];
745-
let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)";
754+
755+
let results = (outs Optional<I64>:$res);
756+
let arguments = (ins
757+
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
758+
Optional<I32>:$count,
759+
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
760+
DefaultValuedAttr<BoolAttr, "false">:$relaxed);
761+
762+
let assemblyFormat = "$addr (`,` $count^)? attr-dict `:` type($addr) (`->` type($res)^)?";
763+
764+
let hasVerifier = 1;
746765

747766
let extraClassDeclaration = [{
748767
static mlir::NVVM::IDArgPair
@@ -753,7 +772,54 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
753772
string llvmBuilder = [{
754773
auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs(
755774
*op, moduleTranslation, builder);
756-
$res = createIntrinsicCall(builder, id, args);
775+
776+
int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
777+
if (addrSpace != static_cast<unsigned>(NVVM::NVVMMemorySpace::SharedCluster))
778+
$res = createIntrinsicCall(builder, id, args);
779+
else
780+
createIntrinsicCall(builder, id, args);
781+
}];
782+
}
783+
784+
// ODS definition of the `nvvm.mbarrier.arrive_drop` op.
// Operands/attributes: the mbarrier address (generic, shared or shared_cluster
// pointer), an optional i32 `count`, a `scope` attribute defaulting to
// MemScopeKind::CTA, and a `relaxed` flag defaulting to false.
// Result: an optional i64.
def NVVM_MBarrierArriveDropOp : NVVM_Op<"mbarrier.arrive_drop"> {
785+
let summary = "MBarrier Arrive-Drop Operation";
786+
let description = [{
787+
The `nvvm.mbarrier.arrive_drop` operation decrements the expected arrival
788+
count of the *mbarrier object* by `count` and then performs an arrive-on
789+
operation. When `count` is not specified, it defaults to 1. The decrement
790+
of the expected arrival count applies to all the subsequent phases of the
791+
*mbarrier object*. The remaining semantics are identical to those of the
792+
`nvvm.mbarrier.arrive` operation.
793+
794+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
795+
}];
796+
797+
// The result is optional: the op's verifier rejects a result when `addr` is a
// shared_cluster pointer.
let results = (outs Optional<I64>:$res);
798+
let arguments = (ins
799+
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
800+
Optional<I32>:$count,
801+
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
802+
DefaultValuedAttr<BoolAttr, "false">:$relaxed);
803+
804+
// Textual form: `addr [, count] attr-dict : type(addr) [-> type(res)]`.
let assemblyFormat = "$addr (`,` $count^)? attr-dict `:` type($addr) (`->` type($res)^)?";
805+
806+
// verify() is implemented in NVVMDialect.cpp.
let hasVerifier = 1;
807+
808+
let extraClassDeclaration = [{
809+
static mlir::NVVM::IDArgPair
810+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
811+
llvm::IRBuilderBase& builder);
812+
}];
813+
814+
// LLVM IR translation: emits the intrinsic chosen by getIntrinsicIDAndArgs.
// The call's value is bound to `$res` unless `addr` is a shared_cluster
// pointer, in which case the call is emitted without binding a result.
// NOTE(review): `addrSpace` is declared `int` but compared with an `unsigned`
// cast; declaring it `unsigned` would avoid a signed/unsigned comparison.
// NOTE(review): the branch keys on the address space rather than on whether
// the optional result is present — confirm `$res = ...` is safe when the
// result is omitted for a non-shared_cluster `addr`.
string llvmBuilder = [{
815+
auto [id, args] = NVVM::MBarrierArriveDropOp::getIntrinsicIDAndArgs(
816+
*op, moduleTranslation, builder);
817+
818+
int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
819+
if (addrSpace != static_cast<unsigned>(NVVM::NVVMMemorySpace::SharedCluster))
820+
$res = createIntrinsicCall(builder, id, args);
821+
else
822+
createIntrinsicCall(builder, id, args);
757823
}];
758824
}
759825

@@ -803,6 +869,35 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
803869
}];
804870
}
805871

872+
// ODS definition of the `nvvm.mbarrier.arrive_drop.nocomplete` op.
// Operands: the mbarrier address (generic or shared pointer only) and a
// mandatory i32 `count`. Result: a mandatory i64.
def NVVM_MBarrierArriveDropNocompleteOp : NVVM_Op<"mbarrier.arrive_drop.nocomplete">,
873+
Results<(outs I64:$res)>,
874+
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
875+
I32:$count)> {
876+
let summary = "MBarrier Arrive-Drop No-Complete Operation";
877+
let description = [{
878+
The `nvvm.mbarrier.arrive_drop.nocomplete` operation decrements the expected
879+
arrival count of the *mbarrier object* by the amount `count` and then performs
880+
an arrive-on operation on the *mbarrier object* with the guarantee that it
881+
will not cause the barrier to complete its current phase.
882+
883+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
884+
}];
885+
886+
// Textual form: `addr, count attr-dict : type(operands) -> type(res)`.
let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";
887+
888+
let extraClassDeclaration = [{
889+
static mlir::NVVM::IDArgPair
890+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
891+
llvm::IRBuilderBase& builder);
892+
}];
893+
894+
// LLVM IR translation: emits the intrinsic chosen by getIntrinsicIDAndArgs
// and binds the call's value to `$res` unconditionally.
string llvmBuilder = [{
895+
auto [id, args] = NVVM::MBarrierArriveDropNocompleteOp::getIntrinsicIDAndArgs(
896+
*op, moduleTranslation, builder);
897+
$res = createIntrinsicCall(builder, id, args);
898+
}];
899+
}
900+
806901
def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
807902
Arguments<(ins
808903
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,26 @@ static bool isPtrInAddrSpace(mlir::Value ptr, NVVMMemorySpace targetAS) {
5656
return ptrTy.getAddressSpace() == static_cast<unsigned>(targetAS);
5757
}
5858

59+
/// Returns true when `ptr` is a pointer in the NVVM Generic address space.
/// Thin wrapper over isPtrInAddrSpace.
static bool isPtrInGenericSpace(mlir::Value ptr) {
60+
return isPtrInAddrSpace(ptr, NVVMMemorySpace::Generic);
61+
}
62+
5963
/// Returns true when `ptr` is a pointer in the NVVM Shared address space
/// (referred to as shared_cta in the op documentation).
/// Thin wrapper over isPtrInAddrSpace.
static bool isPtrInSharedCTASpace(mlir::Value ptr) {
6064
return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared);
6165
}
6266

67+
/// Returns true when `ptr` is a pointer in the NVVM SharedCluster address
/// space. Thin wrapper over isPtrInAddrSpace.
static bool isPtrInSharedClusterSpace(mlir::Value ptr) {
68+
return isPtrInAddrSpace(ptr, NVVMMemorySpace::SharedCluster);
69+
}
70+
71+
/// Emits an address-space cast of `ptr` through `builder` and returns the
/// resulting LLVM value.
///
/// \param builder  IR builder used to create the cast; its context is also
///                 used to obtain the destination pointer type.
/// \param ptr      LLVM pointer value to cast.
/// \param targetAS Destination NVVM memory space; its enum value is used
///                 directly as the LLVM address-space number.
static llvm::Value *castPtrToAddrSpace(llvm::IRBuilderBase &builder,
72+
llvm::Value *ptr,
73+
NVVMMemorySpace targetAS) {
74+
unsigned AS = static_cast<unsigned>(targetAS);
75+
return builder.CreateAddrSpaceCast(
76+
ptr, llvm::PointerType::get(builder.getContext(), AS));
77+
}
78+
6379
//===----------------------------------------------------------------------===//
6480
// Verifier methods
6581
//===----------------------------------------------------------------------===//
@@ -220,6 +236,32 @@ LogicalResult CpAsyncBulkGlobalToSharedClusterOp::verify() {
220236
return success();
221237
}
222238

239+
/// Shared verifier for the mbarrier arrive-style ops that carry a scope
/// attribute and an optional result (called from MBarrierArriveOp::verify and
/// MBarrierArriveDropOp::verify).
///
/// \param op     Operation on which errors are reported.
/// \param addr   Pointer to the mbarrier object; only its address space is
///               inspected here.
/// \param scope  Memory scope attribute of the op.
/// \param retVal The op's optional result; a null Value means no result.
/// \returns the error emitted on `op` if a check fails, otherwise success().
static LogicalResult verifyMBarrierArriveLikeOp(Operation *op, Value addr,
240+
NVVM::MemScopeKind scope,
241+
Value retVal) {
242+
bool isSharedCluster = isPtrInSharedClusterSpace(addr);
243+
bool hasRetValue = static_cast<bool>(retVal);
244+
245+
// Only the CTA and CLUSTER scopes are accepted.
if (scope != NVVM::MemScopeKind::CTA && scope != NVVM::MemScopeKind::CLUSTER)
246+
return op->emitError("mbarrier scope must be either CTA or Cluster");
247+
248+
// A shared_cluster mbarrier must not declare a result.
if (isSharedCluster && hasRetValue)
249+
return op->emitError(
250+
"mbarrier in shared_cluster space cannot return any value");
251+
252+
return success();
253+
}
254+
255+
/// Verifies `nvvm.mbarrier.arrive` by forwarding its address, scope and
/// optional result to verifyMBarrierArriveLikeOp.
LogicalResult MBarrierArriveOp::verify() {
256+
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
257+
getRes());
258+
}
259+
260+
/// Verifies `nvvm.mbarrier.arrive_drop` by forwarding its address, scope and
/// optional result to verifyMBarrierArriveLikeOp.
LogicalResult MBarrierArriveDropOp::verify() {
261+
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
262+
getRes());
263+
}
264+
223265
LogicalResult ConvertFloatToTF32Op::verify() {
224266
using RndMode = NVVM::FPRoundingMode;
225267
switch (getRnd()) {
@@ -1864,12 +1906,81 @@ mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs(
18641906
/// Selects the LLVM intrinsic for `nvvm.mbarrier.arrive` and builds its
/// argument list.
///
/// The intrinsic is chosen from the op's scope (CTA or CLUSTER), whether the
/// address is a shared_cluster pointer, and the `relaxed` flag. The returned
/// arguments are the (possibly address-space-cast) mbarrier pointer followed
/// by the arrival count.
mlir::NVVM::IDArgPair MBarrierArriveOp::getIntrinsicIDAndArgs(
18651907
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
18661908
auto thisOp = cast<NVVM::MBarrierArriveOp>(op);
1867-
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
1868-
llvm::Intrinsic::ID id = isShared
1869-
? llvm::Intrinsic::nvvm_mbarrier_arrive_shared
1870-
: llvm::Intrinsic::nvvm_mbarrier_arrive;
18711909

1872-
return {id, {mt.lookupValue(thisOp.getAddr())}};
1910+
bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
1911+
bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
1912+
// bit-0: Space
1913+
// bit-1: Scope
1914+
size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
1915+
1916+
// Both tables are laid out in `index` order: (cta scope, cta space),
// (cta scope, cluster space), (cluster scope, cta space),
// (cluster scope, cluster space).
static constexpr llvm::Intrinsic::ID IDs[] = {
1917+
llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cta_space_cta,
1918+
llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cta_space_cluster,
1919+
llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cluster_space_cta,
1920+
llvm::Intrinsic::nvvm_mbarrier_arrive_scope_cluster_space_cluster};
1921+
static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
1922+
llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cta_space_cta,
1923+
llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cta_space_cluster,
1924+
llvm::Intrinsic::nvvm_mbarrier_arrive_relaxed_scope_cluster_space_cta,
1925+
llvm::Intrinsic::
1926+
nvvm_mbarrier_arrive_relaxed_scope_cluster_space_cluster};
1927+
auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
1928+
1929+
// Tidy-up the Intrinsic Args
1930+
// A generic pointer is address-space-cast to the Shared space before being
// passed to the intrinsic; shared and shared_cluster pointers pass through.
bool needCast = isPtrInGenericSpace(thisOp.getAddr());
1931+
llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
1932+
if (needCast)
1933+
mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
1934+
1935+
// When count is not explicitly specified, the default is 1.
1936+
llvm::LLVMContext &ctx = mt.getLLVMContext();
1937+
bool hasCount = static_cast<bool>(thisOp.getCount());
1938+
llvm::Value *count =
1939+
hasCount ? mt.lookupValue(thisOp.getCount())
1940+
: llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 1);
1941+
1942+
return {id, {mbar, count}};
1943+
}
1944+
1945+
/// Selects the LLVM intrinsic for `nvvm.mbarrier.arrive_drop` and builds its
/// argument list.
///
/// The intrinsic is chosen from the op's scope (CTA or CLUSTER), whether the
/// address is a shared_cluster pointer, and the `relaxed` flag. The returned
/// arguments are the (possibly address-space-cast) mbarrier pointer followed
/// by the count.
mlir::NVVM::IDArgPair MBarrierArriveDropOp::getIntrinsicIDAndArgs(
1946+
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
1947+
auto thisOp = cast<NVVM::MBarrierArriveDropOp>(op);
1948+
1949+
bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
1950+
bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
1951+
// bit-0: Space
1952+
// bit-1: Scope
1953+
size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
1954+
1955+
// Both tables are laid out in `index` order: (cta scope, cta space),
// (cta scope, cluster space), (cluster scope, cta space),
// (cluster scope, cluster space).
static constexpr llvm::Intrinsic::ID IDs[] = {
1956+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cta_space_cta,
1957+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cta_space_cluster,
1958+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cluster_space_cta,
1959+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_scope_cluster_space_cluster};
1960+
static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
1961+
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_relaxed_scope_cta_space_cta,
1962+
llvm::Intrinsic::
1963+
nvvm_mbarrier_arrive_drop_relaxed_scope_cta_space_cluster,
1964+
llvm::Intrinsic::
1965+
nvvm_mbarrier_arrive_drop_relaxed_scope_cluster_space_cta,
1966+
llvm::Intrinsic::
1967+
nvvm_mbarrier_arrive_drop_relaxed_scope_cluster_space_cluster};
1968+
auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
1969+
1970+
// Tidy-up the Intrinsic Args
1971+
// A generic pointer is address-space-cast to the Shared space before being
// passed to the intrinsic; shared and shared_cluster pointers pass through.
bool needCast = isPtrInGenericSpace(thisOp.getAddr());
1972+
llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
1973+
if (needCast)
1974+
mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
1975+
1976+
// When count is not explicitly specified, the default is 1.
1977+
llvm::LLVMContext &ctx = mt.getLLVMContext();
1978+
bool hasCount = static_cast<bool>(thisOp.getCount());
1979+
llvm::Value *count =
1980+
hasCount ? mt.lookupValue(thisOp.getCount())
1981+
: llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 1);
1982+
1983+
return {id, {mbar, count}};
18731984
}
18741985

18751986
mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
@@ -1887,6 +1998,21 @@ mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
18871998
return {id, std::move(args)};
18881999
}
18892000

2001+
/// Selects the LLVM intrinsic for `nvvm.mbarrier.arrive_drop.nocomplete` and
/// builds its argument list.
///
/// The `_shared` intrinsic variant is used when the address is a pointer in
/// the Shared space; otherwise the non-suffixed variant is used. The returned
/// arguments are the translated address followed by the translated count.
mlir::NVVM::IDArgPair MBarrierArriveDropNocompleteOp::getIntrinsicIDAndArgs(
2002+
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
2003+
auto thisOp = cast<NVVM::MBarrierArriveDropNocompleteOp>(op);
2004+
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
2005+
llvm::Intrinsic::ID id =
2006+
isShared ? llvm::Intrinsic::nvvm_mbarrier_arrive_drop_noComplete_shared
2007+
: llvm::Intrinsic::nvvm_mbarrier_arrive_drop_noComplete;
2008+
// Fill the Intrinsic Args
2009+
llvm::SmallVector<llvm::Value *> args;
2010+
args.push_back(mt.lookupValue(thisOp.getAddr()));
2011+
args.push_back(mt.lookupValue(thisOp.getCount()));
2012+
2013+
return {id, std::move(args)};
2014+
}
2015+
18902016
mlir::NVVM::IDArgPair MBarrierTestWaitOp::getIntrinsicIDAndArgs(
18912017
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
18922018
auto thisOp = cast<NVVM::MBarrierTestWaitOp>(op);

0 commit comments

Comments
 (0)