Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions flang/lib/Optimizer/Builder/IntrinsicCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3333,8 +3333,7 @@ IntrinsicLibrary::genBarrierArrive(mlir::Type resultType,
assert(args.size() == 1);
mlir::Value barrier = convertPtrToNVVMSpace(
builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType,
barrier)
return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier)
.getResult();
}

Expand Down
2 changes: 1 addition & 1 deletion flang/test/Lower/CUDA/cuda-device-proc.cuf
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ end subroutine

! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64
! CHECK: %{{.*}} = nvvm.mbarrier.arrive %[[SHARED_PTR]] : !llvm.ptr<3> -> i64

! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
Expand Down
142 changes: 56 additions & 86 deletions mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -656,8 +656,8 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
}

def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
Results<(outs LLVM_Type:$res)>,
Arguments<(ins LLVM_AnyPointer:$addr)> {
Results<(outs I64:$res)>,
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> {
let summary = "MBarrier Arrive Operation";
let description = [{
The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the
Expand All @@ -674,36 +674,32 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
value are implementation-specific.

The operation takes the following operand:
- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
addressing, but the address must still be in the shared memory space.
- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
must be a pointer to generic or shared::cta memory. When it is generic, the
underlying address must be within the shared::cta memory space; otherwise
the behavior is undefined.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
}];
string llvmBuilder = [{
$res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive, {$addr});
}];
let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)";
}

def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">,
Results<(outs LLVM_Type:$res)>,
Arguments<(ins LLVM_PointerShared:$addr)> {
let summary = "Shared MBarrier Arrive Operation";
let description = [{
This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object*
should be accessed using a shared-memory pointer instead of a generic-memory pointer.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
let extraClassDeclaration = [{
static mlir::NVVM::IDArgPair
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
llvm::IRBuilderBase& builder);
}];

string llvmBuilder = [{
$res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_shared, {$addr});
auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs(
*op, moduleTranslation, builder);
$res = createIntrinsicCall(builder, id, args);
}];
let assemblyFormat = "$addr attr-dict `:` qualified(type($addr)) `->` type($res)";
}

def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
Results<(outs LLVM_Type:$res)>,
Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> {
Results<(outs I64:$res)>,
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
I32:$count)> {
let summary = "MBarrier Arrive No-Complete Operation";
let description = [{
The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation
Expand All @@ -721,33 +717,29 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">,
captures the phase of the *mbarrier object* prior to the arrive-on operation.

The operation takes the following operands:
- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
addressing, but the address must still be in the shared memory space.
- `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
must be a pointer to generic or shared::cta memory. When it is generic, the
underlying address must be within the shared::cta memory space; otherwise
the behavior is undefined.
- `count`: Integer specifying the count argument to the arrive-on operation.
Must be in the valid range as specified in the *mbarrier object* contents.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
}];
string llvmBuilder = [{
$res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete, {$addr, $count});
}];
let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";
}

def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">,
Results<(outs LLVM_Type:$res)>,
Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> {
let summary = "Shared MBarrier Arrive No-Complete Operation";
let description = [{
This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object*
should be accessed using a shared-memory pointer instead of a generic-memory pointer.
let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
let extraClassDeclaration = [{
static mlir::NVVM::IDArgPair
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
llvm::IRBuilderBase& builder);
}];

string llvmBuilder = [{
$res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared, {$addr, $count});
auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
*op, moduleTranslation, builder);
$res = createIntrinsicCall(builder, id, args);
}];
let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";
}

def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
Expand Down Expand Up @@ -896,8 +888,9 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p
}

def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
Results<(outs LLVM_Type:$res)>,
Arguments<(ins LLVM_AnyPointer:$addr, LLVM_Type:$state)> {
Results<(outs I1:$res)>,
Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
I64:$state)> {
let summary = "MBarrier Non-Blocking Test Wait Operation";
let description = [{
The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the
Expand Down Expand Up @@ -944,26 +937,20 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
}];
string llvmBuilder = [{
$res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state});
}];
let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";
}

def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">,
Results<(outs LLVM_Type:$res)>,
Arguments<(ins LLVM_PointerShared:$addr, LLVM_Type:$state)> {
let summary = "Shared MBarrier Non-Blocking Test Wait Operation";
let description = [{
This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object*
should be accessed using a shared-memory pointer instead of a generic-memory pointer.
let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)
let extraClassDeclaration = [{
static mlir::NVVM::IDArgPair
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
llvm::IRBuilderBase& builder);
}];

string llvmBuilder = [{
$res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state});
auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
*op, moduleTranslation, builder);
$res = createIntrinsicCall(builder, id, args);
}];
let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";
}

//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1534,47 +1521,30 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> {
The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track
all prior cp.async operations initiated by the executing thread.
The `addr` operand specifies the address of the *mbarrier object*
in generic address space. The `noinc` attr impacts how the
mbarrier's state is updated.
in generic or shared::cta address space. When it is generic, the
underlying memory should fall within the shared::cta space;
otherwise the behavior is undefined. The `noinc` attr impacts
how the mbarrier's state is updated.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
}];
let assemblyFormat = "$addr attr-dict `:` type(operands)";

let arguments = (ins
LLVM_AnyPointer:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc);
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
DefaultValuedAttr<I1Attr, "0">:$noinc);

string llvmBuilder = [{
auto intId = $noinc ?
llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc :
llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive;

createIntrinsicCall(builder, intId, {$addr});
}];
}

def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.shared"> {
let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive.shared";
let description = [{
The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object*
track all prior cp.async operations initiated by the executing thread.
The `addr` operand specifies the address of the *mbarrier object* in
shared memory. The `noinc` attr impacts how the mbarrier's state
is updated.

[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
}];
let assemblyFormat = "$addr attr-dict `:` type(operands)";

let arguments = (ins
LLVM_PointerShared:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc);
let extraClassDeclaration = [{
static mlir::NVVM::IDArgPair
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
llvm::IRBuilderBase& builder);
}];

string llvmBuilder = [{
auto intId = $noinc ?
llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared :
llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared;

createIntrinsicCall(builder, intId, {$addr});
auto [id, args] = NVVM::CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs(
*op, moduleTranslation, builder);
createIntrinsicCall(builder, id, args);
}];
}

Expand Down
26 changes: 5 additions & 21 deletions mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -865,13 +865,7 @@ struct NVGPUMBarrierArriveLowering
adaptor.getMbarId(), rewriter);
Type tokenType = getTypeConverter()->convertType(
nvgpu::MBarrierTokenType::get(op->getContext()));
if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType,
barrier);
} else {
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType,
barrier);
}
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, barrier);
return success();
}
};
Expand All @@ -892,13 +886,8 @@ struct NVGPUMBarrierArriveNoCompleteLowering
Type tokenType = getTypeConverter()->convertType(
nvgpu::MBarrierTokenType::get(op->getContext()));
Value count = truncToI32(b, adaptor.getCount());
if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
op, tokenType, barrier, count);
} else {
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>(
op, tokenType, barrier, count);
}
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>(
op, tokenType, barrier, count);
return success();
}
};
Expand All @@ -915,13 +904,8 @@ struct NVGPUMBarrierTestWaitLowering
getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
adaptor.getMbarId(), rewriter);
Type retType = rewriter.getI1Type();
if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>(
op, retType, barrier, adaptor.getToken());
} else {
rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(
op, retType, barrier, adaptor.getToken());
}
rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(op, retType, barrier,
adaptor.getToken());
return success();
}
};
Expand Down
82 changes: 72 additions & 10 deletions mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1637,15 +1637,21 @@ std::string NVVM::MBarrierInitOp::getPtx() {
// getIntrinsicID/getIntrinsicIDAndArgs methods
//===----------------------------------------------------------------------===//

static bool isPtrInAddrSpace(mlir::Value ptr, NVVMMemorySpace targetAS) {
auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(ptr.getType());
return ptrTy.getAddressSpace() == static_cast<unsigned>(targetAS);
}

static bool isPtrInSharedCTASpace(mlir::Value ptr) {
return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared);
}

mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierInitOp>(op);
unsigned addressSpace =
llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType())
.getAddressSpace();
llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared)
? llvm::Intrinsic::nvvm_mbarrier_init_shared
: llvm::Intrinsic::nvvm_mbarrier_init;
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_init_shared
: llvm::Intrinsic::nvvm_mbarrier_init;

// Fill the Intrinsic Args
llvm::SmallVector<llvm::Value *> args;
Expand All @@ -1658,16 +1664,72 @@ mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs(
mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierInvalOp>(op);
unsigned addressSpace =
llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType())
.getAddressSpace();
llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared)
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
llvm::Intrinsic::ID id = isShared
? llvm::Intrinsic::nvvm_mbarrier_inval_shared
: llvm::Intrinsic::nvvm_mbarrier_inval;

return {id, {mt.lookupValue(thisOp.getAddr())}};
}

mlir::NVVM::IDArgPair MBarrierArriveOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierArriveOp>(op);
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
llvm::Intrinsic::ID id = isShared
? llvm::Intrinsic::nvvm_mbarrier_arrive_shared
: llvm::Intrinsic::nvvm_mbarrier_arrive;

return {id, {mt.lookupValue(thisOp.getAddr())}};
}

mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op);
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
llvm::Intrinsic::ID id =
isShared ? llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared
: llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete;
// Fill the Intrinsic Args
llvm::SmallVector<llvm::Value *> args;
args.push_back(mt.lookupValue(thisOp.getAddr()));
args.push_back(mt.lookupValue(thisOp.getCount()));

return {id, std::move(args)};
}

mlir::NVVM::IDArgPair MBarrierTestWaitOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierTestWaitOp>(op);
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());
llvm::Intrinsic::ID id = isShared
? llvm::Intrinsic::nvvm_mbarrier_test_wait_shared
: llvm::Intrinsic::nvvm_mbarrier_test_wait;
// Fill the Intrinsic Args
llvm::SmallVector<llvm::Value *> args;
args.push_back(mt.lookupValue(thisOp.getAddr()));
args.push_back(mt.lookupValue(thisOp.getState()));

return {id, std::move(args)};
}

mlir::NVVM::IDArgPair CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::CpAsyncMBarrierArriveOp>(op);
bool isShared = isPtrInSharedCTASpace(thisOp.getAddr());

llvm::Intrinsic::ID id;
if (thisOp.getNoinc()) {
id = isShared ? llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared
: llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc;
} else {
id = isShared ? llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared
: llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive;
}

return {id, {mt.lookupValue(thisOp.getAddr())}};
}

#define CP_ASYNC_ID_IMPL(mod, size, suffix) \
llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix

Expand Down
Loading