-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[mlir][NVVM] Add support for a few more fence Ops #170251
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir][NVVM] Add support for a few more fence Ops #170251
Conversation
|
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-llvm Author: Pradeep Kumar (schwarzschild-radius)

Changes: This commit adds support for the following fence Ops: `fence.acquire.sync_restrict`, `fence.release.sync_restrict`, `fence.proxy.acquire.sync_restrict`, and `fence.proxy.release.sync_restrict`,
and removes fence.sc.cluster. The commit also moves memory.barrier into the Membar/Fence section. Full diff: https://github.com/llvm/llvm-project/pull/170251.diff — 7 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index b3395b7e0a24e..95bf5709030e2 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1319,11 +1319,70 @@ def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait", [NVVMRequiresSM<90>]> {
let assemblyFormat = "attr-dict";
}
-def NVVM_FenceScClusterOp : NVVM_Op<"fence.sc.cluster"> {
- string llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_fence_sc_cluster);
+//===----------------------------------------------------------------------===//
+// NVVM Membar/Fence
+//===----------------------------------------------------------------------===//
+
+def NVVM_MembarOp : NVVM_Op<"memory.barrier">,
+ Arguments<(ins MemScopeKindAttr:$scope)> {
+ let summary = "Memory barrier operation";
+ let description = [{
+ `membar` operation guarantees that prior memory accesses requested by this
+ thread are performed at the specified `scope`, before later memory
+ operations requested by this thread following the membar instruction.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar)
+ }];
+
+ let assemblyFormat = "$scope attr-dict";
+ let llvmBuilder = [{
+ createIntrinsicCall(builder, getMembarIntrinsicID($scope));
+ }];
+}
+
+def NVVM_FenceAcquireSyncRestrictOp : NVVM_Op<"fence.acquire.sync_restrict"> {
+ let summary = "Uni-directional thread fence operation with acquire semantics";
+ let description = [{
+ The `nvvm.fence.acquire.sync_restrict` Op restricts the class of memory
+ operations for which the fence instruction provides the memory ordering guarantees.
+ `sync_restrict` restricts `acquire` memory semantics to `shared_cluster` with cluster scope.
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];
+
let assemblyFormat = "attr-dict";
+ let llvmBuilder = [{
+ createIntrinsicCall(builder,
+ llvm::Intrinsic::nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster);
+ }];
+}
+
+def NVVM_FenceReleaseSyncRestrictOp : NVVM_Op<"fence.release.sync_restrict"> {
+ let summary = "Uni-directional thread fence operation with release semantics";
+ let description = [{
+ The `nvvm.fence.release.sync_restrict` Op restricts the class of memory
+ operations for which the fence instruction provides the memory ordering guarantees.
+ `sync_restrict` restricts `release` memory semantics to `shared_cta` with cluster scope.
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+ }];
+
+ let assemblyFormat = "attr-dict";
+ let llvmBuilder = [{
+ createIntrinsicCall(builder,
+ llvm::Intrinsic::nvvm_fence_release_sync_restrict_space_cta_scope_cluster);
+ }];
+}
+
+def NVVM_FenceMbarrierInitOp : NVVM_Op<"fence.mbarrier.init"> {
+ let description = [{
+ Fence operation that applies to the prior nvvm.mbarrier.init
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+ }];
+
+ let assemblyFormat = "attr-dict";
+ let llvmBuilder = [{
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_fence_mbarrier_init_release_cluster);
+ }];
}
def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">;
@@ -1339,10 +1398,15 @@ def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
}
def ProxyKindAttr : EnumAttr<NVVM_Dialect, ProxyKind, "proxy_kind"> {
+ let description = [{
+ ProxyKind attribute represents a memory proxy which is an abstract label
+ applied to a method of memory access. When two memory operations use distinct
+ methods of memory access, they are said to be different proxies.
+ }];
let assemblyFormat = "`<` $value `>`";
}
-def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
+def NVVM_FenceProxyOp : NVVM_Op<"fence.proxy">,
Arguments<(ins ProxyKindAttr:$kind,
OptionalAttr<SharedSpaceAttr>:$space)> {
let description = [{
@@ -1353,16 +1417,11 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
}];
let assemblyFormat = "attr-dict";
- let extraClassDefinition = [{
- std::string $cppClass::getPtx() {
- std::string ptx = "fence.proxy.";
- ptx += stringifyProxyKind(getKind());
- if(getKind() == NVVM::ProxyKind::async_shared)
- { ptx += "::"; ptx += stringifySharedSpace(getSpace().value()); }
- ptx += ";";
- return ptx;
- }
+
+ let llvmBuilder = [{
+ createIntrinsicCall(builder, getFenceProxyID($kind, $space));
}];
+
let hasVerifier = 1;
}
@@ -1399,23 +1458,6 @@ def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
let hasVerifier = 1;
}
-def NVVM_MembarOp : NVVM_Op<"memory.barrier">,
- Arguments<(ins MemScopeKindAttr:$scope)> {
- let summary = "Memory barrier operation";
- let description = [{
- `membar` operation guarantees that prior memory accesses requested by this
- thread are performed at the specified `scope`, before later memory
- operations requested by this thread following the membar instruction.
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar)
- }];
-
- let assemblyFormat = "$scope attr-dict";
- let llvmBuilder = [{
- createIntrinsicCall(builder, getMembarIntrinsicID($scope), {});
- }];
-}
-
def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
Arguments<(ins MemScopeKindAttr:$scope,
DefaultValuedAttr<ProxyKindAttr,
@@ -1442,6 +1484,48 @@ def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
let hasVerifier = 1;
}
+def NVVM_FenceProxyAcquireSyncRestrictOp : NVVM_Op<"fence.proxy.acquire.sync_restrict">,
+ Arguments<(ins DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy,
+ DefaultValuedAttr<ProxyKindAttr, "ProxyKind::async">:$toProxy)> {
+ let summary = "Uni-directional thread fence operation with acquire semantics";
+ let description = [{
+ The `nvvm.fence.proxy.acquire.sync_restrict` Op is used to establish
+ ordering between a prior memory access performed between proxies. Currently,
+ the ordering is only supported between async and generic proxies. `sync_restrict`
+ restricts `acquire` memory semantics to `shared_cluster` with cluster scope.
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+ }];
+
+ let assemblyFormat = "attr-dict";
+ let llvmBuilder = [{
+ createIntrinsicCall(builder,
+ llvm::Intrinsic::nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster);
+ }];
+
+ let hasVerifier = 1;
+}
+
+def NVVM_FenceProxyReleaseSyncRestrictOp : NVVM_Op<"fence.proxy.release.sync_restrict">,
+ Arguments<(ins DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy,
+ DefaultValuedAttr<ProxyKindAttr, "ProxyKind::async">:$toProxy)> {
+ let summary = "Uni-directional thread fence operation with release semantics";
+ let description = [{
+ The `nvvm.fence.proxy.release.sync_restrict` Op is used to establish
+ ordering between a prior memory access performed between proxies. Currently,
+ the ordering is only supported between async and generic proxies. `sync_restrict`
+ restricts `release` memory semantics to `shared_cta` with cluster scope.
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+ }];
+
+ let assemblyFormat = "attr-dict";
+ let llvmBuilder = [{
+ createIntrinsicCall(builder,
+ llvm::Intrinsic::nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster);
+ }];
+
+ let hasVerifier = 1;
+}
+
def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
@@ -1464,22 +1548,6 @@ def NVVM_SetMaxRegisterOp : NVVM_Op<"setmaxregister"> {
}];
}
-def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> {
- let arguments = (ins );
- let description = [{
- Fence operation that applies on the prior nvvm.mbarrier.init
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
- }];
-
- let assemblyFormat = "attr-dict";
- let extraClassDefinition = [{
- std::string $cppClass::getPtx() {
- return std::string("fence.mbarrier_init.release.cluster;");
- }
- }];
-}
-
def ShflKindBfly : I32EnumAttrCase<"bfly", 0>;
def ShflKindUp : I32EnumAttrCase<"up", 1>;
def ShflKindDown : I32EnumAttrCase<"down", 2>;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 413125245aca8..0954a82d12bea 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -2112,7 +2112,6 @@ LogicalResult NVVM::FenceProxyAcquireOp::verify() {
if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
return emitOpError("uni-directional proxies only support tensormap "
"for to_proxy attribute");
-
return success();
}
@@ -2124,7 +2123,26 @@ LogicalResult NVVM::FenceProxyReleaseOp::verify() {
if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
return emitOpError("uni-directional proxies only support tensormap "
"for to_proxy attribute");
+ return success();
+}
+
+LogicalResult NVVM::FenceProxyAcquireSyncRestrictOp::verify() {
+ if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+ return emitOpError("uni-directional proxies only support generic for "
+ "from_proxy attribute");
+
+ if (getToProxy() != NVVM::ProxyKind::async)
+ return emitOpError("only async is supported for to_proxy attribute");
+ return success();
+}
+
+LogicalResult NVVM::FenceProxyReleaseSyncRestrictOp::verify() {
+ if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+ return emitOpError("uni-directional proxies only support generic for "
+ "from_proxy attribute");
+ if (getToProxy() != NVVM::ProxyKind::async)
+ return emitOpError("only async is supported for to_proxy attribute");
return success();
}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index cecff51e637a5..95d41d05658e6 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -411,6 +411,25 @@ getTcgen05StIntrinsicID(mlir::NVVM::Tcgen05LdStShape shape, uint32_t num) {
llvm_unreachable("unhandled tcgen05.st lowering");
}
+static llvm::Intrinsic::ID getFenceProxyID(NVVM::ProxyKind kind,
+ std::optional<NVVM::SharedSpace> space) {
+ switch (kind) {
+ case NVVM::ProxyKind::alias:
+ return llvm::Intrinsic::nvvm_fence_proxy_alias;
+ case NVVM::ProxyKind::async:
+ return llvm::Intrinsic::nvvm_fence_proxy_async;
+ case NVVM::ProxyKind::async_global:
+ return llvm::Intrinsic::nvvm_fence_proxy_async_global;
+ case NVVM::ProxyKind::async_shared:
+ if (*space == NVVM::SharedSpace::shared_cta)
+ return llvm::Intrinsic::nvvm_fence_proxy_async_shared_cta;
+ else
+ return llvm::Intrinsic::nvvm_fence_proxy_async_shared_cluster;
+ default:
+ llvm_unreachable("unsupported proxy kind for fence.proxy Op");
+ }
+}
+
namespace {
/// Implementation of the dialect interface that converts operations belonging
/// to the NVVM dialect to LLVM IR.
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index fbf8d9efb3bc7..442046b9f3302 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -584,13 +584,6 @@ func.func @cp_async_bulk_wait_group() {
// -----
-func.func @fence_mbarrier_init() {
- //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;"
- nvvm.fence.mbarrier.init
- func.return
-}
-// -----
-
func.func @fence_proxy() {
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.alias;", "" : () -> ()
nvvm.fence.proxy { kind = #nvvm.proxy_kind<alias>}
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index cd7bd37da5763..3bd9d9dd390e5 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -92,13 +92,6 @@ func.func @llvm_nvvm_cluster_wait() {
llvm.return
}
-// CHECK-LABEL: @llvm_nvvm_fence_sc_cluster
-func.func @llvm_nvvm_fence_sc_cluster() {
- // CHECK: nvvm.fence.sc.cluster
- nvvm.fence.sc.cluster
- llvm.return
-}
-
// CHECK-LABEL: @nvvm_shfl
func.func @nvvm_shfl(
%arg0 : i32, %arg1 : i32, %arg2 : i32,
diff --git a/mlir/test/Target/LLVMIR/nvvm/fence.mlir b/mlir/test/Target/LLVMIR/nvvm/fence.mlir
new file mode 100644
index 0000000000000..26ca922e178f2
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/fence.mlir
@@ -0,0 +1,78 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_fence_sync_restrict
+llvm.func @nvvm_fence_sync_restrict() {
+ // CHECK: call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster()
+ nvvm.fence.acquire.sync_restrict
+ // CHECK: call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster()
+ nvvm.fence.release.sync_restrict
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_fence_proxy
+llvm.func @nvvm_fence_proxy() {
+ // CHECK: call void @llvm.nvvm.fence.proxy.alias()
+ nvvm.fence.proxy {kind = #nvvm.proxy_kind<alias>}
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.async()
+ nvvm.fence.proxy {kind = #nvvm.proxy_kind<async>}
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.async.global()
+ nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.global>}
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.async.shared_cta()
+ nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.async.shared_cluster()
+ nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>}
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_fence_proxy_sync_restrict
+llvm.func @nvvm_fence_proxy_sync_restrict() {
+ // CHECK: call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
+ nvvm.fence.proxy.acquire.sync_restrict
+ // CHECK: call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()
+ nvvm.fence.proxy.release.sync_restrict
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
+llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+ nvvm.fence.proxy.release #nvvm.mem_scope<cta>
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+ nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+ nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+ nvvm.fence.proxy.release #nvvm.mem_scope<sys>
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
+llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
+ %c128 = llvm.mlir.constant(128) : i32
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
+ llvm.return
+}
+
+// CHECK-LABEL: @fence_mbarrier_init
+llvm.func @fence_mbarrier_init() {
+ // CHECK: call void @llvm.nvvm.fence.mbarrier_init.release.cluster()
+ nvvm.fence.mbarrier.init
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 5cba5c4fceefd..c4a69097692cb 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -699,42 +699,6 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant},
llvm.return
}
-
-// -----
-// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
-llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
- %c128 = llvm.mlir.constant(128) : i32
- // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
- nvvm.fence.proxy.release #nvvm.mem_scope<cta>
-
- // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
- nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
-
- // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
- nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
-
- // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
- nvvm.fence.proxy.release #nvvm.mem_scope<sys>
- llvm.return
-}
-
-// -----
-// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
-llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
- %c128 = llvm.mlir.constant(128) : i32
- // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
- nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
-
- // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
- nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
-
- // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
- nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
-
- // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
- nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
- llvm.return
-}
// -----
// CHECK-LABEL: @nvvm_exit
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
0c128f8 to
4ebde8c
Compare
| def NVVM_FenceProxyAcquireSyncRestrictOp : NVVM_Op<"fence.proxy.acquire.sync_restrict">, | ||
| Arguments<(ins DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy, | ||
| DefaultValuedAttr<ProxyKindAttr, "ProxyKind::async">:$toProxy)> { | ||
| let summary = "Uni-directional thread fence operation with acquire semantics"; | ||
| let description = [{ | ||
| The `nvvm.fence.proxy.acquire.sync_restrict` Op used to establish | ||
| ordering between a prior memory access performed between proxies. Currently, | ||
| the ordering is only supported between async and generic proxies. `sync_restrict` | ||
| restricts `acquire` memory semantics to `shared_cluster` with cluster scope. | ||
| [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar) | ||
| }]; | ||
|
|
||
| let assemblyFormat = "attr-dict"; | ||
| let llvmBuilder = [{ | ||
| createIntrinsicCall(builder, | ||
| llvm::Intrinsic::nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster); | ||
| }]; | ||
|
|
||
| let hasVerifier = 1; | ||
| } | ||
|
|
||
| def NVVM_FenceProxyReleaseSyncRestrictOp : NVVM_Op<"fence.proxy.release.sync_restrict">, | ||
| Arguments<(ins DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy, | ||
| DefaultValuedAttr<ProxyKindAttr, "ProxyKind::async">:$toProxy)> { | ||
| let summary = "Uni-directional thread fence operation with release semantics"; | ||
| let description = [{ | ||
| The `nvvm.fence.proxy.release.sync_restrict` Op used to establish | ||
| ordering between a prior memory access performed between proxies. Currently, | ||
| the ordering is only supported between async and generic proxies. `sync_restrict` | ||
| restricts `release` memory semantics to `shared_cta` with cluster scope. | ||
| [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar) | ||
| }]; | ||
|
|
||
| let assemblyFormat = "attr-dict"; | ||
| let llvmBuilder = [{ | ||
| createIntrinsicCall(builder, | ||
| llvm::Intrinsic::nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster); | ||
| }]; | ||
|
|
||
| let hasVerifier = 1; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we combine the ops?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Combined both the Ops into in the latest revision
bb4dfb9 to
57c60d6
Compare
mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
Outdated
Show resolved
Hide resolved
mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
Outdated
Show resolved
Hide resolved
57c60d6 to
570f3b8
Compare
|
|
||
| // ----- | ||
|
|
||
| llvm.func @fence_proxy_sync_restrict() { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: I agree the split-line helps us to have same names for all these functions, but it would be better to add a suffix (or prefix) that tells us what are we testing.
durga4github
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for addressing the comments.
The latest revision LGTM
This commit adds support for the following fence Ops: - fence.sync_restrict - fence.proxy.sync_restrict The commit also moves memory.barrier into the Membar/Fence section and migrates fence.mbarrier.init to intrinsics
570f3b8 to
d839dae
Compare
This commit adds support for the following fence Ops: - fence.sync_restrict - fence.proxy.sync_restrict The commit also moves memory.barrier into the Membar/Fence section, migrates fence.mbarrier.init to intrinsics and consolidates fence related tests under nvvm/fence.mlir and nvvm/fence-invalid.mlir
This commit adds support for the following fence Ops: - fence.sync_restrict - fence.proxy.sync_restrict The commit also moves memory.barrier into the Membar/Fence section, migrates fence.mbarrier.init to intrinsics and consolidates fence related tests under nvvm/fence.mlir and nvvm/fence-invalid.mlir
This commit adds support for the following fence Ops: fence.sync_restrict and fence.proxy.sync_restrict.
The commit also moves memory.barrier into the Membar/Fence section, migrates fence.mbarrier.init to intrinsics and consolidates fence related tests under nvvm/fence.mlir and nvvm/fence-invalid.mlir