-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[mlir][amdgpu] memory_counter_wait tensor counter support
#171153
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-amdgpu — Author: Ivan Butygin (Hardcode84). Changes: full diff at https://github.com/llvm/llvm-project/pull/171153.diff — 7 files affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index ba078f52d24f6..56160d3e8fe85 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -906,7 +906,8 @@ def AMDGPU_MemoryCounterWaitOp :
OptionalAttr<I32Attr>:$load,
OptionalAttr<I32Attr>:$store,
OptionalAttr<I32Attr>:$ds,
- OptionalAttr<I32Attr>:$exp
+ OptionalAttr<I32Attr>:$exp,
+ OptionalAttr<I32Attr>:$tensor
)>
{
let summary = "Wait for specified hardware counters";
@@ -919,7 +920,7 @@ def AMDGPU_MemoryCounterWaitOp :
counters into one.
}];
let assemblyFormat = [{
- oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` ) attr-dict
+ oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` | `tensor` `(` $tensor `)` ) attr-dict
}];
let hasCanonicalizer = 1;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index f3b0da0120998..7584b17075225 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -506,10 +506,16 @@ struct MemoryCounterWaitOpLowering
if (std::optional<int> exp = adaptor.getExp())
ROCDL::WaitExpcntOp::create(rewriter, loc, *exp);
+ if (std::optional<int> tensor = adaptor.getTensor())
+ ROCDL::WaitTensorcntOp::create(rewriter, loc, *tensor);
+
rewriter.eraseOp(op);
return success();
}
+ if (adaptor.getTensor())
+ return op.emitOpError("unsupported chipset");
+
auto getVal = [](Attribute attr) -> unsigned {
if (attr)
return cast<IntegerAttr>(attr).getInt();
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 4a85db3ecf6f8..b7a665b0f5367 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -614,10 +614,12 @@ struct FuseMemoryCounterWaitOp final : OpRewritePattern<MemoryCounterWaitOp> {
auto setters = {&MemoryCounterWaitOp::setLoad,
&MemoryCounterWaitOp::setStore, &MemoryCounterWaitOp::setDs,
- &MemoryCounterWaitOp::setExp};
- auto lhsVals = {op.getLoad(), op.getStore(), op.getDs(), op.getExp()};
+ &MemoryCounterWaitOp::setExp,
+ &MemoryCounterWaitOp::setTensor};
+ auto lhsVals = {op.getLoad(), op.getStore(), op.getDs(), op.getExp(),
+ op.getTensor()};
auto rhsVals = {next.getLoad(), next.getStore(), next.getDs(),
- next.getExp()};
+ next.getExp(), next.getTensor()};
rewriter.modifyOpInPlace(op, [&] {
for (auto [setter, lhs, rhs] :
llvm::zip_equal(setters, lhsVals, rhsVals)) {
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
index 1016ee859e462..537ef59b503a6 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
@@ -1,7 +1,7 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
// CHECK-LABEL: func @memory_counter_wait
func.func @memory_counter_wait() {
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir
new file mode 100644
index 0000000000000..5b29e01abebdb
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s
+
+// CHECK-LABEL: func @memory_counter_wait_tensor
+func.func @memory_counter_wait_tensor() {
+ // CHECK: rocdl.s.wait.tensorcnt 3
+ amdgpu.memory_counter_wait tensor(3)
+
+ return
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir
new file mode 100644
index 0000000000000..1d2f692bee488
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir
@@ -0,0 +1,11 @@
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx942
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx1030
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx1100
+
+func.func @memory_counter_wait_tensor() {
+ // expected-error @below{{failed to legalize operation 'amdgpu.memory_counter_wait'}}
+ // expected-error @below{{'amdgpu.memory_counter_wait' op unsupported chipset}}
+ amdgpu.memory_counter_wait tensor(0)
+
+ return
+}
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index c66e9ed5d6f6d..cff1d3f2ac1fd 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -250,10 +250,10 @@ func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4
// CHECK-LABEL fuse_memory_counter_wait
func.func @fuse_memory_counter_wait() {
// CHECK: amdgpu.memory_counter_wait
- // CHECK-SAME: load(1) store(2) ds(2) exp(1)
+ // CHECK-SAME: load(1) store(2) ds(2) exp(1) tensor(0)
// CHECK-NEXT: return
- amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
- amdgpu.memory_counter_wait load(4) store(3) ds(2) exp(1)
+ amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4) tensor(5)
+ amdgpu.memory_counter_wait load(4) store(3) ds(2) exp(1) tensor(0)
return
}
|
|
@llvm/pr-subscribers-mlir-gpu — Author: Ivan Butygin (Hardcode84). Changes: full diff at https://github.com/llvm/llvm-project/pull/171153.diff — 7 files affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index ba078f52d24f6..56160d3e8fe85 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -906,7 +906,8 @@ def AMDGPU_MemoryCounterWaitOp :
OptionalAttr<I32Attr>:$load,
OptionalAttr<I32Attr>:$store,
OptionalAttr<I32Attr>:$ds,
- OptionalAttr<I32Attr>:$exp
+ OptionalAttr<I32Attr>:$exp,
+ OptionalAttr<I32Attr>:$tensor
)>
{
let summary = "Wait for specified hardware counters";
@@ -919,7 +920,7 @@ def AMDGPU_MemoryCounterWaitOp :
counters into one.
}];
let assemblyFormat = [{
- oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` ) attr-dict
+ oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` | `tensor` `(` $tensor `)` ) attr-dict
}];
let hasCanonicalizer = 1;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index f3b0da0120998..7584b17075225 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -506,10 +506,16 @@ struct MemoryCounterWaitOpLowering
if (std::optional<int> exp = adaptor.getExp())
ROCDL::WaitExpcntOp::create(rewriter, loc, *exp);
+ if (std::optional<int> tensor = adaptor.getTensor())
+ ROCDL::WaitTensorcntOp::create(rewriter, loc, *tensor);
+
rewriter.eraseOp(op);
return success();
}
+ if (adaptor.getTensor())
+ return op.emitOpError("unsupported chipset");
+
auto getVal = [](Attribute attr) -> unsigned {
if (attr)
return cast<IntegerAttr>(attr).getInt();
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 4a85db3ecf6f8..b7a665b0f5367 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -614,10 +614,12 @@ struct FuseMemoryCounterWaitOp final : OpRewritePattern<MemoryCounterWaitOp> {
auto setters = {&MemoryCounterWaitOp::setLoad,
&MemoryCounterWaitOp::setStore, &MemoryCounterWaitOp::setDs,
- &MemoryCounterWaitOp::setExp};
- auto lhsVals = {op.getLoad(), op.getStore(), op.getDs(), op.getExp()};
+ &MemoryCounterWaitOp::setExp,
+ &MemoryCounterWaitOp::setTensor};
+ auto lhsVals = {op.getLoad(), op.getStore(), op.getDs(), op.getExp(),
+ op.getTensor()};
auto rhsVals = {next.getLoad(), next.getStore(), next.getDs(),
- next.getExp()};
+ next.getExp(), next.getTensor()};
rewriter.modifyOpInPlace(op, [&] {
for (auto [setter, lhs, rhs] :
llvm::zip_equal(setters, lhsVals, rhsVals)) {
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
index 1016ee859e462..537ef59b503a6 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
@@ -1,7 +1,7 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
// CHECK-LABEL: func @memory_counter_wait
func.func @memory_counter_wait() {
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir
new file mode 100644
index 0000000000000..5b29e01abebdb
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s
+
+// CHECK-LABEL: func @memory_counter_wait_tensor
+func.func @memory_counter_wait_tensor() {
+ // CHECK: rocdl.s.wait.tensorcnt 3
+ amdgpu.memory_counter_wait tensor(3)
+
+ return
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir
new file mode 100644
index 0000000000000..1d2f692bee488
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir
@@ -0,0 +1,11 @@
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx942
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx1030
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx1100
+
+func.func @memory_counter_wait_tensor() {
+ // expected-error @below{{failed to legalize operation 'amdgpu.memory_counter_wait'}}
+ // expected-error @below{{'amdgpu.memory_counter_wait' op unsupported chipset}}
+ amdgpu.memory_counter_wait tensor(0)
+
+ return
+}
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index c66e9ed5d6f6d..cff1d3f2ac1fd 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -250,10 +250,10 @@ func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4
// CHECK-LABEL fuse_memory_counter_wait
func.func @fuse_memory_counter_wait() {
// CHECK: amdgpu.memory_counter_wait
- // CHECK-SAME: load(1) store(2) ds(2) exp(1)
+ // CHECK-SAME: load(1) store(2) ds(2) exp(1) tensor(0)
// CHECK-NEXT: return
- amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
- amdgpu.memory_counter_wait load(4) store(3) ds(2) exp(1)
+ amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4) tensor(5)
+ amdgpu.memory_counter_wait load(4) store(3) ds(2) exp(1) tensor(0)
return
}
|
krzysz00
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Approved, with a note that the compiler team's working on a marker/wait API for these things that'll eliminate the need to do our own counting.
|
LLVM Buildbot has detected a new failure on builder 129 (build 34580). Full details are available at: https://lab.llvm.org/buildbot/#/builders/129/builds/34580 — here is the relevant piece of the build log for reference: |
No description provided.