[AMDGPU] tensor_{load_to/store_from}_lds => ..._d2 simplification #171540
Conversation
This commit adds the rewrite:
```
llvm.amdgcn.tensor.{load.to/store.from}.lds(
<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer,
<4 x i32> zeroinitializer, i32 [cachepolicy])
=>
llvm.amdgcn.tensor.{load.to/store.from}.lds.d2(
    <4 x i32> %d0, <8 x i32> %d1, i32 [cachepolicy])
```
This is justified because, when the short encoding that uses the NULL
SGPR for registers 2 and 3 is used, the hardware acts as if those
registers were zero, including in gather mode.
It is always safe not to run this transformation.
(Note: tests were LLM'd and then tweaked.)
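As a concrete example (taken from the new test file below), a call whose third and fourth descriptor-group operands are both zero folds to the three-operand `.d2` form, with the cache-policy operand carried over:
```
; Before instcombine
call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 0)

; After instcombine
call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %d0, <8 x i32> %d1, i32 0)
```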
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu

Author: Krzysztof Drewniak (krzysz00)

Full diff: https://github.com/llvm/llvm-project/pull/171540.diff

2 Files Affected:
- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
- llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 47926734d64d4..d3525e1eca304 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1737,6 +1737,26 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ case Intrinsic::amdgcn_tensor_store_from_lds: {
+ Value *D2 = II.getArgOperand(2);
+ Value *D3 = II.getArgOperand(3);
+ // We know that not passing the second and third tensor DMA groups is
+ // equivalent to passing zeroes for those registers, so we rewrite to the
+ // shorter form here.
+ if (!match(D2, m_Zero()) || !match(D3, m_Zero()))
+ return std::nullopt;
+
+ auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds
+ ? Intrinsic::amdgcn_tensor_load_to_lds_d2
+ : Intrinsic::amdgcn_tensor_store_from_lds_d2;
+ CallInst *NewII = IC.Builder.CreateIntrinsic(
+ ShortIntrinsic,
+ {II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)}, &II);
+ NewII->takeName(&II);
+ NewII->copyMetadata(II);
+ return IC.eraseInstFromFunction(II);
+ }
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll b/llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll
new file mode 100644
index 0000000000000..e9cf704a8026e
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s
+
+; --------------------------------------------------------------------
+; tensor_load_to_lds: D2 and D3 are zero -> convert to _d2 variant
+; --------------------------------------------------------------------
+
+define void @test_tensor_load_to_lds_d2_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1) {
+; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_d3_zero(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 0)
+ ret void
+}
+
+; --------------------------------------------------------------------
+; non-matching patterns for tensor_load_to_lds simplification
+; --------------------------------------------------------------------
+
+define void @test_tensor_load_to_lds_d2_zero_d3_nonzero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d3) {
+; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_zero_d3_nonzero(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D3:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> zeroinitializer, <4 x i32> [[D3]], i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> %d3, i32 0)
+ ret void
+}
+
+define void @test_tensor_load_to_lds_d2_nonzero_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d2) {
+; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_nonzero_d3_zero(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D2:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> [[D2]], <4 x i32> zeroinitializer, i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> %d2, <4 x i32> zeroinitializer, i32 0)
+ ret void
+}
+
+define void @test_tensor_load_to_lds_d2_d3_nonzero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d2, <4 x i32> inreg %d3) {
+; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_d3_nonzero(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D2:%.*]], <4 x i32> inreg [[D3:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> [[D2]], <4 x i32> [[D3]], i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> %d2, <4 x i32> %d3, i32 0)
+ ret void
+}
+
+; --------------------------------------------------------------------
+; tensor_store_from_lds: D2 and D3 are zero -> convert to _d2 variant
+; --------------------------------------------------------------------
+
+define void @test_tensor_store_from_lds_d2_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1) {
+; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_d3_zero(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 0)
+ ret void
+}
+
+; --------------------------------------------------------------------
+; non-matching patterns for tensor_store_from_lds simplification
+; --------------------------------------------------------------------
+
+define void @test_tensor_store_from_lds_d2_zero_d3_nonzero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d3) {
+; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_zero_d3_nonzero(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D3:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> zeroinitializer, <4 x i32> [[D3]], i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> %d3, i32 0)
+ ret void
+}
+
+define void @test_tensor_store_from_lds_d2_nonzero_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d2) {
+; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_nonzero_d3_zero(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D2:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> [[D2]], <4 x i32> zeroinitializer, i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> %d2, <4 x i32> zeroinitializer, i32 0)
+ ret void
+}
+
+define void @test_tensor_store_from_lds_d2_d3_nonzero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d2, <4 x i32> inreg %d3) {
+; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_d3_nonzero(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D2:%.*]], <4 x i32> inreg [[D3:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> [[D2]], <4 x i32> [[D3]], i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> %d2, <4 x i32> %d3, i32 0)
+ ret void
+}
+
+; --------------------------------------------------------------------
+; ensure cachepolicy is preserved
+; --------------------------------------------------------------------
+
+define void @test_tensor_load_to_lds_d2_d3_zero_cachepolicy(<4 x i32> inreg %d0, <8 x i32> inreg %d1) {
+; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_d3_zero_cachepolicy(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 1)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
+ ret void
+}
+
+define void @test_tensor_store_from_lds_d2_d3_zero_cachepolicy(<4 x i32> inreg %d0, <8 x i32> inreg %d1) {
+; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_d3_zero_cachepolicy(
+; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) {
+; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 1)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
+ ret void
+}
+
+declare void @llvm.amdgcn.tensor.load.to.lds(<4 x i32>, <8 x i32>, <4 x i32>, <4 x i32>, i32 immarg)
+declare void @llvm.amdgcn.tensor.store.from.lds(<4 x i32>, <8 x i32>, <4 x i32>, <4 x i32>, i32 immarg)
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed.
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed.
// We know that not passing the second and third tensor DMA groups is
// equivalent to passing zeroes for those registers, so we rewrite to the
// shorter form here.
if (!match(D2, m_Zero()) || !match(D3, m_Zero()))
Can you also do this for undef?
Yep, we're now matching undef/poison too
declare void @llvm.amdgcn.tensor.load.to.lds(<4 x i32>, <8 x i32>, <4 x i32>, <4 x i32>, i32 immarg)
declare void @llvm.amdgcn.tensor.store.from.lds(<4 x i32>, <8 x i32>, <4 x i32>, <4 x i32>, i32 immarg)
Test poison case?
Done (but no undef test because we seem to not like those)
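A poison-operand test along these lines exercises the widened match; this is an illustrative sketch (the function name and exact shape are assumptions, not necessarily what landed in the patch):
```
; Sketch: poison descriptor groups 2 and 3 are accepted by the widened match
; and the call is folded to the .d2 form, same as the all-zero case.
define void @test_tensor_load_to_lds_d2_d3_poison(<4 x i32> inreg %d0, <8 x i32> inreg %d1) {
  call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> poison, <4 x i32> poison, i32 0)
  ret void
}
```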