-
Notifications
You must be signed in to change notification settings - Fork 15k
[AMDGPU] Generate s_absdiff_i32 #164835
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Generate s_absdiff_i32 #164835
Conversation
Signed-off-by: John Lu <John.Lu@amd.com>
|
@llvm/pr-subscribers-backend-amdgpu Author: None (LU-JOHN) Changes: Generate s_absdiff_i32. Tested in absdiff.ll. Also update s_cmp_0.ll to test that s_absdiff_i32 is foldable with a s_cmp_lg_u32 sX, 0. Full diff: https://github.com/llvm/llvm-project/pull/164835.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..220318cb6bd40 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -838,9 +838,11 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo <
let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
}
-let Defs = [SCC] in {
-def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
-} // End Defs = [SCC]
+let isCommutable = 1, Defs = [SCC] in {
+def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32",
+ [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))]
+>;
+} // End isCommutable = 1, Defs = [SCC]
let SubtargetPredicate = isGFX8GFX9 in {
def S_RFE_RESTORE_B64 : SOP2_Pseudo <
diff --git a/llvm/test/CodeGen/AMDGPU/absdiff.ll b/llvm/test/CodeGen/AMDGPU/absdiff.ll
new file mode 100644
index 0000000000000..3ce58f55ff0b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/absdiff.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+
+define amdgpu_ps i32 @absdiff_v1(i32 inreg %arg, i32 inreg %arg2) {
+; CHECK-LABEL: absdiff_v1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_absdiff_i32 s0, s0, s1
+; CHECK-NEXT: ; return to shader part epilog
+ %diff = sub i32 %arg, %arg2
+ %res = call i32 @llvm.abs.i32(i32 %diff, i1 false)
+ ret i32 %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index dd5f838b4a206..0166d7ac7ddc2 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -110,6 +110,21 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
ret i32 %zext
}
+define amdgpu_ps i32 @absdiff32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: absdiff32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_absdiff_i32 s0, s0, s1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %diff = sub i32 %val0, %val1
+ %result = call i32 @llvm.abs.i32(i32 %diff, i1 false)
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: and32:
; CHECK: ; %bb.0:
@@ -608,14 +623,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB35_2
+; CHECK-NEXT: s_cbranch_scc0 .LBB36_2
; CHECK-NEXT: ; %bb.1: ; %endif
; CHECK-NEXT: s_mov_b32 s0, 1
-; CHECK-NEXT: s_branch .LBB35_3
-; CHECK-NEXT: .LBB35_2: ; %if
+; CHECK-NEXT: s_branch .LBB36_3
+; CHECK-NEXT: .LBB36_2: ; %if
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_branch .LBB35_3
-; CHECK-NEXT: .LBB35_3:
+; CHECK-NEXT: s_branch .LBB36_3
+; CHECK-NEXT: .LBB36_3:
%cmp = icmp ne ptr addrspace(4) @1, null
br i1 %cmp, label %endif, label %if
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
| ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 | ||
| ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] | ||
| ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Codegen is terrible here, but that's not your fault. Should be:
| ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 | |
| ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] | |
| ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 | |
| ; CHECK-NEXT: s_cselect_b32 s0, 1, 0 |
Signed-off-by: John Lu <John.Lu@amd.com>
| %diff = sub i32 %arg, %arg2 | ||
| %res = call i32 @llvm.abs.i32(i32 %diff, i1 false) | ||
| ret i32 %res | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Negative test for the multi-use case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added negative test for multi-use case.
llvm/test/CodeGen/AMDGPU/absdiff.ll
Outdated
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
| ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s | ||
|
|
||
| define amdgpu_ps i32 @absdiff_v1(i32 inreg %arg, i32 inreg %arg2) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test vectors
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tested <2 x i32> and <4 x i32>
llvm/test/CodeGen/AMDGPU/absdiff.ll
Outdated
| ; CHECK-NEXT: s_absdiff_i32 s0, s0, s1 | ||
| ; CHECK-NEXT: ; return to shader part epilog | ||
| %diff = sub i32 %arg, %arg2 | ||
| %res = call i32 @llvm.abs.i32(i32 %diff, i1 false) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also test with true?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added test with true variant.
| @@ -0,0 +1,12 @@ | |||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | |||
| ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s | |||
|
|
|||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also test 16-bit promoted case
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added 16-bit test case.
Signed-off-by: John Lu <John.Lu@amd.com>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Generate s_absdiff_i32. Tested in absdiff.ll. Also update s_cmp_0.ll to test that s_absdiff_i32 is foldable with a s_cmp_lg_u32 sX, 0. --------- Signed-off-by: John Lu <John.Lu@amd.com>
Generate s_absdiff_i32. Tested in absdiff.ll. Also update s_cmp_0.ll to test that s_absdiff_i32 is foldable with a s_cmp_lg_u32 sX, 0. --------- Signed-off-by: John Lu <John.Lu@amd.com>
Generate s_absdiff_i32. Tested in absdiff.ll. Also update s_cmp_0.ll to test that s_absdiff_i32 is foldable with a s_cmp_lg_u32 sX, 0. --------- Signed-off-by: John Lu <John.Lu@amd.com>
Generate s_absdiff_i32. Tested in absdiff.ll. Also update s_cmp_0.ll to test that s_absdiff_i32 is foldable with a s_cmp_lg_u32 sX, 0.