Skip to content

Conversation

@LU-JOHN
Copy link
Contributor

@LU-JOHN LU-JOHN commented Oct 23, 2025

Generate s_absdiff_i32 for a uniform i32 abs of a single-use sub. Tested in absdiff.ll. Also update s_cmp_0.ll to test that s_absdiff_i32 is foldable with an s_cmp_lg_u32 sX, 0.

Signed-off-by: John Lu <John.Lu@amd.com>
@llvmbot
Copy link
Member

llvmbot commented Oct 23, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: None (LU-JOHN)

Changes

Generate s_absdiff_i32 for a uniform i32 abs of a single-use sub. Tested in absdiff.ll. Also update s_cmp_0.ll to test that s_absdiff_i32 is foldable with an s_cmp_lg_u32 sX, 0.


Full diff: https://github.com/llvm/llvm-project/pull/164835.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+5-3)
  • (added) llvm/test/CodeGen/AMDGPU/absdiff.ll (+12)
  • (modified) llvm/test/CodeGen/AMDGPU/s_cmp_0.ll (+20-5)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..220318cb6bd40 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -838,9 +838,11 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo <
   let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
 }
 
-let Defs = [SCC] in {
-def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
-} // End Defs = [SCC]
+let isCommutable = 1, Defs = [SCC] in {
+def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32",
+  [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))]
+>;
+} // End isCommutable = 1, Defs = [SCC]
 
 let SubtargetPredicate = isGFX8GFX9 in {
   def S_RFE_RESTORE_B64 : SOP2_Pseudo <
diff --git a/llvm/test/CodeGen/AMDGPU/absdiff.ll b/llvm/test/CodeGen/AMDGPU/absdiff.ll
new file mode 100644
index 0000000000000..3ce58f55ff0b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/absdiff.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+
+define amdgpu_ps i32 @absdiff_v1(i32 inreg %arg, i32 inreg %arg2) {
+; CHECK-LABEL: absdiff_v1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_absdiff_i32 s0, s0, s1
+; CHECK-NEXT:    ; return to shader part epilog
+  %diff = sub i32 %arg, %arg2
+  %res = call i32 @llvm.abs.i32(i32 %diff, i1 false)
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index dd5f838b4a206..0166d7ac7ddc2 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -110,6 +110,21 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
   ret i32 %zext
 }
 
+define amdgpu_ps i32 @absdiff32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: absdiff32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_absdiff_i32 s0, s0, s1
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
+  %diff = sub i32 %val0, %val1
+  %result = call i32 @llvm.abs.i32(i32 %diff, i1 false)
+  %cmp = icmp ne i32 %result, 0
+  %zext = zext i1 %cmp to i32
+  ret i32 %zext
+}
+
 define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: and32:
 ; CHECK:       ; %bb.0:
@@ -608,14 +623,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
 ; CHECK-NEXT:    s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
 ; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB35_2
+; CHECK-NEXT:    s_cbranch_scc0 .LBB36_2
 ; CHECK-NEXT:  ; %bb.1: ; %endif
 ; CHECK-NEXT:    s_mov_b32 s0, 1
-; CHECK-NEXT:    s_branch .LBB35_3
-; CHECK-NEXT:  .LBB35_2: ; %if
+; CHECK-NEXT:    s_branch .LBB36_3
+; CHECK-NEXT:  .LBB36_2: ; %if
 ; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_branch .LBB35_3
-; CHECK-NEXT:  .LBB35_3:
+; CHECK-NEXT:    s_branch .LBB36_3
+; CHECK-NEXT:  .LBB36_3:
   %cmp = icmp ne ptr addrspace(4) @1, null
   br i1 %cmp, label %endif, label %if
 

Copy link
Contributor

@jayfoad jayfoad left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

Comment on lines +117 to +119
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Codegen is terrible here, but that's not your fault. Should be:

Suggested change
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_cselect_b32 s0, 1, 0

Signed-off-by: John Lu <John.Lu@amd.com>
%diff = sub i32 %arg, %arg2
%res = call i32 @llvm.abs.i32(i32 %diff, i1 false)
ret i32 %res
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a negative test for the multi-use case, i.e. where the sub result has more than one use (the pattern requires sub_oneuse)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added negative test for multi-use case.

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s

define amdgpu_ps i32 @absdiff_v1(i32 inreg %arg, i32 inreg %arg2) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also test vector types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested <2 x i32> and <4 x i32>

; CHECK-NEXT: s_absdiff_i32 s0, s0, s1
; CHECK-NEXT: ; return to shader part epilog
%diff = sub i32 %arg, %arg2
%res = call i32 @llvm.abs.i32(i32 %diff, i1 false)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test with the second argument of llvm.abs set to true (this call passes i1 false)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added test with true variant.

@@ -0,0 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test 16-bit promoted case

Copy link
Contributor Author

@LU-JOHN LU-JOHN Oct 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added 16-bit test case.

Signed-off-by: John Lu <John.Lu@amd.com>
@LU-JOHN LU-JOHN requested a review from arsenm October 24, 2025 22:11
Copy link
Contributor

@jayfoad jayfoad left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@LU-JOHN LU-JOHN merged commit 7d14733 into llvm:main Oct 27, 2025
10 checks passed
dvbuka pushed a commit to dvbuka/llvm-project that referenced this pull request Oct 27, 2025
Generate s_absdiff_i32. Tested in absdiff.ll. Also update s_cmp_0.ll to
test that s_absdiff_i32 is foldable with a s_cmp_lg_u32 sX, 0.

---------

Signed-off-by: John Lu <John.Lu@amd.com>
Lukacma pushed a commit to Lukacma/llvm-project that referenced this pull request Oct 29, 2025
Generate s_absdiff_i32. Tested in absdiff.ll. Also update s_cmp_0.ll to
test that s_absdiff_i32 is foldable with a s_cmp_lg_u32 sX, 0.

---------

Signed-off-by: John Lu <John.Lu@amd.com>
aokblast pushed a commit to aokblast/llvm-project that referenced this pull request Oct 30, 2025
Generate s_absdiff_i32. Tested in absdiff.ll. Also update s_cmp_0.ll to
test that s_absdiff_i32 is foldable with a s_cmp_lg_u32 sX, 0.

---------

Signed-off-by: John Lu <John.Lu@amd.com>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants