Skip to content

Conversation

@arsenm
Copy link
Contributor

@arsenm arsenm commented Dec 16, 2025

No description provided.

Copy link
Contributor Author

arsenm commented Dec 16, 2025

@llvmbot
Copy link
Member

llvmbot commented Dec 16, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/172552.diff

1 Files Affected:

  • (added) llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll (+319)
diff --git a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll
new file mode 100644
index 0000000000000..038252e4cb1e4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; The fast fdiv lowering for 2.5 ulp fdiv with DAZ emits a fabs on the
+; RHS. That fabs is redundant and should be dropped when the sign bit
+; of the RHS is already known to be 0 (or the RHS is a NaN).
+
+define float @fdiv_fast_daz_rhs_signbit_known_zero_fabs(float %x, float %y) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_fabs:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v1, |v1|, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %fabs.y = call float @llvm.fabs.f32(float %y) ; divisor: sign bit cleared by an explicit fabs
+  %div = fdiv float %x, %fabs.y, !fpmath !0 ; !0 relaxes the required accuracy to 2.5 ulp
+  ret float %div
+}
+
+define float @fdiv_fast_daz_rhs_signbit_known_negative(float %x, float %y) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_negative:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v1, -|v1|, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %fabs.y = call float @llvm.fabs.f32(float %y)
+  %neg.fabs.y = fneg float %fabs.y ; fneg of fabs: the divisor's sign bit is always set
+  %div = fdiv float %x, %neg.fabs.y, !fpmath !0
+  ret float %div
+}
+
+define float @fdiv_fast_daz_rhs_signbit_known_zero_uitofp(float %x, i32 %y.i) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_uitofp:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.f = uitofp i32 %y.i to float ; unsigned conversion: the divisor is never negative
+  %div = fdiv float %x, %y.f, !fpmath !0
+  ret float %div
+}
+
+define float @fdiv_fast_daz_rhs_signbit_known_zero_maxnum_fabs(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_maxnum_fabs:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e64 v2, |v2|, |v2|
+; CHECK-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; CHECK-NEXT:    v_max_f32_e32 v1, v1, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.abs = call float @llvm.fabs.f32(float %y)
+  %z.abs = call float @llvm.fabs.f32(float %z)
+  %max.abs = call float @llvm.maxnum.f32(float %y.abs, float %z.abs) ; both maxnum operands are fabs results
+  %div = fdiv float %x, %max.abs, !fpmath !0
+  ret float %div
+}
+
+define float @fdiv_fast_daz_rhs_signbit_known_zero_minnum_fabs(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_minnum_fabs:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e64 v2, |v2|, |v2|
+; CHECK-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; CHECK-NEXT:    v_min_f32_e32 v1, v1, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.abs = call float @llvm.fabs.f32(float %y)
+  %z.abs = call float @llvm.fabs.f32(float %z)
+  %min.abs = call float @llvm.minnum.f32(float %y.abs, float %z.abs) ; both minnum operands are fabs results
+  %div = fdiv float %x, %min.abs, !fpmath !0
+  ret float %div
+}
+
+define float @fdiv_fast_daz_rhs_signbit_known_zero_maximum_fabs(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_maximum_fabs:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e64 v3, |v1|, |v2|
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; CHECK-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v2|
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.abs = call float @llvm.fabs.f32(float %y)
+  %z.abs = call float @llvm.fabs.f32(float %z)
+  %max.abs = call float @llvm.maximum.f32(float %y.abs, float %z.abs) ; lowered as max plus an ordered-compare select of 0x7fc00000
+  %div = fdiv float %x, %max.abs, !fpmath !0
+  ret float %div
+}
+
+define float @fdiv_fast_daz_rhs_signbit_known_zero_minimum_fabs(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_minimum_fabs:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_min_f32_e64 v3, |v1|, |v2|
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; CHECK-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v2|
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.abs = call float @llvm.fabs.f32(float %y)
+  %z.abs = call float @llvm.fabs.f32(float %z)
+  %min.abs = call float @llvm.minimum.f32(float %y.abs, float %z.abs) ; both minimum operands are fabs results
+  %div = fdiv float %x, %min.abs, !fpmath !0
+  ret float %div
+}
+
+define float @fdiv_fast_daz_rhs_signbit_known_zero_maximumnum_fabs(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_maximumnum_fabs:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e64 v2, |v2|, |v2|
+; CHECK-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; CHECK-NEXT:    v_max_f32_e32 v1, v1, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.abs = call float @llvm.fabs.f32(float %y)
+  %z.abs = call float @llvm.fabs.f32(float %z)
+  %max.abs = call float @llvm.maximumnum.f32(float %y.abs, float %z.abs) ; both maximumnum operands are fabs results
+  %div = fdiv float %x, %max.abs, !fpmath !0
+  ret float %div
+}
+
+define float @fdiv_fast_daz_rhs_signbit_known_zero_minimumnum_fabs(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_minimumnum_fabs:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e64 v2, |v2|, |v2|
+; CHECK-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; CHECK-NEXT:    v_min_f32_e32 v1, v1, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.abs = call float @llvm.fabs.f32(float %y)
+  %z.abs = call float @llvm.fabs.f32(float %z)
+  %min.abs = call float @llvm.minimumnum.f32(float %y.abs, float %z.abs) ; both minimumnum operands are fabs results
+  %div = fdiv float %x, %min.abs, !fpmath !0
+  ret float %div
+}
+
+; Negative test: %y is not wrapped in fabs, so the sign of the minimumnum result is unknown.
+define float @fdiv_fast_daz_rhs_signbit_maybe0_minimumnum_0(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_maybe0_minimumnum_0:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e64 v2, |v2|, |v2|
+; CHECK-NEXT:    v_max_f32_e32 v1, v1, v1
+; CHECK-NEXT:    v_min_f32_e32 v1, v1, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %z.abs = call float @llvm.fabs.f32(float %z)
+  %min.abs = call float @llvm.minimumnum.f32(float %y, float %z.abs) ; operand 0 is the raw %y
+  %div = fdiv float %x, %min.abs, !fpmath !0
+  ret float %div
+}
+
+; Negative test: %z is not wrapped in fabs, so the sign of the minimumnum result is unknown.
+define float @fdiv_fast_daz_rhs_signbit_maybe0_minimumnum_1(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_maybe0_minimumnum_1:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e32 v2, v2, v2
+; CHECK-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; CHECK-NEXT:    v_min_f32_e32 v1, v1, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.abs = call float @llvm.fabs.f32(float %y)
+  %min.abs = call float @llvm.minimumnum.f32(float %y.abs, float %z) ; operand 1 is the raw %z
+  %div = fdiv float %x, %min.abs, !fpmath !0
+  ret float %div
+}
+
+; maximumnum with one known-positive operand: the result is positive too, unless that operand is NaN.
+define float @fdiv_fast_daz_rhs_signbit_known0_maximumnum_one_fabs_0(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known0_maximumnum_one_fabs_0:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e32 v2, v2, v2
+; CHECK-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; CHECK-NEXT:    v_max_f32_e32 v1, v1, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %y.abs = call float @llvm.fabs.f32(float %y) ; fabs on operand 0 only
+  %max.abs = call float @llvm.maximumnum.f32(float %y.abs, float %z) ; operand 1 is the raw %z
+  %div = fdiv float %x, %max.abs, !fpmath !0
+  ret float %div
+}
+
+; maximumnum with one known-positive operand: the result is positive too, unless that operand is NaN.
+define float @fdiv_fast_daz_rhs_signbit_known0_maximumnum_one_fabs_1(float %x, float %y, float %z) #1 {
+; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known0_maximumnum_one_fabs_1:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_max_f32_e64 v2, |v2|, |v2|
+; CHECK-NEXT:    v_max_f32_e32 v1, v1, v1
+; CHECK-NEXT:    v_max_f32_e32 v1, v1, v2
+; CHECK-NEXT:    s_mov_b32 s4, 0x6f800000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CHECK-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %z.abs = call float @llvm.fabs.f32(float %z) ; fabs on operand 1 only
+  %max.abs = call float @llvm.maximumnum.f32(float %y, float %z.abs) ; operand 0 is the raw %y
+  %div = fdiv float %x, %max.abs, !fpmath !0
+  ret float %div
+}
+
+declare float @llvm.maxnum.f32(float, float) #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maximum.f32(float, float) #0
+declare float @llvm.minimum.f32(float, float) #0
+declare float @llvm.maximumnum.f32(float, float) #0
+declare float @llvm.minimumnum.f32(float, float) #0
+
+attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } ; the DAZ mode every test function above is compiled with
+
+!0 = !{float 2.500000e+00} ; !fpmath operand: the 2.5 ulp accuracy bound used by every fdiv above

@arsenm arsenm marked this pull request as ready for review December 16, 2025 20:58
@arsenm arsenm merged commit b971b51 into main Dec 16, 2025
15 checks passed
@arsenm arsenm deleted the users/arsenm/amdgpu/add-baseline-test-dag-fabs-fold branch December 16, 2025 22:26
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants