-
Notifications
You must be signed in to change notification settings - Fork 15.5k
AMDGPU: Avoid introducing unnecessary fabs in fast fdiv lowering #172553
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesIf the sign bit of the denominator is known 0, do not emit the fabs. I originally tried to do this as the general combine on fabs, but This defends against future code size regressions in the atan2 and Full diff: https://github.com/llvm/llvm-project/pull/172553.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 69491c6f2c565..4482df15242d9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2791,6 +2791,7 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
}
+// TODO: Should have argument to specify if sign bit of nan is ignorable.
bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
if (Depth >= MaxRecursionDepth)
return false; // Limit search depth.
@@ -2812,6 +2813,20 @@ bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
case ISD::FEXP2:
case ISD::FEXP10:
return Op->getFlags().hasNoNaNs();
+ case ISD::FMINNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMINIMUMNUM:
+ return SignBitIsZeroFP(Op.getOperand(1), Depth + 1) &&
+ SignBitIsZeroFP(Op.getOperand(0), Depth + 1);
+ case ISD::FMAXNUM:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMAXIMUMNUM:
+ // TODO: If we can ignore the sign bit of nans, only one side being known 0
+ // is sufficient.
+ return SignBitIsZeroFP(Op.getOperand(1), Depth + 1) &&
+ SignBitIsZeroFP(Op.getOperand(0), Depth + 1);
default:
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ff50fdfe9b09f..afdeed658b76e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12336,7 +12336,10 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
+ // TODO: The combiner should probably handle elimination of redundant fabs.
+ SDValue r1 = DAG.SignBitIsZeroFP(RHS)
+ ? RHS
+ : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
const APFloat K0Val(0x1p+96f);
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
diff --git a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll
index 038252e4cb1e4..750f390e79110 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll
@@ -73,7 +73,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maxnum_fabs(float %x, float %
; CHECK-NEXT: v_max_f32_e32 v1, v1, v2
; CHECK-NEXT: s_mov_b32 s4, 0x6f800000
; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
; CHECK-NEXT: v_rcp_f32_e32 v1, v1
@@ -97,7 +97,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minnum_fabs(float %x, float %
; CHECK-NEXT: v_min_f32_e32 v1, v1, v2
; CHECK-NEXT: s_mov_b32 s4, 0x6f800000
; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
; CHECK-NEXT: v_rcp_f32_e32 v1, v1
@@ -122,7 +122,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maximum_fabs(float %x, float
; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; CHECK-NEXT: s_mov_b32 s4, 0x6f800000
; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
; CHECK-NEXT: v_rcp_f32_e32 v1, v1
@@ -147,7 +147,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minimum_fabs(float %x, float
; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; CHECK-NEXT: s_mov_b32 s4, 0x6f800000
; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
; CHECK-NEXT: v_rcp_f32_e32 v1, v1
@@ -171,7 +171,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maximumnum_fabs(float %x, flo
; CHECK-NEXT: v_max_f32_e32 v1, v1, v2
; CHECK-NEXT: s_mov_b32 s4, 0x6f800000
; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
; CHECK-NEXT: v_rcp_f32_e32 v1, v1
@@ -195,7 +195,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minimumnum_fabs(float %x, flo
; CHECK-NEXT: v_min_f32_e32 v1, v1, v2
; CHECK-NEXT: s_mov_b32 s4, 0x6f800000
; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000
-; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
; CHECK-NEXT: v_rcp_f32_e32 v1, v1
|
The compiler knows how to select the right division path depending on the denormal mode (and based on the implied 2.5 ulp limit by the OpenCL deafults). This results in almost identical code. Currently the new result has a code size regression due to an unnecessary use of a droppable fabs modifier (which llvm#172553 avoids).
If the sign bit of the denominator is known 0, do not emit the fabs. Also, extend this to handle min/max with fabs inputs. I originally tried to do this as the general combine on fabs, but it proved to be too much trouble at this time. This is mostly complexity introduced by expanding the various min/maxes into canonicalizes, and then not being able to assume the sign bit of canonicalize (fabs x) without nnan. This defends against future code size regressions in the atan2 and atan2pi library functions.
ae61164 to
e8d0ee0
Compare

If the sign bit of the denominator is known 0, do not emit the fabs.
Also, extend this to handle min/max with fabs inputs.
I originally tried to do this as the general combine on fabs, but
it proved to be too much trouble at this time. This is mostly
complexity introduced by expanding the various min/maxes into
canonicalizes, and then not being able to assume the sign bit
of canonicalize (fabs x) without nnan.
This defends against future code size regressions in the atan2 and
atan2pi library functions.