diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 69491c6f2c565..4482df15242d9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2791,6 +2791,7 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth); } +// TODO: Should have argument to specify if sign bit of nan is ignorable. bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const { if (Depth >= MaxRecursionDepth) return false; // Limit search depth. @@ -2812,6 +2813,20 @@ bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const { case ISD::FEXP2: case ISD::FEXP10: return Op->getFlags().hasNoNaNs(); + case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMINIMUM: + case ISD::FMINIMUMNUM: + return SignBitIsZeroFP(Op.getOperand(1), Depth + 1) && + SignBitIsZeroFP(Op.getOperand(0), Depth + 1); + case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMAXIMUMNUM: + // TODO: If we can ignore the sign bit of nans, only one side being known 0 + // is sufficient. + return SignBitIsZeroFP(Op.getOperand(1), Depth + 1) && + SignBitIsZeroFP(Op.getOperand(0), Depth + 1); default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ff50fdfe9b09f..afdeed658b76e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12336,7 +12336,10 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); + // TODO: The combiner should probably handle elimination of redundant fabs. + SDValue r1 = DAG.SignBitIsZeroFP(RHS) + ? RHS + : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); const APFloat K0Val(0x1p+96f); const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); diff --git a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll index 038252e4cb1e4..750f390e79110 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll @@ -73,7 +73,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maxnum_fabs(float %x, float % ; CHECK-NEXT: v_max_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -97,7 +97,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minnum_fabs(float %x, float % ; CHECK-NEXT: v_min_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -122,7 +122,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maximum_fabs(float %x, float ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -147,7 +147,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minimum_fabs(float %x, float ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -171,7 +171,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maximumnum_fabs(float %x, flo ; CHECK-NEXT: v_max_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1 @@ -195,7 +195,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minimumnum_fabs(float %x, flo ; CHECK-NEXT: v_min_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; CHECK-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_rcp_f32_e32 v1, v1