Skip to content

Commit

Permalink
[DAG Combiner] Fix the native computation of the Newton series for re…
Browse files Browse the repository at this point in the history
…ciprocals

The generic infrastructure to compute the Newton series for reciprocal and
reciprocal square root was conceived to allow a target to compute the series
itself.  However, the original code did not properly consider this condition
if returned by a target.  This patch addresses the issues to allow a target
to compute the series on its own.

Differential revision: https://reviews.llvm.org/D22975

llvm-svn: 286523
  • Loading branch information
Evandro Menezes committed Nov 10, 2016
1 parent 16d52a2 commit 21f9ce1
Show file tree
Hide file tree
Showing 10 changed files with 69 additions and 59 deletions.
13 changes: 8 additions & 5 deletions llvm/include/llvm/Target/TargetLowering.h
Expand Up @@ -2986,21 +2986,24 @@ class TargetLowering : public TargetLoweringBase {
/// Hooks for building estimates in place of slower divisions and square
/// roots.

/// Return a reciprocal square root estimate value for the input operand.
/// Return either a square root or its reciprocal estimate value for the input
/// operand.
/// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
/// 'Enabled' as set by a potential default override attribute.
/// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
/// refinement iterations required to generate a sufficient (though not
/// necessarily IEEE-754 compliant) estimate is returned in that parameter.
/// The boolean UseOneConstNR output is used to select a Newton-Raphson
/// algorithm implementation that uses one constant or two constants.
/// algorithm implementation that uses either one or two constants.
/// The boolean Reciprocal is used to select whether the estimate is for the
/// square root of the input operand or the reciprocal of its square root.
/// A target may choose to implement its own refinement within this function.
/// If that's true, then return '0' as the number of RefinementSteps to avoid
/// any further refinement of the estimate.
/// An empty SDValue return means no estimate sequence can be created.
virtual SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled, int &RefinementSteps,
bool &UseOneConstNR) const {
virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled, int &RefinementSteps,
bool &UseOneConstNR, bool Reciprocal) const {
return SDValue();
}

Expand Down
58 changes: 30 additions & 28 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -14928,6 +14928,12 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) {
return S;
}

/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal, we need to find the zero of the function:
/// F(X) = A X - 1 [which has a zero at X = 1/A]
/// =>
/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
/// does not require additional intermediate precision]
SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
if (Level >= AfterLegalizeDAG)
return SDValue();
Expand All @@ -14947,19 +14953,13 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
// refinement steps.
int Iterations = TLI.getDivRefinementSteps(VT, MF);
if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
AddToWorklist(Est.getNode());

if (Iterations) {
// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
// For the reciprocal, we need to find the zero of the function:
// F(X) = A X - 1 [which has a zero at X = 1/A]
// =>
// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
// does not require additional intermediate precision]
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);

AddToWorklist(Est.getNode());

// Newton iterations: Est = Est + Est (1 - Arg * Est)
for (int i = 0; i < Iterations; ++i) {
SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
Expand Down Expand Up @@ -15100,12 +15100,30 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags,

bool UseOneConstNR = false;
if (SDValue Est =
TLI.getRsqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR)) {
TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
Reciprocal)) {
AddToWorklist(Est.getNode());

if (Iterations) {
Est = UseOneConstNR
? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
: buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
: buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);

if (!Reciprocal) {
// Unfortunately, Est is now NaN if the input was exactly 0.0.
// Select out this case and force the answer to 0.0.
EVT VT = Op.getValueType();
SDLoc DL(Op);

SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
EVT CCVT = getSetCCResultType(VT);
SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
AddToWorklist(ZeroCmp.getNode());

Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
ZeroCmp, FPZero, Est);
AddToWorklist(Est.getNode());
}
}
return Est;
}
Expand All @@ -15118,23 +15136,7 @@ SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
}

SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
SDValue Est = buildSqrtEstimateImpl(Op, Flags, false);
if (!Est)
return SDValue();

// Unfortunately, Est is now NaN if the input was exactly 0.
// Select out this case and force the answer to 0.
EVT VT = Est.getValueType();
SDLoc DL(Op);
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
EVT CCVT = getSetCCResultType(VT);
SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, Zero, ISD::SETEQ);
AddToWorklist(ZeroCmp.getNode());

Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, ZeroCmp,
Zero, Est);
AddToWorklist(Est.getNode());
return Est;
return buildSqrtEstimateImpl(Op, Flags, false);
}

/// Return true if base is a frame index, which is known not to alias with
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Expand Up @@ -4644,10 +4644,11 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
return SDValue();
}

SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps,
bool &UseOneConst) const {
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps,
bool &UseOneConst,
bool Reciprocal) const {
if (Enabled == ReciprocalEstimate::Enabled ||
(Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
Expand Up @@ -534,8 +534,9 @@ class AArch64TargetLowering : public TargetLowering {

SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &ExtraSteps, bool &UseOneConst) const override;
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &ExtraSteps, bool &UseOneConst,
bool Reciprocal) const override;
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &ExtraSteps) const override;
unsigned combineRepeatedFPDivisors() const override;
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -2978,10 +2978,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
return nullptr;
}

SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR) const {
SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Operand.getValueType();

if (VT == MVT::f32) {
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Expand Up @@ -172,9 +172,9 @@ class AMDGPUTargetLowering : public TargetLowering {
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
return true;
}
SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR) const override;
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps) const override;

Expand Down
7 changes: 4 additions & 3 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Expand Up @@ -9637,9 +9637,10 @@ static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
return RefinementSteps;
}

SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled, int &RefinementSteps,
bool &UseOneConstNR) const {
SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled, int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Operand.getValueType();
if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
(VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/Target/PowerPC/PPCISelLowering.h
Expand Up @@ -968,9 +968,9 @@ namespace llvm {
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;

SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR) const override;
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps) const override;
unsigned combineRepeatedFPDivisors() const override;
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -15296,10 +15296,11 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {

/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR) const {
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Op.getValueType();

// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.h
Expand Up @@ -1268,9 +1268,9 @@ namespace llvm {
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;

/// Use rsqrt* to speed up sqrt calculations.
SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR) const override;
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;

/// Use rcp* to speed up fdiv calculations.
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
Expand Down

0 comments on commit 21f9ce1

Please sign in to comment.