Skip to content

Commit

Permalink
Use rsqrt (X86) to speed up reciprocal square root calcs
Browse files Browse the repository at this point in the history
This is a first step for generating SSE rsqrt instructions for
reciprocal square root calcs when fast-math is allowed.

For now, be conservative and only enable this for AMD btver2
where performance improves significantly - for example, 29%
on llvm/projects/test-suite/SingleSource/Benchmarks/BenchmarkGame/n-body.c
(if we convert the data type to single-precision float).

This patch adds a two-constant version of the Newton-Raphson
refinement algorithm to DAGCombiner that can be selected by any target
via a parameter returned by getRsqrtEstimate().

See PR20900 for more details:
http://llvm.org/bugs/show_bug.cgi?id=20900

Differential Revision: http://reviews.llvm.org/D5658

llvm-svn: 220570
  • Loading branch information
rotateright committed Oct 24, 2014
1 parent 5e3a421 commit 957efc2
Show file tree
Hide file tree
Showing 10 changed files with 185 additions and 46 deletions.
5 changes: 4 additions & 1 deletion llvm/include/llvm/Target/TargetLowering.h
Expand Up @@ -2652,13 +2652,16 @@ class TargetLowering : public TargetLoweringBase {
/// The RefinementSteps output is the number of Newton-Raphson refinement
/// iterations required to generate a sufficient (though not necessarily
/// IEEE-754 compliant) estimate for the value type.
/// The boolean UseOneConstNR output is used to select a Newton-Raphson
/// algorithm implementation that uses one constant or two constants.
/// A target may choose to implement its own refinement within this function.
/// If that's true, then return '0' as the number of RefinementSteps to avoid
/// any further refinement of the estimate.
/// An empty SDValue return means no estimate sequence can be created.
virtual SDValue getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const {
unsigned &RefinementSteps,
bool &UseOneConstNR) const {
return SDValue();
}

Expand Down
117 changes: 77 additions & 40 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -331,6 +331,8 @@ namespace {
SDValue BuildUDIV(SDNode *N);
SDValue BuildReciprocalEstimate(SDValue Op);
SDValue BuildRsqrtEstimate(SDValue Op);
SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations);
SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
Expand Down Expand Up @@ -7033,21 +7035,18 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) {
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
}
} else if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) {
AddToWorklist(RV.getNode());
RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
}
} else if (N1.getOpcode() == ISD::FP_ROUND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) {
AddToWorklist(RV.getNode());
RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
Expand All @@ -7068,7 +7067,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// We found a FSQRT, so try to make this fold:
// x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) {
AddToWorklist(RV.getNode());
RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp);
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
Expand Down Expand Up @@ -7116,7 +7114,6 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
if (DAG.getTarget().Options.UnsafeFPMath) {
// Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
AddToWorklist(RV.getNode());
EVT VT = RV.getValueType();
RV = DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), RV);
AddToWorklist(RV.getNode());
Expand Down Expand Up @@ -11985,49 +11982,89 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) {
return SDValue();
}

SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) {
if (Level >= AfterLegalizeDAG)
return SDValue();
/// Refine a reciprocal square root estimate with Newton-Raphson steps that
/// need only a single floating-point constant (1.5).
///
/// Newton's method for a function F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i).
/// The reciprocal square root of A is the zero of F(X) = 1/X^2 - A, which
/// gives the recurrence:
///   X_{i+1} = X_i * (1.5 - (A / 2) * X_i^2)
/// A/2 is the same for every step, so it is built once ahead of the loop.
///
/// Arg is the value whose reciprocal square root is wanted, Est is the
/// starting estimate, and Iterations is the number of refinement steps to
/// emit. The refined estimate is returned.
SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est,
                                          unsigned Iterations) {
  EVT VT = Arg.getValueType();
  SDLoc DL(Arg);
  SDValue OnePointFive = DAG.getConstantFP(1.5, VT);

  // Form 0.5 * Arg as (1.5 * Arg) - Arg so that 1.5 remains the only FP
  // constant used by the whole sequence.
  SDValue ScaledArg = DAG.getNode(ISD::FMUL, DL, VT, OnePointFive, Arg);
  AddToWorklist(ScaledArg.getNode());

  SDValue HalfOfArg = DAG.getNode(ISD::FSUB, DL, VT, ScaledArg, Arg);
  AddToWorklist(HalfOfArg.getNode());

  // One pass per requested step: Est = Est * (1.5 - HalfOfArg * Est * Est)
  for (unsigned StepsLeft = Iterations; StepsLeft != 0; --StepsLeft) {
    SDValue EstSquared = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
    AddToWorklist(EstSquared.getNode());

    SDValue HalfArgTimesSq =
        DAG.getNode(ISD::FMUL, DL, VT, HalfOfArg, EstSquared);
    AddToWorklist(HalfArgTimesSq.getNode());

    SDValue Correction =
        DAG.getNode(ISD::FSUB, DL, VT, OnePointFive, HalfArgTimesSq);
    AddToWorklist(Correction.getNode());

    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Correction);
    AddToWorklist(Est.getNode());
  }

  return Est;
}

// Expose the DAG combiner to the target combiner implementations.
TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
unsigned Iterations = 0;
if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations)) {
if (Iterations) {
// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
// For the reciprocal sqrt, we need to find the zero of the function:
// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
// =>
// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
// As a result, we precompute A/2 prior to the iteration loop.
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FPThreeHalves = DAG.getConstantFP(1.5, VT);
/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
/// =>
/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est,
unsigned Iterations) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue MinusThree = DAG.getConstantFP(-3.0, VT);
SDValue MinusHalf = DAG.getConstantFP(-0.5, VT);

AddToWorklist(Est.getNode());
// Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est)
for (unsigned i = 0; i < Iterations; ++i) {
SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf);
AddToWorklist(HalfEst.getNode());

// We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
// this entire sequence requires only one FP constant.
SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, FPThreeHalves, Op);
AddToWorklist(HalfArg.getNode());
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
AddToWorklist(Est.getNode());

HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Op);
AddToWorklist(HalfArg.getNode());
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg);
AddToWorklist(Est.getNode());

// Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
for (unsigned i = 0; i < Iterations; ++i) {
SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
AddToWorklist(NewEst.getNode());
Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree);
AddToWorklist(Est.getNode());

NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst);
AddToWorklist(NewEst.getNode());
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst);
AddToWorklist(Est.getNode());
}
return Est;
}

NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPThreeHalves, NewEst);
AddToWorklist(NewEst.getNode());
SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) {
if (Level >= AfterLegalizeDAG)
return SDValue();

Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst);
AddToWorklist(Est.getNode());
}
// Expose the DAG combiner to the target combiner implementations.
TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
unsigned Iterations = 0;
bool UseOneConstNR = false;
if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) {
AddToWorklist(Est.getNode());
if (Iterations) {
Est = UseOneConstNR ?
BuildRsqrtNROneConst(Op, Est, Iterations) :
BuildRsqrtNRTwoConst(Op, Est, Iterations);
}
return Est;
}
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Expand Up @@ -7466,7 +7466,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,

SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const {
unsigned &RefinementSteps,
bool &UseOneConstNR) const {
EVT VT = Operand.getValueType();
if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
(VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
Expand All @@ -7479,6 +7480,7 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
++RefinementSteps;
UseOneConstNR = true;
return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
return SDValue();
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/PowerPC/PPCISelLowering.h
Expand Up @@ -702,7 +702,8 @@ namespace llvm {
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;

SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const override;
unsigned &RefinementSteps,
bool &UseOneConstNR) const override;
SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const override;

Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/X86/X86.td
Expand Up @@ -182,6 +182,8 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
"Use RSQRT* to optimize square root calculations">;

//===----------------------------------------------------------------------===//
// X86 processors supported.
Expand Down Expand Up @@ -347,7 +349,8 @@ def : ProcessorModel<"btver2", BtVer2Model,
[FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
FeatureBMI, FeatureF16C, FeatureMOVBE,
FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD,
FeatureUseSqrtEst]>;

// Bulldozer
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
Expand Down
30 changes: 30 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -14367,6 +14367,36 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}

/// Produce an X86ISD::FRSQRT estimate node for Op when the subtarget opts in.
///
/// The minimum architected relative accuracy of the estimate is 2^-12, so a
/// single Newton-Raphson step is requested to reach a good float result
/// (24 bits of precision). The two-constant refinement form is selected by
/// setting UseOneConstNR to false. An empty SDValue is returned when no
/// estimate is generated.
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
                                            DAGCombinerInfo &DCI,
                                            unsigned &RefinementSteps,
                                            bool &UseOneConstNR) const {
  // FIXME: Instruction latency models should drive the cost comparison of
  // each candidate sequence, but doing that reliably is very hard: at least
  // Intel's Core* chips have timing that varies with the number of
  // significant digits in the divisor and/or sqrt operand.
  if (!Subtarget->useSqrtEst())
    return SDValue();

  EVT VT = Op.getValueType();

  // rsqrtss and rsqrtps are available starting with SSE1.
  if (!Subtarget->hasSSE1())
    return SDValue();

  // TODO: Add support for AVX (v8f32) and AVX512 (v16f32).
  // f64 is likely not profitable: before FMA, a double-precision rsqrt
  // estimate with refinement on x86 takes at least 16 instructions (convert
  // to single, rsqrtss, convert back to double, then 3 refinement steps of at
  // least 13 insts). An 'rsqrtsd' variant added to the ISA together with FMA
  // could make this a throughput win.
  if (VT != MVT::f32 && VT != MVT::v4f32)
    return SDValue();

  RefinementSteps = 1;
  UseOneConstNR = false;
  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
}

static bool isAllOnes(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
return C && C->isAllOnesValue();
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
Expand Up @@ -1017,6 +1017,11 @@ namespace llvm {

/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;

/// Use rsqrt* to speed up sqrt calculations.
SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
bool &UseOneConstNR) const override;
};

namespace X86 {
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/X86/X86Subtarget.cpp
Expand Up @@ -278,6 +278,7 @@ void X86Subtarget::initializeEnvironment() {
LEAUsesAG = false;
SlowLEA = false;
SlowIncDec = false;
UseSqrtEst = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/X86Subtarget.h
Expand Up @@ -192,6 +192,11 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// SlowIncDec - True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;

/// Use the RSQRT* instructions to optimize square root calculations.
/// For this to be profitable, the cost of FSQRT and FDIV must be
/// substantially higher than normal FP ops like FADD and FMUL.
bool UseSqrtEst;

/// Processor has AVX-512 PreFetch Instructions
bool HasPFI;

Expand Down Expand Up @@ -369,6 +374,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool useSqrtEst() const { return UseSqrtEst; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
Expand Down
55 changes: 53 additions & 2 deletions llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mcpu=core2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2

; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from
; #include <math.h>
Expand Down Expand Up @@ -52,9 +53,59 @@ entry:
ret x86_fp80 %call
}

; Function Attrs: nounwind readnone
declare x86_fp80 @__sqrtl_finite(x86_fp80) #1

; If the target's sqrtss and divss instructions are substantially
; slower than rsqrtss with a Newton-Raphson refinement, we should
; generate the estimate sequence.
define float @reciprocal_square_root(float %x) #0 {
%sqrt = tail call float @llvm.sqrt.f32(float %x)
%div = fdiv fast float 1.0, %sqrt
ret float %div

; CHECK-LABEL: reciprocal_square_root:
; CHECK: sqrtss
; CHECK-NEXT: movss
; CHECK-NEXT: divss
; CHECK-NEXT: retq
; BTVER2-LABEL: reciprocal_square_root:
; BTVER2: vrsqrtss
; BTVER2-NEXT: vmulss
; BTVER2-NEXT: vmulss
; BTVER2-NEXT: vmulss
; BTVER2-NEXT: vaddss
; BTVER2-NEXT: vmulss
; BTVER2-NEXT: retq
}

declare float @llvm.sqrt.f32(float) #1

; If the target's sqrtps and divps instructions are substantially
; slower than rsqrtps with a Newton-Raphson refinement, we should
; generate the estimate sequence.
define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
%sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
ret <4 x float> %div

; CHECK-LABEL: reciprocal_square_root_v4f32:
; CHECK: sqrtps
; CHECK-NEXT: movaps
; CHECK-NEXT: divps
; CHECK-NEXT: retq
; BTVER2-LABEL: reciprocal_square_root_v4f32:
; BTVER2: vrsqrtps
; BTVER2-NEXT: vmulps
; BTVER2-NEXT: vmulps
; BTVER2-NEXT: vmulps
; BTVER2-NEXT: vaddps
; BTVER2-NEXT: vmulps
; BTVER2-NEXT: retq
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1


attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }

0 comments on commit 957efc2

Please sign in to comment.