Skip to content

Commit

Permalink
[PowerPC] Use the two-constant NR algorithm for refining estimates
Browse files Browse the repository at this point in the history
The single-constant algorithm produces infinities for many denormal inputs.
The precision of the two-constant algorithm is sufficient across the
range of denormals, so we switch to that algorithm for now to avoid the
infinities. In the future, we will re-evaluate the algorithms to
find the optimal one for PowerPC.

Differential revision: https://reviews.llvm.org/D60037

llvm-svn: 360144
  • Loading branch information
nemanjai authored and MrSidims committed May 24, 2019
1 parent 685c203 commit 4cb7642
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 49 deletions.
5 changes: 4 additions & 1 deletion llvm/lib/Target/PowerPC/PPC.td
Expand Up @@ -135,6 +135,9 @@ def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
"Enable VSX instructions",
[FeatureAltivec]>;
def FeatureTwoConstNR :
SubtargetFeature<"two-const-nr", "NeedsTwoConstNR", "true",
"Requires two constant Newton-Raphson computation">;
def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true",
"Enable POWER8 Altivec instructions",
[FeatureAltivec]>;
Expand Down Expand Up @@ -227,7 +230,7 @@ def ProcessorFeatures {
FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX,
Feature64Bit /*, Feature64BitRegs */,
FeatureBPERMD, FeatureExtDiv,
FeatureMFTB, DeprecatedDST];
FeatureMFTB, DeprecatedDST, FeatureTwoConstNR];
list<SubtargetFeature> Power8SpecificFeatures =
[DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto,
FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic,
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Expand Up @@ -11145,7 +11145,9 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);

UseOneConstNR = true;
// The single-constant Newton-Raphson refinement is not accurate enough on
// some CPUs (it can produce infinities for denormal inputs), so use the
// two-constant form whenever the subtarget requires it.
UseOneConstNR = !Subtarget.needsTwoConstNR();
return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
return SDValue();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/PowerPC/PPCSubtarget.cpp
Expand Up @@ -67,6 +67,7 @@ void PPCSubtarget::initializeEnvironment() {
HasFPU = false;
HasQPX = false;
HasVSX = false;
NeedsTwoConstNR = false;
HasP8Vector = false;
HasP8Altivec = false;
HasP8Crypto = false;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/PowerPC/PPCSubtarget.h
Expand Up @@ -98,6 +98,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
bool HasSPE;
bool HasQPX;
bool HasVSX;
bool NeedsTwoConstNR;
bool HasP8Vector;
bool HasP8Altivec;
bool HasP8Crypto;
Expand Down Expand Up @@ -246,6 +247,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
bool hasFPU() const { return HasFPU; }
bool hasQPX() const { return HasQPX; }
bool hasVSX() const { return HasVSX; }
bool needsTwoConstNR() const { return NeedsTwoConstNR; }
bool hasP8Vector() const { return HasP8Vector; }
bool hasP8Altivec() const { return HasP8Altivec; }
bool hasP8Crypto() const { return HasP8Crypto; }
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/PowerPC/fma-mutate.ll
Expand Up @@ -14,8 +14,7 @@ define double @foo3(double %a) nounwind {
ret double %r

; CHECK: @foo3
; CHECK: fmr [[REG:[0-9]+]], [[REG2:[0-9]+]]
; CHECK: xsnmsubadp [[REG]], {{[0-9]+}}, [[REG2]]
; CHECK-NOT: fmr
; CHECK: xsmaddmdp
; CHECK: xsmaddadp
}
Expand Down
69 changes: 33 additions & 36 deletions llvm/test/CodeGen/PowerPC/fmf-propagation.ll
Expand Up @@ -284,16 +284,16 @@ define float @sqrt_afn(float %x) {
; FMF-NEXT: fcmpu 0, 1, 0
; FMF-NEXT: beq 0, .LBB10_2
; FMF-NEXT: # %bb.1:
; FMF-NEXT: xsrsqrtesp 0, 1
; FMF-NEXT: addis 3, 2, .LCPI10_0@toc@ha
; FMF-NEXT: xsrsqrtesp 3, 1
; FMF-NEXT: lfs 0, .LCPI10_0@toc@l(3)
; FMF-NEXT: xsmulsp 2, 1, 0
; FMF-NEXT: xsmulsp 4, 3, 3
; FMF-NEXT: xssubsp 2, 2, 1
; FMF-NEXT: xsmulsp 2, 2, 4
; FMF-NEXT: xssubsp 0, 0, 2
; FMF-NEXT: xsmulsp 0, 3, 0
; FMF-NEXT: xsmulsp 0, 0, 1
; FMF-NEXT: addis 4, 2, .LCPI10_1@toc@ha
; FMF-NEXT: lfs 2, .LCPI10_0@toc@l(3)
; FMF-NEXT: lfs 3, .LCPI10_1@toc@l(4)
; FMF-NEXT: xsmulsp 1, 1, 0
; FMF-NEXT: xsmulsp 0, 1, 0
; FMF-NEXT: xsmulsp 1, 1, 2
; FMF-NEXT: xsaddsp 0, 0, 3
; FMF-NEXT: xsmulsp 0, 1, 0
; FMF-NEXT: .LBB10_2:
; FMF-NEXT: fmr 1, 0
; FMF-NEXT: blr
Expand All @@ -304,16 +304,15 @@ define float @sqrt_afn(float %x) {
; GLOBAL-NEXT: fcmpu 0, 1, 0
; GLOBAL-NEXT: beq 0, .LBB10_2
; GLOBAL-NEXT: # %bb.1:
; GLOBAL-NEXT: xsrsqrtesp 2, 1
; GLOBAL-NEXT: fneg 0, 1
; GLOBAL-NEXT: xsrsqrtesp 0, 1
; GLOBAL-NEXT: addis 3, 2, .LCPI10_0@toc@ha
; GLOBAL-NEXT: fmr 4, 1
; GLOBAL-NEXT: lfs 3, .LCPI10_0@toc@l(3)
; GLOBAL-NEXT: xsmaddasp 4, 0, 3
; GLOBAL-NEXT: xsmulsp 0, 2, 2
; GLOBAL-NEXT: xsmaddasp 3, 4, 0
; GLOBAL-NEXT: xsmulsp 0, 2, 3
; GLOBAL-NEXT: xsmulsp 0, 0, 1
; GLOBAL-NEXT: addis 4, 2, .LCPI10_1@toc@ha
; GLOBAL-NEXT: lfs 2, .LCPI10_0@toc@l(3)
; GLOBAL-NEXT: lfs 3, .LCPI10_1@toc@l(4)
; GLOBAL-NEXT: xsmulsp 1, 1, 0
; GLOBAL-NEXT: xsmaddasp 2, 1, 0
; GLOBAL-NEXT: xsmulsp 0, 1, 3
; GLOBAL-NEXT: xsmulsp 0, 0, 2
; GLOBAL-NEXT: .LBB10_2:
; GLOBAL-NEXT: fmr 1, 0
; GLOBAL-NEXT: blr
Expand All @@ -338,16 +337,15 @@ define float @sqrt_fast(float %x) {
; FMF-NEXT: fcmpu 0, 1, 0
; FMF-NEXT: beq 0, .LBB11_2
; FMF-NEXT: # %bb.1:
; FMF-NEXT: xsrsqrtesp 2, 1
; FMF-NEXT: fneg 0, 1
; FMF-NEXT: xsrsqrtesp 0, 1
; FMF-NEXT: addis 3, 2, .LCPI11_0@toc@ha
; FMF-NEXT: fmr 4, 1
; FMF-NEXT: lfs 3, .LCPI11_0@toc@l(3)
; FMF-NEXT: xsmaddasp 4, 0, 3
; FMF-NEXT: xsmulsp 0, 2, 2
; FMF-NEXT: xsmaddasp 3, 4, 0
; FMF-NEXT: xsmulsp 0, 2, 3
; FMF-NEXT: xsmulsp 0, 0, 1
; FMF-NEXT: addis 4, 2, .LCPI11_1@toc@ha
; FMF-NEXT: lfs 2, .LCPI11_0@toc@l(3)
; FMF-NEXT: lfs 3, .LCPI11_1@toc@l(4)
; FMF-NEXT: xsmulsp 1, 1, 0
; FMF-NEXT: xsmaddasp 2, 1, 0
; FMF-NEXT: xsmulsp 0, 1, 3
; FMF-NEXT: xsmulsp 0, 0, 2
; FMF-NEXT: .LBB11_2:
; FMF-NEXT: fmr 1, 0
; FMF-NEXT: blr
Expand All @@ -358,16 +356,15 @@ define float @sqrt_fast(float %x) {
; GLOBAL-NEXT: fcmpu 0, 1, 0
; GLOBAL-NEXT: beq 0, .LBB11_2
; GLOBAL-NEXT: # %bb.1:
; GLOBAL-NEXT: xsrsqrtesp 2, 1
; GLOBAL-NEXT: fneg 0, 1
; GLOBAL-NEXT: xsrsqrtesp 0, 1
; GLOBAL-NEXT: addis 3, 2, .LCPI11_0@toc@ha
; GLOBAL-NEXT: fmr 4, 1
; GLOBAL-NEXT: lfs 3, .LCPI11_0@toc@l(3)
; GLOBAL-NEXT: xsmaddasp 4, 0, 3
; GLOBAL-NEXT: xsmulsp 0, 2, 2
; GLOBAL-NEXT: xsmaddasp 3, 4, 0
; GLOBAL-NEXT: xsmulsp 0, 2, 3
; GLOBAL-NEXT: xsmulsp 0, 0, 1
; GLOBAL-NEXT: addis 4, 2, .LCPI11_1@toc@ha
; GLOBAL-NEXT: lfs 2, .LCPI11_0@toc@l(3)
; GLOBAL-NEXT: lfs 3, .LCPI11_1@toc@l(4)
; GLOBAL-NEXT: xsmulsp 1, 1, 0
; GLOBAL-NEXT: xsmaddasp 2, 1, 0
; GLOBAL-NEXT: xsmulsp 0, 1, 3
; GLOBAL-NEXT: xsmulsp 0, 0, 2
; GLOBAL-NEXT: .LBB11_2:
; GLOBAL-NEXT: fmr 1, 0
; GLOBAL-NEXT: blr
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/PowerPC/recipest.ll
Expand Up @@ -14,15 +14,16 @@ define double @foo(double %a, double %b) nounwind {
ret double %r

; CHECK: @foo
; CHECK-DAG: frsqrte
; CHECK-DAG: fnmsub
; CHECK: frsqrte
; CHECK: fmul
; CHECK-NEXT: fmadd
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK-NEXT: fmadd
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK: blr

; CHECK-SAFE: @foo
Expand Down Expand Up @@ -53,10 +54,10 @@ define double @foof(double %a, float %b) nounwind {

; CHECK: @foof
; CHECK-DAG: frsqrtes
; CHECK-DAG: fnmsubs
; CHECK: fmuls
; CHECK-NEXT: fmadds
; CHECK-NEXT: fmuls
; CHECK-NEXT: fmuls
; CHECK-NEXT: fmul
; CHECK-NEXT: blr

Expand All @@ -74,13 +75,14 @@ define float @food(float %a, double %b) nounwind {

; CHECK: @foo
; CHECK-DAG: frsqrte
; CHECK-DAG: fnmsub
; CHECK: fmul
; CHECK-NEXT: fmadd
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK-NEXT: fmadd
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK-NEXT: frsp
; CHECK-NEXT: fmuls
; CHECK-NEXT: blr
Expand All @@ -98,11 +100,11 @@ define float @goo(float %a, float %b) nounwind {

; CHECK: @goo
; CHECK-DAG: frsqrtes
; CHECK-DAG: fnmsubs
; CHECK: fmuls
; CHECK-NEXT: fmadds
; CHECK-NEXT: fmuls
; CHECK-NEXT: fmuls
; CHECK-NEXT: fmuls
; CHECK-NEXT: blr

; CHECK-SAFE: @goo
Expand Down Expand Up @@ -138,7 +140,6 @@ define float @rsqrt_fmul(float %a, float %b, float %c) {
; CHECK-DAG: fres
; CHECK-DAG: fnmsubs
; CHECK-DAG: fmuls
; CHECK-DAG: fnmsubs
; CHECK-DAG: fmadds
; CHECK-DAG: fmadds
; CHECK: fmuls
Expand Down Expand Up @@ -219,11 +220,11 @@ define double @foo3(double %a) nounwind {
; CHECK: @foo3
; CHECK: fcmpu
; CHECK-DAG: frsqrte
; CHECK-DAG: fnmsub
; CHECK: fmul
; CHECK-NEXT: fmadd
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
; CHECK-NEXT: fmadd
; CHECK-NEXT: fmul
; CHECK-NEXT: fmul
Expand All @@ -241,7 +242,6 @@ define float @goo3(float %a) nounwind {
; CHECK: @goo3
; CHECK: fcmpu
; CHECK-DAG: frsqrtes
; CHECK-DAG: fnmsubs
; CHECK: fmuls
; CHECK-NEXT: fmadds
; CHECK-NEXT: fmuls
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
Expand Up @@ -8,7 +8,7 @@ entry:
br i1 undef, label %for.body.lr.ph, label %for.end

; CHECK-LABEL: @LSH_recall_init
; CHECK: xsnmsubadp
; CHECK: xsmaddadp

for.body.lr.ph: ; preds = %entry
%conv3 = fpext float %W to double
Expand Down

0 comments on commit 4cb7642

Please sign in to comment.