Skip to content

Commit

Permalink
[DAGCombiner] try repeated fdiv divisor transform before building estimate
Browse files Browse the repository at this point in the history

This was originally part of D61028, but it's an independent diff.

If we try the repeated-divisor reciprocal transform before producing an estimate sequence,
then we have an opportunity to use a scalar fdiv. On x86, the trade-off is 1 divss vs. 5
vector FP ops in the default estimate sequence. On recent chips (Skylake, Ryzen),
full-precision division has a throughput of only 3 cycles, so that is probably the better
default for performance, and it avoids problems caused by x86's inaccurate estimates.

The last 2 tests show that users still have the option to override the defaults by using
the function attributes for reciprocal estimates, but those patterns are potentially made
faster by converting the vector ops (including ymm ops) to scalar math.

Differential Revision: https://reviews.llvm.org/D61149

llvm-svn: 359398
  • Loading branch information
rotateright committed Apr 28, 2019
1 parent 43003f0 commit fb9a530
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 42 deletions.
6 changes: 3 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -11992,6 +11992,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;

if (SDValue V = combineRepeatedFPDivisors(N))
return V;

if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
Expand Down Expand Up @@ -12081,9 +12084,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
}
}

if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
return CombineRepeatedDivisors;

return SDValue();
}

Expand Down
66 changes: 27 additions & 39 deletions llvm/test/CodeGen/X86/fdiv-combine-vec.ll
Expand Up @@ -51,25 +51,17 @@ define <4 x double> @splat_fdiv_v4f64(<4 x double> %x, double %y) {
define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
; SSE-LABEL: splat_fdiv_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT: rcpps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: subps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: addps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: divss %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splat_fdiv_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vrcpps %xmm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vy = insertelement <4 x float> undef, float %y, i32 0
Expand All @@ -90,14 +82,10 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
;
; AVX-LABEL: splat_fdiv_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT: vrcpps %ymm1, %ymm2
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT: vsubps %ymm1, %ymm3, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vy = insertelement <8 x float> undef, float %y, i32 0
Expand All @@ -109,25 +97,25 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
; SSE-LABEL: splat_fdiv_v4f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT: rcpps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: subps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: addps %xmm2, %xmm3
; SSE-NEXT: rcpss %xmm1, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT: subss %xmm1, %xmm3
; SSE-NEXT: mulss %xmm2, %xmm3
; SSE-NEXT: addss %xmm2, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,0,0]
; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splat_fdiv_v4f32_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vrcpps %xmm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vy = insertelement <4 x float> undef, float %y, i32 0
Expand All @@ -152,14 +140,14 @@ define <8 x float> @splat_fdiv_v8f32_estimate(<8 x float> %x, float %y) #0 {
;
; AVX-LABEL: splat_fdiv_v8f32_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT: vrcpps %ymm1, %ymm2
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT: vsubps %ymm1, %ymm3, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vy = insertelement <8 x float> undef, float %y, i32 0
Expand Down

0 comments on commit fb9a530

Please sign in to comment.