Skip to content

Commit

Permalink
[X86] Insert FMUL for estimated non reciprocal SQRT when `RefinementSteps` = 0
Browse files Browse the repository at this point in the history

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D114843
  • Loading branch information
phoebewang committed Dec 2, 2021
1 parent fcd2d85 commit 4756a2f
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 13 deletions.
5 changes: 4 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -23228,7 +23228,10 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
UseOneConstNR = false;
// There is no FSQRT for 512-bits, but there is RSQRT14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
return DAG.getNode(Opcode, DL, VT, Op);
SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
if (RefinementSteps == 0 && !Reciprocal)
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
return Estimate;
}

if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
Expand Down
30 changes: 18 additions & 12 deletions llvm/test/CodeGen/X86/sqrt-fastmath.ll
Expand Up @@ -388,22 +388,25 @@ define float @f32_estimate2(float %x) #5 {
; SSE-LABEL: f32_estimate2:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm0, %xmm1
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: andnps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: f32_estimate2:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
; AVX1-NEXT: vandnps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: f32_estimate2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
Expand Down Expand Up @@ -481,6 +484,7 @@ define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 {
; SSE-LABEL: v4f32_estimate2:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm0, %xmm2
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT: cmpleps %xmm0, %xmm1
Expand All @@ -490,21 +494,23 @@ define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 {
;
; AVX1-LABEL: v4f32_estimate2:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vrsqrtps %xmm0, %xmm0
; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v4f32_estimate2:
; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT: vcmpleps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vrsqrtps %xmm0, %xmm0
; AVX512-NEXT: vandps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%sqrt = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
ret <4 x float> %sqrt
Expand Down

0 comments on commit 4756a2f

Please sign in to comment.