DAG: Fix incorrect folding of fmul -1 to fneg

fmul is a canonicalizing operation and fneg is not, so folding fmul by -1.0 to fneg would break denormals that need flushing and would also fail to quiet signaling NaNs. Fold to fsub instead, which is also canonicalizing.
llvm · Sep 15, 2021 · 54d755a · 54d755a
1 parent 299b5d4
commit 54d755a
Show file tree

Hide file tree

Showing 7 changed files with 77 additions and 17 deletions.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14004,10 +14004,13 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
   if (N1CFP && N1CFP->isExactlyValue(+2.0))
     return DAG.getNode(ISD::FADD, DL, VT, N0, N0);
 
-  // fold (fmul X, -1.0) -> (fneg X)
-  if (N1CFP && N1CFP->isExactlyValue(-1.0))
-    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
-      return DAG.getNode(ISD::FNEG, DL, VT, N0);
+  // fold (fmul X, -1.0) -> (fsub -0.0, X)
+  if (N1CFP && N1CFP->isExactlyValue(-1.0)) {
+    if (!LegalOperations || TLI.isOperationLegal(ISD::FSUB, VT)) {
+      return DAG.getNode(ISD::FSUB, DL, VT,
+                         DAG.getConstantFP(-0.0, DL, VT), N0, Flags);
+    }
+  }
 
   // -N0 * -N1 --> N0 * N1
   TargetLowering::NegatibleCost CostN0 =

diff --git a/llvm/test/CodeGen/AArch64/arm64-fmadd.ll b/llvm/test/CodeGen/AArch64/arm64-fmadd.ll
@@ -82,7 +82,7 @@ define double @fms64(double %a, double %b, double %c) nounwind readnone ssp {
 ; CHECK-NEXT:    fmsub d0, d0, d1, d2
 ; CHECK-NEXT:    ret
 entry:
-  %mul = fmul double %b, -1.000000e+00
+  %mul = fneg double %b
   %0 = tail call double @llvm.fma.f64(double %a, double %mul, double %c)
   ret double %0
 }
@@ -93,7 +93,7 @@ define double @fms64_com(double %a, double %b, double %c) nounwind readnone ssp
 ; CHECK-NEXT:    fmsub d0, d1, d0, d2
 ; CHECK-NEXT:    ret
 entry:
-  %mul = fmul double %b, -1.000000e+00
+  %mul = fneg double %b
   %0 = tail call double @llvm.fma.f64(double %mul, double %a, double %c)
   ret double %0
 }
@@ -104,7 +104,7 @@ define double @fnms64(double %a, double %b, double %c) nounwind readnone ssp {
 ; CHECK-NEXT:    fnmsub d0, d0, d1, d2
 ; CHECK-NEXT:    ret
 entry:
-  %mul = fmul double %c, -1.000000e+00
+  %mul = fneg double %c
   %0 = tail call double @llvm.fma.f64(double %a, double %b, double %mul)
   ret double %0
 }

diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_3op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_3op.ll
@@ -22,7 +22,7 @@ define half @fms16(half %a, half %b, half %c) nounwind readnone ssp {
 ; CHECK-LABEL: fms16:
 ; CHECK: fmsub h0, h0, h1, h2
 entry:
-  %mul = fmul half %b, -1.000000e+00
+  %mul = fneg half %b
   %0 = tail call half @llvm.fma.f16(half %a, half %mul, half %c)
   ret half %0
 }
@@ -32,7 +32,7 @@ define half @fms16_com(half %a, half %b, half %c) nounwind readnone ssp {
 ; CHECK:       fmsub h0, h1, h0, h2
 ; CHECK-NEXT:  ret
 entry:
-  %mul = fmul half %b, -1.000000e+00
+  %mul = fneg half %b
   %0 = tail call half @llvm.fma.f16(half %mul, half %a, half %c)
   ret half %0
 }
@@ -42,7 +42,7 @@ define half @fnms16(half %a, half %b, half %c) nounwind readnone ssp {
 ; CHECK:       fnmsub h0, h0, h1, h2
 ; CHECK-NEXT:  ret
 entry:
-  %mul = fmul half %c, -1.000000e+00
+  %mul = fneg half %c
   %0 = tail call half @llvm.fma.f16(half %a, half %b, half %mul)
   ret half %0
 }

diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -2597,6 +2597,51 @@ bb:
   ret <2 x float> %i6
 }
 
+; This test expects denormal flushing, so we can't turn this fmul into an fneg.
+; TODO: Keeping this as an fmul (rather than an fsub) would save encoding size.
+; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg:
+; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0
+; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
+define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
+  %mul = fmul float %x, -1.0
+  %add = fmul nnan float %mul, %y
+  ret float %add
+}
+
+; It's legal to turn this fmul into an fneg since denormals are
+; preserved and the nnan flag on the fmul means an snan can't occur.
+; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
+; GCN: v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT: s_setpc_b64
+define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
+  %mul = fmul nnan float %x, -1.0
+  %add = fmul float %mul, %y
+  ret float %add
+}
+
+; We know the source can't be an snan since it is the result of another fmul.
+; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
+; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
+; GCN: v_mul_f32_e32 v0, [[TMP]], v1
+; GCN-NEXT: s_setpc_b64
+define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
+  %canonical = fmul float %x, %x
+  %mul = fmul float %canonical, -1.0
+  %add = fmul float %mul, %y
+  ret float %add
+}
+
+; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
+; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0
+; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]]
+; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1
+define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
+  %quiet = call float @llvm.canonicalize.f32(float %x)
+  %mul = fmul float %quiet, -1.0
+  %add = fmul float %mul, %y
+  ret float %add
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @llvm.fma.f32(float, float, float) #1
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)

diff --git a/llvm/test/CodeGen/ARM/fnegs.ll b/llvm/test/CodeGen/ARM/fnegs.ll
@@ -49,7 +49,7 @@ entry:
 define float @test2(float* %a) {
 entry:
 	%0 = load float, float* %a, align 4		; <float> [#uses=2]
-	%1 = fmul float -1.000000e+00, %0		; <float> [#uses=2]
+	%1 = fneg float %0                  ; <float> [#uses=2]
 	%2 = fpext float %1 to double		; <double> [#uses=1]
 	%3 = fcmp olt double %2, 1.234000e+00		; <i1> [#uses=1]
 	%retval = select i1 %3, float %1, float %0		; <float> [#uses=1]

diff --git a/llvm/test/CodeGen/Hexagon/opt-fneg.ll b/llvm/test/CodeGen/Hexagon/opt-fneg.ll
@@ -3,6 +3,7 @@
 
 define float @foo(float %x) nounwind {
 entry:
+; CHECK-LABEL: foo:
 ; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#31)
   %x.addr = alloca float, align 4
   store float %x, float* %x.addr, align 4
@@ -13,14 +14,25 @@ entry:
 
 define float @bar(float %x) nounwind {
 entry:
+; CHECK-LABEL: bar:
 ; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#31)
   %sub = fsub float -0.000000e+00, %x
   ret float %sub
 }
 
-define float @baz(float %x) nounwind {
+define float @baz0(float %x) nounwind {
 entry:
+; CHECK-LABEL: baz0:
 ; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#31)
-  %conv1 = fmul float %x, -1.000000e+00
+  %conv1 = fmul nnan float %x, -1.000000e+00
+  ret float %conv1
+}
+
+define float @baz1(float %x) nounwind {
+entry:
+  %not.nan = fadd nnan float %x, %x
+; CHECK-LABEL: baz1:
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#31)
+  %conv1 = fmul float %not.nan, -1.000000e+00
   ret float %conv1
 }
diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
@@ -13,10 +13,10 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) {
 ; CHECK-NEXT:    xvredp 2, 0
 ; CHECK-NEXT:    xxswapd 1, 1
 ; CHECK-NEXT:    xxlor 3, 1, 1
-; CHECK-NEXT:    xvnmsubadp 3, 0, 2
-; CHECK-NEXT:    xvmaddadp 2, 2, 3
-; CHECK-NEXT:    xvnmsubadp 1, 0, 2
-; CHECK-NEXT:    xvnmaddadp 2, 2, 1
+; CHECK-NEXT:    xvmaddadp 3, 0, 2
+; CHECK-NEXT:    xvnmsubadp 2, 2, 3
+; CHECK-NEXT:    xvmaddadp 1, 0, 2
+; CHECK-NEXT:    xvmsubadp 2, 2, 1
 ; CHECK-NEXT:    xvmuldp 34, 34, 2
 ; CHECK-NEXT:    xvmuldp 35, 35, 2
 ; CHECK-NEXT:    blr