[DAG] Restrict (fp_round (copysign X, Y)) -> (copysign (fp_round X), …

…Y) combine This transformation creates an copysign node whose argument types do not match. RISCV does not handle such a case which results in a crash today. Looking at the relevant code in DAG, it looks like the process of enabling the non-matching types case was never completed for vectors at all. The transformation which triggered the RISCV crash is a specialization of another transform (specifically due to one use for profitability) which isn't enabled by default. Given that, I chose to match the preconditions for that other transform. Other options here include: * Updating RISCV codegen to handle the mismatched argument type case for vectors. This is slightly tricky as I don't see an obvious profitable lowering for this case which doesn't involve simply adding back in the round/trunc. * Disabling the transform via a target hook. This patch does involve two changes for AArch64 codegen. These could be called regressions, but well, the code after actually looks better than the code before. Differential Revision: https://reviews.llvm.org/D148638
llvm · Apr 18, 2023 · c37c9f2 · c37c9f2
1 parent 790c9ac
commit c37c9f2
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 37 deletions.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16611,27 +16611,30 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
 
 /// copysign(x, fp_extend(y)) -> copysign(x, y)
 /// copysign(x, fp_round(y)) -> copysign(x, y)
-static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
-  SDValue N1 = N->getOperand(1);
-  if ((N1.getOpcode() == ISD::FP_EXTEND ||
-       N1.getOpcode() == ISD::FP_ROUND)) {
-    EVT N1VT = N1->getValueType(0);
-    EVT N1Op0VT = N1->getOperand(0).getValueType();
+/// Operands to the functions are the type of X and Y respectively.
+static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(EVT XTy, EVT YTy) {
+  // Always fold no-op FP casts.
+  if (XTy == YTy)
+    return true;
 
-    // Always fold no-op FP casts.
-    if (N1VT == N1Op0VT)
-      return true;
+  // Do not optimize out type conversion of f128 type yet.
+  // For some targets like x86_64, configuration is changed to keep one f128
+  // value in one SSE register, but instruction selection cannot handle
+  // FCOPYSIGN on SSE registers yet.
+  if (YTy == MVT::f128)
+    return false;
 
-    // Do not optimize out type conversion of f128 type yet.
-    // For some targets like x86_64, configuration is changed to keep one f128
-    // value in one SSE register, but instruction selection cannot handle
-    // FCOPYSIGN on SSE registers yet.
-    if (N1Op0VT == MVT::f128)
-      return false;
+  return !YTy.isVector() || EnableVectorFCopySignExtendRound;
+}
 
-    return !N1Op0VT.isVector() || EnableVectorFCopySignExtendRound;
-  }
-  return false;
+static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() != ISD::FP_EXTEND &&
+      N1.getOpcode() != ISD::FP_ROUND)
+    return false;
+  EVT N1VT = N1->getValueType(0);
+  EVT N1Op0VT = N1->getOperand(0).getValueType();
+  return CanCombineFCOPYSIGN_EXTEND_ROUND(N1VT, N1Op0VT);
 }
 
 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
@@ -16991,7 +16994,13 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
   }
 
   // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
-  if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse()) {
+  // Note: From a legality perspective, this is a two step transform.  First,
+  // we duplicate the fp_round to the arguments of the copysign, then we
+  // eliminate the fp_round on Y.  The second step requires an additional
+  // predicate to match the implementation above.
+  if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse() &&
+      CanCombineFCOPYSIGN_EXTEND_ROUND(VT,
+                                       N0.getValueType())) {
     SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
                               N0.getOperand(0), N1);
     AddToWorklist(Tmp.getNode());

diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll
@@ -259,30 +259,48 @@ define <vscale x 8 x half> @test_copysign_v8f16_v8f32(<vscale x 8 x half> %a, <v
 ;========== FCOPYSIGN_EXTEND_ROUND
 
 define <vscale x 4 x half> @test_copysign_nxv4f32_nxv4f16(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
-; CHECK-LABEL: test_copysign_nxv4f32_nxv4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
-; CHECK-NEXT:    fcvt z1.h, p0/m, z1.s
-; CHECK-NEXT:    and z1.h, z1.h, #0x8000
-; CHECK-NEXT:    and z0.h, z0.h, #0x7fff
-; CHECK-NEXT:    orr z0.d, z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv4f32_nxv4f16:
+; CHECK-NO-EXTEND-ROUND:       // %bb.0:
+; CHECK-NO-EXTEND-ROUND-NEXT:    and z1.s, z1.s, #0x80000000
+; CHECK-NO-EXTEND-ROUND-NEXT:    and z0.s, z0.s, #0x7fffffff
+; CHECK-NO-EXTEND-ROUND-NEXT:    ptrue p0.s
+; CHECK-NO-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NO-EXTEND-ROUND-NEXT:    fcvt z0.h, p0/m, z0.s
+; CHECK-NO-EXTEND-ROUND-NEXT:    ret
+;
+; CHECK-EXTEND-ROUND-LABEL: test_copysign_nxv4f32_nxv4f16:
+; CHECK-EXTEND-ROUND:       // %bb.0:
+; CHECK-EXTEND-ROUND-NEXT:    ptrue p0.s
+; CHECK-EXTEND-ROUND-NEXT:    fcvt z0.h, p0/m, z0.s
+; CHECK-EXTEND-ROUND-NEXT:    fcvt z1.h, p0/m, z1.s
+; CHECK-EXTEND-ROUND-NEXT:    and z1.h, z1.h, #0x8000
+; CHECK-EXTEND-ROUND-NEXT:    and z0.h, z0.h, #0x7fff
+; CHECK-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-EXTEND-ROUND-NEXT:    ret
   %t1 = call <vscale x 4 x float> @llvm.copysign.v4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
   %t2 = fptrunc <vscale x 4 x float> %t1 to <vscale x 4 x half>
   ret <vscale x 4 x half> %t2
 }
 
 define <vscale x 2 x float> @test_copysign_nxv2f64_nxv2f32(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
-; CHECK-LABEL: test_copysign_nxv2f64_nxv2f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
-; CHECK-NEXT:    fcvt z1.s, p0/m, z1.d
-; CHECK-NEXT:    and z1.s, z1.s, #0x80000000
-; CHECK-NEXT:    and z0.s, z0.s, #0x7fffffff
-; CHECK-NEXT:    orr z0.d, z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv2f64_nxv2f32:
+; CHECK-NO-EXTEND-ROUND:       // %bb.0:
+; CHECK-NO-EXTEND-ROUND-NEXT:    and z1.d, z1.d, #0x8000000000000000
+; CHECK-NO-EXTEND-ROUND-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
+; CHECK-NO-EXTEND-ROUND-NEXT:    ptrue p0.d
+; CHECK-NO-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NO-EXTEND-ROUND-NEXT:    fcvt z0.s, p0/m, z0.d
+; CHECK-NO-EXTEND-ROUND-NEXT:    ret
+;
+; CHECK-EXTEND-ROUND-LABEL: test_copysign_nxv2f64_nxv2f32:
+; CHECK-EXTEND-ROUND:       // %bb.0:
+; CHECK-EXTEND-ROUND-NEXT:    ptrue p0.d
+; CHECK-EXTEND-ROUND-NEXT:    fcvt z0.s, p0/m, z0.d
+; CHECK-EXTEND-ROUND-NEXT:    fcvt z1.s, p0/m, z1.d
+; CHECK-EXTEND-ROUND-NEXT:    and z1.s, z1.s, #0x80000000
+; CHECK-EXTEND-ROUND-NEXT:    and z0.s, z0.s, #0x7fffffff
+; CHECK-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-EXTEND-ROUND-NEXT:    ret
   %t1 = call <vscale x 2 x double> @llvm.copysign.v2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
   %t2 = fptrunc <vscale x 2 x double> %t1 to <vscale x 2 x float>
   ret <vscale x 2 x float> %t2

diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
@@ -1431,3 +1431,16 @@ define <vscale x 8 x double> @vfcopynsign_exttrunc_vf_nxv8f64_nxv8f32(<vscale x
   %r = call <vscale x 8 x double> @llvm.copysign.nxv8f64(<vscale x 8 x double> %vm, <vscale x 8 x double> %eneg)
   ret <vscale x 8 x double> %r
 }
+
+define <vscale x 2 x float> @fptrunc_of_copysign_nxv2f32_nxv2f64(<vscale x 2 x double> %X, <vscale x 2 x double> %Y) {
+; CHECK-LABEL: fptrunc_of_copysign_nxv2f32_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vfsgnj.vv v10, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfncvt.f.f.w v8, v10
+; CHECK-NEXT:    ret
+  %copy = call fast <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> %X, <vscale x 2 x double> %Y)
+  %trunc = fptrunc <vscale x 2 x double> %copy to <vscale x 2 x float>
+  ret <vscale x 2 x float> %trunc
+}