[x86, SSE/AVX] allow 128/256-bit lowering for copysign vector intrinsics (PR30433)

This should fix:
https://llvm.org/bugs/show_bug.cgi?id=30433

There are a couple of open questions about the codegen:
1. Should we let scalar ops be scalars and avoid vector constant loads/splats?
2. Should we have a pass to combine constants such as the inverted pair that we have here?

Differential Revision: https://reviews.llvm.org/D25165
 

llvm-svn: 283119
rotateright committed Oct 3, 2016
1 parent 45e8ba8 commit d27a218
Showing 4 changed files with 260 additions and 451 deletions.
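
Before the diffs, a note on the technique: the lowering computes copysign entirely with bitwise FP logic (the FAND/FOR nodes in the X86ISelLowering.cpp changes below), which is exact for ±0, infinities, and NaNs. A minimal C++ sketch of that identity for f64 (the helper name is mine, not LLVM code):

    #include <cstdint>
    #include <cstring>

    // copysign(mag, sign) == (mag & ~SIGN_MASK) | (sign & SIGN_MASK),
    // where SIGN_MASK has only the IEEE-754 sign bit set.
    double copysignBits(double Mag, double Sign) {
      uint64_t M, S;
      std::memcpy(&M, &Mag, sizeof(M));
      std::memcpy(&S, &Sign, sizeof(S));
      const uint64_t SignMask = 1ULL << 63; // APInt::getSignBit(64)
      const uint64_t R = (M & ~SignMask) | (S & SignMask);
      std::memcpy(&Mag, &R, sizeof(R));
      return Mag;
    }

For vector types, the same two masks are splatted across the lanes and the AND/OR sequence handles the whole register at once, which is what this patch enables for 128/256-bit types.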
44 changes: 27 additions & 17 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -730,6 +730,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
     setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
     setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
@@ -765,6 +766,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::MUL, MVT::v8i16, Legal);
     setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
     setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

     setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
     setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
@@ -980,6 +982,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::FNEG, VT, Custom);
       setOperationAction(ISD::FABS, VT, Custom);
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
     }

     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
@@ -14662,31 +14665,39 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   // At this point the operands and the result should have the same
   // type, and that won't be f80 since that is not custom lowered.
   bool IsF128 = (VT == MVT::f128);
-  assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+          VT == MVT::v8f32) &&
          "Unexpected type in LowerFCOPYSIGN");

+  MVT EltVT = VT.getScalarType();
   const fltSemantics &Sem =
-      VT == MVT::f64 ? APFloat::IEEEdouble :
-          (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
-  const unsigned SizeInBits = VT.getSizeInBits();
+      EltVT == MVT::f64 ? APFloat::IEEEdouble
+                        : (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);

-  // Perform all logic operations as 16-byte vectors because there are no
+  // Perform all scalar logic operations as 16-byte vectors because there are no
   // scalar FP logic instructions in SSE.
-  MVT LogicVT =
-      (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
+  // TODO: This isn't necessary. If we used scalar types, we might avoid some
+  // unnecessary splats, but we might miss load folding opportunities. Should
+  // this decision be based on OptimizeForSize?
+  bool IsFakeVector = !VT.isVector() && !IsF128;
+  MVT LogicVT = VT;
+  if (IsFakeVector)
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

+  // The mask constants are automatically splatted for vector types.
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
   SDValue SignMask = DAG.getConstantFP(
-      APFloat(Sem, APInt::getSignBit(SizeInBits)), dl, LogicVT);
+      APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+  SDValue MagMask = DAG.getConstantFP(
+      APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);

   // First, clear all bits but the sign bit from the second operand (sign).
-  if (!IsF128)
+  if (IsFakeVector)
     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

   // Next, clear the sign bit from the first operand (magnitude).
   // If it's a constant, we can clear it here.
-  SDValue MagMask = DAG.getConstantFP(
-      APFloat(Sem, ~APInt::getSignBit(SizeInBits)), dl, LogicVT);
-
   // TODO: If we had general constant folding for FP logic ops, this check
   // wouldn't be necessary.
   SDValue MagBits;
@@ -14696,16 +14707,15 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
   } else {
     // If the magnitude operand wasn't a constant, we need to AND out the sign.
-    if (!IsF128)
+    if (IsFakeVector)
       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
   }

   // OR the magnitude value with the sign bit.
   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
-  return IsF128 ? Or :
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
-                  DAG.getIntPtrConstant(0, dl));
+  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
+                                          DAG.getIntPtrConstant(0, dl));
 }

 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
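
To make the IsFakeVector path above concrete, here is a rough C++ model (my own sketch, not LLVM code; plain arrays stand in for the 128-bit register): the scalar float is placed in lane 0, the masking runs across all four lanes, and lane 0 is extracted at the end. Genuine vector types skip the widening and extraction, and the splatted masks cover every lane.

    #include <cstdint>
    #include <cstring>

    // Sketch of the scalar-f32 "fake vector" lowering: SCALAR_TO_VECTOR,
    // whole-register FAND/FAND/FOR, then EXTRACT_VECTOR_ELT of lane 0.
    float copysignViaFakeVector(float Mag, float Sign) {
      uint32_t MagV[4] = {}, SignV[4] = {}; // unused lanes are undef in codegen
      std::memcpy(&MagV[0], &Mag, sizeof(float));
      std::memcpy(&SignV[0], &Sign, sizeof(float));
      const uint32_t SignMask = 0x80000000u; // APInt::getSignBit(32)
      uint32_t OrV[4];
      for (int I = 0; I != 4; ++I) // the logic ops act on the full register
        OrV[I] = (MagV[I] & ~SignMask) | (SignV[I] & SignMask);
      float Res;
      std::memcpy(&Res, &OrV[0], sizeof(float));
      return Res;
    }

The widening mirrors the comment in the code: SSE has no scalar FP logic instructions, so scalar copysign borrows the 128-bit ones.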
56 changes: 28 additions & 28 deletions llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -401,22 +401,22 @@ define i32 @fcopysign(i32 %arg) {
 ; AVX2: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
 ; AVX512: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
   %F32 = call float @llvm.copysign.f32(float undef, float undef)
-; SSE2: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
-; SSE42: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
-; AVX: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
-; AVX2: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
-; AVX512: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; SSE2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; SSE42: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; AVX: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; AVX2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; AVX512: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
   %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; SSE2: cost of 34 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
-; SSE42: cost of 34 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
-; AVX: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
-; AVX2: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
-; AVX512: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; SSE2: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; SSE42: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; AVX: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; AVX2: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; AVX512: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
   %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; SSE2: cost of 68 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
-; SSE42: cost of 68 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
-; AVX: cost of 74 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
-; AVX2: cost of 74 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+; SSE2: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+; SSE42: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
 ; AVX512: cost of 77 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)

@@ -426,22 +426,22 @@ define i32 @fcopysign(i32 %arg) {
 ; AVX2: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
 ; AVX512: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
   %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; SSE2: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
-; SSE42: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
-; AVX: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
-; AVX2: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
-; AVX512: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; SSE2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; SSE42: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; AVX: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; AVX2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; AVX512: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
   %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; SSE2: cost of 14 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
-; SSE42: cost of 14 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
-; AVX: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
-; AVX2: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
-; AVX512: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; SSE2: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; SSE42: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; AVX: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; AVX2: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; AVX512: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
   %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; SSE2: cost of 28 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
-; SSE42: cost of 28 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
-; AVX: cost of 34 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
-; AVX2: cost of 34 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+; SSE2: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+; SSE42: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
 ; AVX512: cost of 37 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)

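The updated costs follow a simple splitting pattern; the sketch below is my reading of the checks above, not a quote of the in-tree cost tables. A copysign on a legal-width vector is modeled at 2, and wider vectors pay that per legal-width piece; the AVX512 rows for 512-bit types keep the old scalarized costs (77 and 37), consistent with 512-bit FCOPYSIGN not being custom-lowered by this patch.

    // Hypothetical helper reproducing the pattern in the updated checks:
    // e.g. SSE2 (128-bit legal): v4f32 -> 2, v8f32 -> 4, v16f32 -> 8;
    // AVX/AVX2 (256-bit legal): v8f32 -> 2, v16f32 -> 4.
    unsigned copysignCost(unsigned VecBits, unsigned LegalBits) {
      unsigned Pieces = VecBits <= LegalBits ? 1 : VecBits / LegalBits;
      return 2 * Pieces;
    }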
