[TTI] Add DemandedElts to getScalarizationOverhead
The improvements to the x86 vector insert/extract element costs in D74976 caused the estimated costs of vector initialization and scalarization to rise higher than they should. This is particularly noticeable on pre-SSE4 targets, where the availability of legal INSERT_VECTOR_ELT ops is more limited.

This patch does 2 things:
1 - it implements X86TTIImpl::getScalarizationOverhead to more accurately represent the typical costs of an ISD::BUILD_VECTOR pattern.
2 - it adds a DemandedElts mask to getScalarizationOverhead, permitting SLP's BoUpSLP::getGatherCost to be rewritten to use it directly instead of accumulating raw vector insertion costs (a rough sketch of the new interface follows below).
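
As a rough, standalone sketch of the idea (illustrative C++ only, not LLVM code; the unit costs and helper signatures here are made-up assumptions): a DemandedElts mask lets a gather-cost query price only the lanes that actually need inserting, mirroring the BoUpSLP::getGatherCost rewrite in this patch.

  #include <bitset>
  #include <iostream>
  #include <set>

  constexpr unsigned MaxElts = 64;

  // Hypothetical stand-in for getScalarizationOverhead(Ty, DemandedElts,
  // Insert, Extract): charge one unit per demanded lane inserted and one
  // unit per demanded lane extracted.
  unsigned getScalarizationOverhead(const std::bitset<MaxElts> &DemandedElts,
                                    bool Insert, bool Extract) {
    unsigned Cost = 0;
    if (Insert)
      Cost += DemandedElts.count();
    if (Extract)
      Cost += DemandedElts.count();
    return Cost;
  }

  // Mirrors the shape of BoUpSLP::getGatherCost: lanes already covered by a
  // shuffle are not demanded, so they add no insertion cost.
  unsigned getGatherCost(unsigned NumElts,
                         const std::set<unsigned> &ShuffledIndices) {
    std::bitset<MaxElts> DemandedElts;
    for (unsigned I = 0; I < NumElts; ++I)
      if (!ShuffledIndices.count(I))
        DemandedElts.set(I);
    return getScalarizationOverhead(DemandedElts, /*Insert=*/true,
                                    /*Extract=*/false);
  }

  int main() {
    // A 4-element gather where lane 2 comes from a shuffle: only 3 inserts.
    std::cout << getGatherCost(4, {2}) << '\n'; // prints 3
  }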

This fixes PR45418, where a v4i8 (zext'd to v4i32) was no longer vectorizing.
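
For illustration only (a hypothetical example of the kind of pattern described, not the actual PR45418 reproducer): four adjacent i8 values widened to i32, which SLP vectorization would handle as a v4i8 gather zero-extended to v4i32.

  #include <cstdint>

  // Sum four bytes, widening each to 32 bits. With the D74976 costs alone,
  // the estimated gather cost on pre-SSE4 targets could block SLP
  // vectorization of patterns like this; this patch brings it back down.
  uint32_t sum4(const uint8_t *P) {
    return static_cast<uint32_t>(P[0]) + static_cast<uint32_t>(P[1]) +
           static_cast<uint32_t>(P[2]) + static_cast<uint32_t>(P[3]);
  }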

A future patch should extend X86TTIImpl::getScalarizationOverhead to tweak the EXTRACT_VECTOR_ELT scalarization costs as well.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D78216
RKSimon committed Apr 29, 2020
1 parent 42a56bf commit 090cae8
Showing 22 changed files with 502 additions and 398 deletions.
21 changes: 14 additions & 7 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -611,8 +611,15 @@ class TargetTransformInfo {
/// should use coldcc calling convention.
bool useColdCCForColdCall(Function &F) const;

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) const;

/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The types of the arguments are ordinarily
/// scalar, in which case the costs are multiplied by VF.
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) const;

@@ -1231,8 +1238,8 @@ class TargetTransformInfo::Concept {
virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
virtual bool useColdCCForColdCall(Function &F) = 0;
virtual unsigned getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) = 0;
virtual unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) = 0;
virtual unsigned
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) = 0;
@@ -1556,9 +1563,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.useColdCCForColdCall(F);
}

unsigned getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) override {
return Impl.getScalarizationOverhead(Ty, Insert, Extract);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) override {
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) override {
3 changes: 2 additions & 1 deletion llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -239,7 +239,8 @@ class TargetTransformInfoImplBase {

bool useColdCCForColdCall(Function &F) { return false; }

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) {
return 0;
}

19 changes: 17 additions & 2 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -548,12 +548,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
unsigned getRegisterBitWidth(bool Vector) const { return 32; }

/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) {
auto *VTy = cast<VectorType>(Ty);
assert(DemandedElts.getBitWidth() == VTy->getNumElements() &&
"Vector size mismatch");

unsigned Cost = 0;

for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
if (!DemandedElts[i])
continue;
if (Insert)
Cost += static_cast<T *>(this)->getVectorInstrCost(
Instruction::InsertElement, VTy, i);
@@ -565,6 +572,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}

/// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
auto *VTy = cast<VectorType>(Ty);
APInt DemandedElts = APInt::getAllOnesValue(VTy->getNumElements());
return static_cast<T *>(this)->getScalarizationOverhead(Ty, DemandedElts,
Insert, Extract);
}

/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The types of the arguments are ordinarily
/// scalar, in which case the costs are multiplied by VF.
6 changes: 3 additions & 3 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -368,9 +368,9 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
return TTIImpl->useColdCCForColdCall(F);
}

unsigned TargetTransformInfo::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);
unsigned TargetTransformInfo::getScalarizationOverhead(
Type *Ty, const APInt &DemandedElts, bool Insert, bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}

unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
7 changes: 4 additions & 3 deletions llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -115,9 +115,10 @@ unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
return (8 * ST.getVectorLength()) / ElemWidth;
}

unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) {
return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) {
return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}

unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
7 changes: 4 additions & 3 deletions llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -101,9 +101,10 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
return true;
}

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,
unsigned VF);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract);
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF);
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args, FastMathFlags FMF,
75 changes: 67 additions & 8 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2868,9 +2868,62 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}

unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) {
return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) {
auto* VecTy = cast<VectorType>(Ty);
unsigned Cost = 0;

// For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
// cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
if (Insert) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MScalarTy = LT.second.getScalarType();

if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
(MScalarTy.isInteger() && ST->hasSSE41()) ||
(MScalarTy == MVT::f32 && ST->hasSSE41())) {
// For types we can insert directly, insertion into 128-bit sub vectors is
// cheap, followed by a cheap chain of concatenations.
if (LT.second.getSizeInBits() <= 128) {
Cost +=
BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
} else {
unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
Cost += DemandedElts.countPopulation();

// For vXf32 cases, insertion into the 0'th index in each v4f32
// 128-bit vector is free.
// NOTE: This assumes legalization widens vXf32 vectors.
if (MScalarTy == MVT::f32)
for (unsigned i = 0, e = VecTy->getNumElements(); i < e; i += 4)
if (DemandedElts[i])
Cost--;
}
} else if (LT.second.isVector()) {
// Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
// integer element as a SCALAR_TO_VECTOR, then we build the vector as a
// series of UNPCK followed by CONCAT_VECTORS - all of these can be
// considered cheap.
if (Ty->isIntOrIntVectorTy())
Cost += DemandedElts.countPopulation();

// Get the smaller of the legalized or original pow2-extended number of
// vector elements, which represents the number of unpacks we'll end up
// performing.
unsigned NumElts = LT.second.getVectorNumElements();
unsigned Pow2Elts = PowerOf2Ceil(VecTy->getNumElements());
Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
}
}

// TODO: Use default extraction for now, but we should investigate extending this
// to handle repeated subvector extraction.
if (Extract)
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);

return Cost;
}

int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
@@ -2893,9 +2946,11 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,

// Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
AddressSpace);
int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
int SplitCost = getScalarizationOverhead(Src, DemandedElts,
Opcode == Instruction::Load,
Opcode == Instruction::Store);
return NumElem * Cost + SplitCost;
}
@@ -2935,13 +2990,15 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
(IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
!isPowerOf2_32(NumElem)) {
// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int MaskSplitCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
int BranchCost = getCFInstrCost(Instruction::Br);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
int ValueSplitCost =
getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
MaybeAlign(Alignment), AddressSpace);
@@ -3795,12 +3852,14 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
bool VariableMask, unsigned Alignment,
unsigned AddressSpace) {
unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
APInt DemandedElts = APInt::getAllOnesValue(VF);

int MaskUnpackCost = 0;
if (VariableMask) {
VectorType *MaskTy =
VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
MaskUnpackCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost =
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
nullptr);
3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,7 +132,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
14 changes: 9 additions & 5 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5703,8 +5703,9 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
true, false);
ScalarCost +=
TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
APInt::getAllOnesValue(VF), true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
}

@@ -5720,7 +5721,8 @@
Worklist.push_back(J);
else if (needsExtract(J, VF))
ScalarCost += TTI.getScalarizationOverhead(
ToVectorTy(J->getType(),VF), false, true);
ToVectorTy(J->getType(), VF), APInt::getAllOnesValue(VF), false,
true);
}

// Scale the total scalar cost by block probability.
@@ -6004,7 +6006,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(RetTy, true, false);
Cost += TTI.getScalarizationOverhead(RetTy, APInt::getAllOnesValue(VF),
true, false);

// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6210,7 +6213,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// Return cost for branches around scalarized and predicated blocks.
Type *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
false, true) +
(TTI.getCFInstrCost(Instruction::Br) * VF));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
// The back-edge branch will remain, as will all scalar branches.
37 changes: 34 additions & 3 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3877,10 +3877,13 @@ int BoUpSLP::getTreeCost() {

int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
int Cost = 0;
for (unsigned i = 0, e = Ty->getNumElements(); i < e; ++i)
unsigned NumElts = Ty->getNumElements();
APInt DemandedElts = APInt::getNullValue(NumElts);
for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
DemandedElts.setBit(i);
int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
@@ -7018,6 +7021,34 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
"Expected insertelement or insertvalue instruction!");
UserCost = 0;
do {
// TODO: Use TTI's getScalarizationOverhead for a sequence of inserts rather
// than the sum of single inserts, as the latter may overestimate the cost.
// This work should also improve the cost estimation for extracts that are
// added in for external (to the vectorization tree) users.
// For example, in the following case all of the extracts are added in order
// to feed external users (the inserts), which in turn form the sequence that
// builds an aggregate and that we do match here:
//  %4 = extractelement <4 x i64> %3, i32 0
//  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
//  %5 = extractelement <4 x i64> %3, i32 1
//  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
//  %6 = extractelement <4 x i64> %3, i32 2
//  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
//  %7 = extractelement <4 x i64> %3, i32 3
//  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
//
// The cost of this entire sequence is currently estimated as the sum of the
// single extracts (as this aggregate build sequence is a user external to
// the vectorization tree) minus the cost of the aggregate build.
// As this whole sequence will be optimized away we want the cost to be
// zero. But that is not quite achievable with the current approach (at
// least on X86), because inserts can be more expensive than extracts for
// longer vector lengths, so the difference turns out to be non-zero in such
// cases.
// Ideally we want to match this entire sequence and treat it as a no-op
// (i.e. not count it towards the final cost at all).
// Currently the difference tends to be negative, thus adding a bias toward
// favoring vectorization. If we switch to using the TTI interface the bias
// tendency will remain, but it will be lower.
Value *InsertedOperand;
if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
InsertedOperand = IE->getOperand(1);