[SLP] Initial vectorization of non-power-of-2 ops. (#77790)

This patch enables vectorization for non-power-of-2 VFs. Initially only VFs where adding 1 makes the VF a power-of-2, i.e. we can still make relatively effective use of the vectors. It relies on the existing target cost-models to return accurate costs for non-power-of-2 vectors. I checked mostly AArch64 and X86 and there the costs seem reasonable for the costs I checked, although I expect there will be a need to refine both the cost-models and lowering to make most effective use of non-power-of-2 SLP vectorization. Note that re-ordering and shuffling is not implemented for nodes requiring padding yet to keep the initial implementation simpler. The feature is guarded by a new flag, off by defaul for now. PR: #77790
llvm · Apr 13, 2024 · 6d66db3 · 6d66db3
1 parent df9c00b
commit 6d66db3
Show file tree

Hide file tree

Showing 11 changed files with 947 additions and 499 deletions.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -190,6 +190,10 @@ static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
 
+static cl::opt<bool> VectorizeNonPowerOf2(
+    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
+    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -2829,6 +2833,14 @@ class BoUpSLP {
                           SmallVectorImpl<Value *> *OpScalars = nullptr,
                           SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
+    /// Return true if this is a non-power-of-2 node.
+    bool isNonPowOf2Vec() const {
+      bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
+      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
+             "Reshuffling not supported with non-power-of-2 vectors yet.");
+      return IsNonPowerOf2;
+    }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
@@ -2994,9 +3006,11 @@ class BoUpSLP {
       MustGather.insert(VL.begin(), VL.end());
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-
+      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
+             "Reordering isn't implemented for non-power-of-2 nodes yet");
+    }
     return Last;
   }
 
@@ -4256,6 +4270,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
+  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
+  if (!Order.empty() && !isPowerOf2_32(VL.size())) {
+    assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
+                                   "supported with VectorizeNonPowerOf2");
+    return LoadsState::Gather;
+  }
+
   Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
   if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
       TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
@@ -4575,6 +4596,10 @@ static bool areTwoInsertFromSameBuildVector(
 
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
+  // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
+  if (TE.isNonPowOf2Vec())
+    return std::nullopt;
+
   // No need to reorder if need to shuffle reuses, still need to shuffle the
   // node.
   if (!TE.ReuseShuffleIndices.empty()) {
@@ -5145,6 +5170,10 @@ bool BoUpSLP::canReorderOperands(
     TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
+  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
+  if (UserTE->isNonPowOf2Vec())
+    return false;
+
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
           return OpData.first == I &&
@@ -5318,6 +5347,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         }
         auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
         const auto AllowsReordering = [&](const TreeEntry *TE) {
+          // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
+          if (TE->isNonPowOf2Vec())
+            return false;
           if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
               (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
               (IgnoreReorder && TE->Idx == 0))
@@ -5944,6 +5976,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
+    if (!isPowerOf2_32(VL.size()))
+      return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
     LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -6258,6 +6293,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (NumUniqueScalarValues == VL.size()) {
       ReuseShuffleIndicies.clear();
     } else {
+      // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
+      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+                             "for nodes with padding.\n");
+        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+        return false;
+      }
       LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
       if (NumUniqueScalarValues <= 1 ||
           (UniquePositions.size() == 1 && all_of(UniqueValues,
@@ -7868,7 +7910,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
           if (VectorizedLoads.contains(VL[I]))
             continue;
-          GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
+          GatherCost +=
+              getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
         }
         // Exclude potentially vectorized loads from list of gathered
         // scalars.
@@ -10678,6 +10721,9 @@ BoUpSLP::isGatherShuffledEntry(
   // No need to check for the topmost gather node.
   if (TE == VectorizableTree.front().get())
     return {};
+  // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
+  if (TE->isNonPowOf2Vec())
+    return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
@@ -14995,8 +15041,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
   unsigned VF = Chain.size();
 
-  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
-    return false;
+  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
+    // Check if vectorizing with a non-power-of-2 VF should be considered. At
+    // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
+    // all vector lanes are used.
+    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
+      return false;
+  }
 
   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                     << "\n");
@@ -15095,14 +15146,22 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
         continue;
       }
 
+      unsigned NonPowerOf2VF = 0;
+      if (VectorizeNonPowerOf2) {
+        // First try vectorizing with a non-power-of-2 VF. At the moment, only
+        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
+        // lanes are used.
+        unsigned CandVF = Operands.size();
+        if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
+          NonPowerOf2VF = CandVF;
+      }
+
       unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
-      SmallVector<unsigned> CandidateVFs(Sz);
-      // FIXME: Is division-by-2 the correct step? Should we assert that the
-      // register size is a power-of-2?
-      unsigned Size = MaxVF;
-      for_each(CandidateVFs, [&](unsigned &VF) {
-        VF = Size;
-        Size /= 2;
+      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
+      unsigned Size = MinVF;
+      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
+        VF = Size > MaxVF ? NonPowerOf2VF : Size;
+        Size *= 2;
       });
       unsigned StartIdx = 0;
       for (unsigned Size : CandidateVFs) {

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll
@@ -1,35 +1,45 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=POW2-ONLY %s
 
 define void @v15_load_i8_mul_by_constant_store(ptr %src, ptr noalias %dst) {
-; CHECK-LABEL: define void @v15_load_i8_mul_by_constant_store(
-; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
-; CHECK-NEXT:    store <8 x i8> [[TMP1]], ptr [[DST]], align 1
-; CHECK-NEXT:    [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
-; CHECK-NEXT:    [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
-; CHECK-NEXT:    store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
-; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
-; CHECK-NEXT:    [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
-; CHECK-NEXT:    [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
-; CHECK-NEXT:    [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
-; CHECK-NEXT:    store i8 [[MUL_12]], ptr [[DST_12]], align 1
-; CHECK-NEXT:    [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
-; CHECK-NEXT:    [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
-; CHECK-NEXT:    [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
-; CHECK-NEXT:    [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
-; CHECK-NEXT:    store i8 [[MUL_13]], ptr [[DST_13]], align 1
-; CHECK-NEXT:    [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
-; CHECK-NEXT:    [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
-; CHECK-NEXT:    [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
-; CHECK-NEXT:    [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
-; CHECK-NEXT:    store i8 [[MUL_14]], ptr [[DST_14]], align 1
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: define void @v15_load_i8_mul_by_constant_store(
+; NON-POW2-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <15 x i8>, ptr [[GEP_SRC_0]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = mul nsw <15 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
+; NON-POW2-NEXT:    store <15 x i8> [[TMP1]], ptr [[DST]], align 1
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: define void @v15_load_i8_mul_by_constant_store(
+; POW2-ONLY-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
+; POW2-ONLY-NEXT:    store <8 x i8> [[TMP1]], ptr [[DST]], align 1
+; POW2-ONLY-NEXT:    [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
+; POW2-ONLY-NEXT:    [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
+; POW2-ONLY-NEXT:    store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
+; POW2-ONLY-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
+; POW2-ONLY-NEXT:    [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
+; POW2-ONLY-NEXT:    [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
+; POW2-ONLY-NEXT:    [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
+; POW2-ONLY-NEXT:    store i8 [[MUL_12]], ptr [[DST_12]], align 1
+; POW2-ONLY-NEXT:    [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
+; POW2-ONLY-NEXT:    [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
+; POW2-ONLY-NEXT:    [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
+; POW2-ONLY-NEXT:    [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
+; POW2-ONLY-NEXT:    store i8 [[MUL_13]], ptr [[DST_13]], align 1
+; POW2-ONLY-NEXT:    [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
+; POW2-ONLY-NEXT:    [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
+; POW2-ONLY-NEXT:    [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
+; POW2-ONLY-NEXT:    [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
+; POW2-ONLY-NEXT:    store i8 [[MUL_14]], ptr [[DST_14]], align 1
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %gep.src.0 = getelementptr inbounds i8, ptr %src, i8 0
@@ -123,5 +133,3 @@ entry:
 
   ret void
 }
-
-