[LV] Build and cost VPlans for scalable VFs.

This patch uses the calculated maximum scalable VFs to build VPlans, cost them and select a suitable scalable VF. Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D98722
llvm · Jun 2, 2021 · d41cb6b · d41cb6b
1 parent 8c363ef
commit d41cb6b
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 22 deletions.
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -144,6 +144,13 @@ class LoopVectorizeHints {
     return (ForceKind)Force.Value;
   }
 
+  /// \return true if the cost-model for scalable vectorization should
+  /// favor vectorization with scalable vectors over fixed-width vectors when
+  /// the cost-model is inconclusive.
+  bool isScalableVectorizationPreferred() const {
+    return Scalable.Value == SK_PreferScalable;
+  }
+
   /// \return true if scalable vectorization has been explicitly enabled.
   bool isScalableVectorizationExplicitlyEnabled() const {
     return Scalable.Value == SK_PreferFixedWidth ||

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -70,6 +70,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
@@ -1204,6 +1205,16 @@ enum ScalarEpilogueLowering {
   CM_ScalarEpilogueNotAllowedUsePredicate
 };
 
+/// ElementCountComparator creates a total ordering for ElementCount
+/// for the purposes of using it in a set structure.
+struct ElementCountComparator {
+  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
+    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
+           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
+  }
+};
+using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
+
 /// LoopVectorizationCostModel - estimates the expected speedups due to
 /// vectorization.
 /// In many cases vectorization is not profitable. This can happen because of
@@ -1236,10 +1247,12 @@ class LoopVectorizationCostModel {
   bool runtimeChecksRequired();
 
   /// \return The most profitable vectorization factor and the cost of that VF.
-  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
+  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
   /// then this vectorization factor will be selected if vectorization is
   /// possible.
-  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
+  VectorizationFactor
+  selectVectorizationFactor(const ElementCountSet &CandidateVFs);
+
   VectorizationFactor
   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                     const LoopVectorizationPlanner &LVP);
@@ -6013,37 +6026,45 @@ bool LoopVectorizationCostModel::isMoreProfitable(
     return RTCostA < RTCostB;
   }
 
+  // When set to preferred, for now assume vscale may be larger than 1, so
+  // that scalable vectorization is slightly favorable over fixed-width
+  // vectorization.
+  if (Hints->isScalableVectorizationPreferred())
+    if (A.Width.isScalable() && !B.Width.isScalable())
+      return (CostA * B.Width.getKnownMinValue()) <=
+             (CostB * A.Width.getKnownMinValue());
+
   // To avoid the need for FP division:
   //      (CostA / A.Width) < (CostB / B.Width)
   // <=>  (CostA * B.Width) < (CostB * A.Width)
   return (CostA * B.Width.getKnownMinValue()) <
          (CostB * A.Width.getKnownMinValue());
 }
 
-VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
-  // FIXME: This can be fixed for scalable vectors later, because at this stage
-  // the LoopVectorizer will only consider vectorizing a loop with scalable
-  // vectors when the loop has a hint to enable vectorization for a given VF.
-  assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
-
+VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
+    const ElementCountSet &VFCandidates) {
   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
+  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
+         "Expected Scalar VF to be a candidate");
 
   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
   VectorizationFactor ChosenFactor = ScalarCost;
 
   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
-  if (ForceVectorization && MaxVF.isVector()) {
+  if (ForceVectorization && VFCandidates.size() > 1) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
     // evaluation.
     ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
   }
 
-  for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
-       i *= 2) {
+  for (const auto &i : VFCandidates) {
+    // The cost for scalar VF=1 is already calculated, so ignore it.
+    if (i.isScalar())
+      continue;
+
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
@@ -6053,7 +6074,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
     VectorizationFactor Candidate(i, C.first);
     LLVM_DEBUG(
         dbgs() << "LV: Vector loop of width " << i << " costs: "
-               << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue())
+               << (*Candidate.Cost.getValue() /
+                   Candidate.Width.getKnownMinValue())
+               << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
                << ".\n");
 
     if (!C.second && !ForceVectorization) {
@@ -7967,17 +7990,21 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     // profitable to scalarize.
     CM.selectUserVectorizationFactor(UserVF);
     CM.collectInLoopReductions();
-    buildVPlansWithVPRecipes({UserVF}, {UserVF});
+    buildVPlansWithVPRecipes(UserVF, UserVF);
     LLVM_DEBUG(printPlans(dbgs()));
     return {{UserVF, 0}};
   }
 
-  ElementCount MaxVF = MaxFactors.FixedVF;
-  assert(!MaxVF.isScalable() &&
-         "Scalable vectors not yet supported beyond this point");
+  // Populate the set of Vectorization Factor Candidates.
+  ElementCountSet VFCandidates;
+  for (auto VF = ElementCount::getFixed(1);
+       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
+    VFCandidates.insert(VF);
+  for (auto VF = ElementCount::getScalable(1);
+       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
+    VFCandidates.insert(VF);
 
-  for (ElementCount VF = ElementCount::getFixed(1);
-       ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
+  for (const auto VF : VFCandidates) {
     // Collect Uniform and Scalar instructions after vectorization with VF.
     CM.collectUniformsAndScalars(VF);
 
@@ -7988,14 +8015,15 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   }
 
   CM.collectInLoopReductions();
+  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
+  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
 
-  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
   LLVM_DEBUG(printPlans(dbgs()));
   if (!MaxFactors.hasVector())
     return VectorizationFactor::Disabled();
 
   // Select the optimal vectorization factor.
-  auto SelectedVF = CM.selectVectorizationFactor(MaxVF);
+  auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
 
   // Check if it is profitable to vectorize with runtime checks.
   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -41,7 +41,7 @@ if.then:                                          ; preds = %for.body
 for.inc:                                          ; preds = %for.body, %if.then
   %inc = add nuw nsw i64 %i.07, 1
   %exitcond.not = icmp eq i64 %inc, %n
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !5
 }
 
 
@@ -90,3 +90,5 @@ attributes #0 = { "target-features"="+neon,+sve" }
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 !4 = !{!"llvm.loop.vectorize.enable", i1 true}
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -10,9 +10,13 @@
 define void @test0(i32* %a, i8* %b, i32* %c) {
 ; CHECK: LV: Checking a loop in "test0"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 16
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: vscale x 16
 entry:
   br label %loop
 
@@ -39,9 +43,13 @@ exit:
 define void @test1(i32* %a, i8* %b) {
 ; CHECK: LV: Checking a loop in "test1"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
 entry:
   br label %loop
 
@@ -69,9 +77,13 @@ exit:
 define void @test2(i32* %a, i8* %b) {
 ; CHECK: LV: Checking a loop in "test2"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
 entry:
   br label %loop
 
@@ -99,9 +111,13 @@ exit:
 define void @test3(i32* %a, i8* %b) {
 ; CHECK: LV: Checking a loop in "test3"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
 entry:
   br label %loop
 
@@ -129,9 +145,13 @@ exit:
 define void @test4(i32* %a, i32* %b) {
 ; CHECK: LV: Checking a loop in "test4"
 ; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED_MAXBW-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 4
 entry:
   br label %loop