Skip to content

Commit

Permalink
[LV] Build and cost VPlans for scalable VFs.
Browse files Browse the repository at this point in the history
This patch uses the calculated maximum scalable VFs to build VPlans,
cost them and select a suitable scalable VF.

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D98722
  • Loading branch information
sdesmalen-arm committed Jun 2, 2021
1 parent 8c363ef commit d41cb6b
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@ class LoopVectorizeHints {
return (ForceKind)Force.Value;
}

/// \return true if the cost-model for scalable vectorization should
/// favor vectorization with scalable vectors over fixed-width vectors when
/// the cost-model is inconclusive.
bool isScalableVectorizationPreferred() const {
return Scalable.Value == SK_PreferScalable;
}

/// \return true if scalable vectorization has been explicitly enabled.
bool isScalableVectorizationExplicitlyEnabled() const {
return Scalable.Value == SK_PreferFixedWidth ||
Expand Down
70 changes: 49 additions & 21 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
Expand Down Expand Up @@ -1204,6 +1205,16 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
}
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
Expand Down Expand Up @@ -1236,10 +1247,12 @@ class LoopVectorizationCostModel {
bool runtimeChecksRequired();

/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
VectorizationFactor
selectVectorizationFactor(const ElementCountSet &CandidateVFs);

VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MaxVF,
const LoopVectorizationPlanner &LVP);
Expand Down Expand Up @@ -6013,37 +6026,45 @@ bool LoopVectorizationCostModel::isMoreProfitable(
return RTCostA < RTCostB;
}

// When set to preferred, for now assume vscale may be larger than 1, so
// that scalable vectorization is slightly favorable over fixed-width
// vectorization.
if (Hints->isScalableVectorizationPreferred())
if (A.Width.isScalable() && !B.Width.isScalable())
return (CostA * B.Width.getKnownMinValue()) <=
(CostB * A.Width.getKnownMinValue());

// To avoid the need for FP division:
// (CostA / A.Width) < (CostB / B.Width)
// <=> (CostA * B.Width) < (CostB * A.Width)
return (CostA * B.Width.getKnownMinValue()) <
(CostB * A.Width.getKnownMinValue());
}

VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
// FIXME: This can be fixed for scalable vectors later, because at this stage
// the LoopVectorizer will only consider vectorizing a loop with scalable
// vectors when the loop has a hint to enable vectorization for a given VF.
assert(!MaxVF.isScalable() && "scalable vectors not yet supported");

VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
const ElementCountSet &VFCandidates) {
InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
assert(VFCandidates.count(ElementCount::getFixed(1)) &&
"Expected Scalar VF to be a candidate");

const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
VectorizationFactor ChosenFactor = ScalarCost;

bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && MaxVF.isVector()) {
if (ForceVectorization && VFCandidates.size() > 1) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// evaluation.
ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
}

for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
i *= 2) {
for (const auto &i : VFCandidates) {
// The cost for scalar VF=1 is already calculated, so ignore it.
if (i.isScalar())
continue;

// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
Expand All @@ -6053,7 +6074,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
VectorizationFactor Candidate(i, C.first);
LLVM_DEBUG(
dbgs() << "LV: Vector loop of width " << i << " costs: "
<< (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue())
<< (*Candidate.Cost.getValue() /
Candidate.Width.getKnownMinValue())
<< (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
<< ".\n");

if (!C.second && !ForceVectorization) {
Expand Down Expand Up @@ -7967,17 +7990,21 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
CM.collectInLoopReductions();
buildVPlansWithVPRecipes({UserVF}, {UserVF});
buildVPlansWithVPRecipes(UserVF, UserVF);
LLVM_DEBUG(printPlans(dbgs()));
return {{UserVF, 0}};
}

ElementCount MaxVF = MaxFactors.FixedVF;
assert(!MaxVF.isScalable() &&
"Scalable vectors not yet supported beyond this point");
// Populate the set of Vectorization Factor Candidates.
ElementCountSet VFCandidates;
for (auto VF = ElementCount::getFixed(1);
ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
VFCandidates.insert(VF);
for (auto VF = ElementCount::getScalable(1);
ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
VFCandidates.insert(VF);

for (ElementCount VF = ElementCount::getFixed(1);
ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
for (const auto VF : VFCandidates) {
// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);

Expand All @@ -7988,14 +8015,15 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
}

CM.collectInLoopReductions();
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
LLVM_DEBUG(printPlans(dbgs()));
if (!MaxFactors.hasVector())
return VectorizationFactor::Disabled();

// Select the optimal vectorization factor.
auto SelectedVF = CM.selectVectorizationFactor(MaxVF);
auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);

// Check if it is profitable to vectorize with runtime checks.
unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
Expand Down
4 changes: 3 additions & 1 deletion llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ if.then: ; preds = %for.body
for.inc: ; preds = %for.body, %if.then
%inc = add nuw nsw i64 %i.07, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !5
}


Expand Down Expand Up @@ -90,3 +90,5 @@ attributes #0 = { "target-features"="+neon,+sve" }
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
!5 = distinct !{!5, !6}
!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,13 @@
define void @test0(i32* %a, i8* %b, i32* %c) {
; CHECK: LV: Checking a loop in "test0"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 16
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: vscale x 16
entry:
br label %loop

Expand All @@ -39,9 +43,13 @@ exit:
define void @test1(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test1"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
entry:
br label %loop

Expand Down Expand Up @@ -69,9 +77,13 @@ exit:
define void @test2(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test2"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 2
; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 2
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
entry:
br label %loop

Expand Down Expand Up @@ -99,9 +111,13 @@ exit:
define void @test3(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test3"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 1
; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 1
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
entry:
br label %loop

Expand Down Expand Up @@ -129,9 +145,13 @@ exit:
define void @test4(i32* %a, i32* %b) {
; CHECK: LV: Checking a loop in "test4"
; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 4
entry:
br label %loop

Expand Down

0 comments on commit d41cb6b

Please sign in to comment.