Skip to content

Commit

Permalink
[LoopVectorize] Change PredicatedBBsAfterVectorization to be per VF
Browse files Browse the repository at this point in the history
When calculating the cost of Instruction::Br in getInstructionCost
we query PredicatedBBsAfterVectorization to see if there is a
scalar predicated block. However, this meant that the decisions
being made for a given fixed-width VF were affecting the cost for a
scalable VF. As a result we were returning InstructionCost::Invalid
pointlessly for a scalable VF that should have a low cost. I
encountered this for some loops when enabling tail-folding for
scalable VFs.

Test added here:

  Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll

Differential Revision: https://reviews.llvm.org/D128272
  • Loading branch information
david-arm committed Jul 12, 2022
1 parent ded6241 commit 6b694d6
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 4 deletions.
11 changes: 7 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -1677,7 +1677,8 @@ class LoopVectorizationCostModel {

/// A set containing all BasicBlocks that are known to be present after
/// vectorization as a predicated block.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
PredicatedBBsAfterVectorization;

/// Records whether it is allowed to have the original scalar loop execute at
/// least once. This may be needed as a fallback loop in case runtime
Expand Down Expand Up @@ -6088,6 +6089,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
// map will indicate that we've analyzed it already.
ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

PredicatedBBsAfterVectorization[VF].clear();

// Find all the instructions that are scalar with predication in the loop and
// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.
Expand All @@ -6105,7 +6108,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
PredicatedBBsAfterVectorization.insert(BB);
PredicatedBBsAfterVectorization[VF].insert(BB);
}
}
}
Expand Down Expand Up @@ -6970,8 +6973,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
ScalarPredicatedBB = true;

if (ScalarPredicatedBB) {
Expand Down
@@ -0,0 +1,35 @@
; RUN: opt -S -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize <%s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

; The uniform load of %d in the following loop triggers the special
; branch costing code in LoopVectorizationCostModel::getInstructionCost.
; However, this should only affect the fixed-width cost because for
; NEON it needs to scalarize the load, whereas for SVE it can use a predicated load.
; Because of how the LoopVectorizer annotates the load to need scalarization with
; predicated blocks, this leads to different costs for the branch instruction.
;
; NOTE: This test assumes we will never use a fixed-width VF due to
; the high cost of scalarizing the masked store. However, this assumption may
; break in the future if we permit the use of SVE loads and stores to perform
; the fixed-width operations.
; Loop under test: each iteration reloads the float at the loop-invariant
; address %d and stores it to %c[i]. The index i starts at 1, is incremented
; by 1 per iteration, and the loop exits once the incremented index equals %n.
define i32 @uniform_load(i64 %n, ptr readnone %c, ptr %d) #0 {
; CHECK-LABEL: @uniform_load(
; The expected output must contain a masked store of a scalable
; <vscale x 4 x float> vector.
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
; Induction variable: 1 on entry from %entry, otherwise the incremented value.
%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
; The load address %d is a function argument and does not depend on the
; induction variable, so the same location is read on every iteration.
%load2 = load float, ptr %d, align 4
; Store the loaded value to element %indvars.iv of %c.
%arrayidx2 = getelementptr inbounds float, ptr %c, i64 %indvars.iv
store float %load2, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; Leave the loop when the incremented index equals %n; otherwise iterate again.
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body
ret i32 0
}

attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

0 comments on commit 6b694d6

Please sign in to comment.