Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LoopVectorize] Change PredicatedBBsAfterVectorization to be per VF
When calculating the cost of Instruction::Br in getInstructionCost, we query PredicatedBBsAfterVectorization to see if there is a scalar predicated block. However, this meant that the decisions made for a given fixed-width VF were affecting the cost for a scalable VF. As a result, we were unnecessarily returning InstructionCost::Invalid for a scalable VF that should have a low cost. I encountered this for some loops when enabling tail-folding for scalable VFs. Test added here: Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll. Differential Revision: https://reviews.llvm.org/D128272
- Loading branch information
Showing
2 changed files
with
42 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
35 changes: 35 additions & 0 deletions
35
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
; RUN: opt -S -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize <%s | FileCheck %s | ||
|
||
target triple = "aarch64-unknown-linux-gnu" | ||
|
||
; The uniform load of %d in the following loop triggers the special | ||
; branch costing code in LoopVectorizationCostModel::getInstructionCost. | ||
; However, this should only affect the fixed-width cost because for | ||
; NEON it needs to scalarize the load, whereas for SVE it can use a predicated load. | ||
; Because of how the LoopVectorizer annotates the load to need scalarization with | ||
; predicated blocks, this leads to different costs for the branch instruction. | ||
; | ||
; NOTE: This test assumes we will never use a fixed-width VF due to | ||
; the high cost of scalarizing the masked store. However, this assumption may | ||
; break in the future if we permit the use of SVE loads and stores to perform the | ||
; fixed-width operations. | ||
; Test function: every iteration loads a float from %d and stores it to | ||
; %c[%indvars.iv]. The induction variable starts at 1 and the loop exits once | ||
; %indvars.iv.next equals %n. The function always returns 0. | ||
; NOTE(review): %c is marked readnone, yet the loop stores through a GEP based | ||
; on it -- confirm the attribute is intentional for this test. | ||
define i32 @uniform_load(i64 %n, ptr readnone %c, ptr %d) #0 { | ||
; The two directives below require the output to contain a masked store of | ||
; <vscale x 4 x float>, i.e. the store was vectorized with a scalable type. | ||
; CHECK-LABEL: @uniform_load( | ||
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> | ||
entry: | ||
br label %for.body | ||
|
||
for.body: ; preds = %entry, %for.body | ||
%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ] | ||
; The load address %d does not depend on the induction variable, so the same | ||
; location is read on every iteration. | ||
%load2 = load float, ptr %d, align 4 | ||
%arrayidx2 = getelementptr inbounds float, ptr %c, i64 %indvars.iv | ||
store float %load2, ptr %arrayidx2, align 4 | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
; Exit when the incremented induction variable reaches %n. | ||
%exitcond.not = icmp eq i64 %indvars.iv.next, %n | ||
br i1 %exitcond.not, label %for.end, label %for.body | ||
|
||
for.end: ; preds = %for.body | ||
ret i32 0 | ||
} | ||
|
||
attributes #0 = { vscale_range(1,16) "target-features"="+sve" } |