Skip to content

Commit

Permalink
[LV] Don't optimize exit cond during epilogue vectorization.
Browse files Browse the repository at this point in the history
At the moment, the same VPlan can be used code generation of both the
main vector and epilogue vector loop. This can lead to wrong results, if
the plan is optimized based on the VF of the main vector loop and then
re-used for the epilogue loop.

One example where this is problematic is if the scalar loops need to
execute at least one iteration, e.g. due to interleave groups.

To prevent mis-compiles in the short-term, disable optimizing exit
conditions for VPlans when using epilogue vectorization. The proper fix
is to avoid re-using the same plan for both loops, which will require
support for cloning plans first.

Fixes #56319.
  • Loading branch information
fhahn committed Jul 1, 2022
1 parent b15b142 commit 0dddf04
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 26 deletions.
6 changes: 5 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Expand Up @@ -303,8 +303,12 @@ class LoopVectorizationPlanner {

/// Generate the IR code for the body of the vectorized loop according to the
/// best selected \p VF, \p UF and VPlan \p BestPlan.
/// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
/// vectorization re-using plans for both the main and epilogue vector loops.
/// It should be removed once the re-use issue has been fixed.
void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
InnerLoopVectorizer &LB, DominatorTree *DT);
InnerLoopVectorizer &LB, DominatorTree *DT,
bool IsEpilogueVectorization);

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printPlans(raw_ostream &O);
Expand Down
16 changes: 9 additions & 7 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -7520,7 +7520,8 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
VPlan &BestVPlan,
InnerLoopVectorizer &ILV,
DominatorTree *DT) {
DominatorTree *DT,
bool IsEpilogueVectorization) {
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
<< '\n');

Expand Down Expand Up @@ -7564,7 +7565,8 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// 2. Copy and widen instructions from the old loop into the new loop.
BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
ILV.getOrCreateVectorTripCount(nullptr),
CanonicalIVStartValue, State);
CanonicalIVStartValue, State,
IsEpilogueVectorization);

BestVPlan.execute(&State);

Expand Down Expand Up @@ -10155,7 +10157,7 @@ static bool processLoopInVPlanNativePath(
&CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}

// Mark the loop as already vectorized to avoid vectorizing again.
Expand Down Expand Up @@ -10499,7 +10501,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
&CM, BFI, PSI, Checks);

VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
Expand All @@ -10524,7 +10526,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {

VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
DT);
DT, true);
++LoopsVectorized;

// Second pass vectorizes the epilogue and adjusts the control flow
Expand Down Expand Up @@ -10553,7 +10555,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}

LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT);
DT, true);
++LoopsEpilogueVectorized;

if (!MainILV.areSafetyChecksAdded())
Expand All @@ -10563,7 +10565,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
&LVL, &CM, BFI, PSI, Checks);

VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;

// Add metadata to disable runtime unrolling a scalar loop when there
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Expand Up @@ -546,13 +546,14 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,

void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
Value *CanonicalIVStartValue,
VPTransformState &State) {
VPTransformState &State,
bool IsEpilogueVectorization) {

VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
// Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when
// preparing to execute the plan for the main vector loop.
if (!CanonicalIVStartValue && Term &&
if (!IsEpilogueVectorization && Term &&
Term->getOpcode() == VPInstruction::BranchOnCount &&
isa<ConstantInt>(TripCountV)) {
ConstantInt *C = cast<ConstantInt>(TripCountV);
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
Expand Up @@ -2511,7 +2511,8 @@ class VPlan {

/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
Value *CanonicalIVStartValue, VPTransformState &State);
Value *CanonicalIVStartValue, VPTransformState &State,
bool IsEpilogueVectorization);

/// Generate the IR code for this VPlan.
void execute(struct VPTransformState *State);
Expand Down
20 changes: 14 additions & 6 deletions llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
Expand Up @@ -9,18 +9,26 @@
define zeroext i8 @sum() {
; CHECK-LABEL: @sum(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 0
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 64
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <64 x i8>*
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP3]], align 16
; CHECK-NEXT: [[TMP4:%.*]] = add <64 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = add <64 x i8> [[WIDE_LOAD2]], zeroinitializer
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 128
; CHECK-NEXT: [[TMP4]] = add <64 x i8> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP5]] = add <64 x i8> [[WIDE_LOAD2]], [[VEC_PHI1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
; CHECK-NEXT: ret i8 [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
; CHECK-NEXT: ret i8 [[TMP7]]
;
entry:
br label %for.body
Expand Down
Expand Up @@ -26,7 +26,8 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-NEXT: store <32 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
Expand All @@ -36,16 +37,17 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP5]], i64 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <24 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP6]], i64 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <24 x i8>, ptr [[TMP8]], align 1
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC2]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC3]], ptr [[TMP9]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC3]], ptr [[TMP10]], align 1
; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 24
; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
Expand Down

0 comments on commit 0dddf04

Please sign in to comment.