Skip to content

Commit

Permalink
[VPlan] Model first exit values using VPLiveOut.
Browse files Browse the repository at this point in the history
This patch introduces a new VPLiveOut subclass of VPUser to model
exit values explicitly. The initial version handles exit values that
are part of neither induction chains nor reduction chains, and that are
not first-order recurrence phis.

Fixes #51366, #54867, #55167, #55459

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D123537
  • Loading branch information
fhahn committed May 21, 2022
1 parent a84896f commit 3bebec6
Show file tree
Hide file tree
Showing 16 changed files with 245 additions and 120 deletions.
98 changes: 50 additions & 48 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -567,7 +567,8 @@ class InnerLoopVectorizer {
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *VectorTripCount, Value *EndValue,
BasicBlock *MiddleBlock, BasicBlock *VectorHeader);
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
VPlan &Plan);

/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
Expand All @@ -584,13 +585,6 @@ class InnerLoopVectorizer {
void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
VPTransformState &State);

/// Fixup the LCSSA phi nodes in the unique exit block. This simply
/// means we need to add the appropriate incoming value from the middle
/// block as exiting edges from the scalar epilogue loop (if present) are
/// already in place, and we exit the vector loop exclusively to the middle
/// block.
void fixLCSSAPHIs(VPTransformState &State);

/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
Expand Down Expand Up @@ -3335,7 +3329,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
const InductionDescriptor &II,
Value *VectorTripCount, Value *EndValue,
BasicBlock *MiddleBlock,
BasicBlock *VectorHeader) {
BasicBlock *VectorHeader, VPlan &Plan) {
// There are two kinds of external IV usages - those that use the value
// computed in the last iteration (the PHI) and those that use the penultimate
// value (the value that feeds into the phi from the loop latch).
Expand Down Expand Up @@ -3395,8 +3389,10 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// In this case, if IV1 has an external use, we need to avoid adding both
// "last value of IV1" and "penultimate value of IV2". So, verify that we
// don't already have an incoming value for the middle block.
if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
PHI->addIncoming(I.second, MiddleBlock);
Plan.removeLiveOut(PHI);
}
}
}

Expand Down Expand Up @@ -3700,20 +3696,30 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,

VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
// If we inserted an edge from the middle block to the unique exit block,
// update uses outside the loop (phis) to account for the newly inserted
// edge.
if (!Cost->requiresScalarEpilogue(VF)) {
if (Cost->requiresScalarEpilogue(VF)) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
Plan.clearLiveOuts();
} else {
// If we inserted an edge from the middle block to the unique exit block,
// update uses outside the loop (phis) to account for the newly inserted
// edge.

// Fix-up external users of the induction variables.
for (auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
IVEndValues[Entry.first], LoopMiddleBlock,
VectorLoop->getHeader());

fixLCSSAPHIs(State);
VectorLoop->getHeader(), Plan);
}

// Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
// in the exit block, so update the builder.
State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
for (auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);

for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);

Expand Down Expand Up @@ -3862,8 +3868,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(
// and thus no phis which needed updating.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
State.Plan->removeLiveOut(&LCSSAPhi);
}
}

void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
Expand Down Expand Up @@ -4046,8 +4054,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// fixFirstOrderRecurrence for a more complete explanation of the logic.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
State.Plan->removeLiveOut(&LCSSAPhi);
}

// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
Expand Down Expand Up @@ -4092,35 +4102,6 @@ void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
}
}

/// Give every LCSSA phi in the loop exit block an incoming value for the edge
/// coming from the middle block, unless it already has one.
void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
  for (PHINode &ExitPhi : LoopExitBlock->phis()) {
    // Phis that already carry a middle-block entry were patched by the
    // reduction / recurrence fix-ups and must be left alone.
    const bool AlreadyFixed =
        ExitPhi.getBasicBlockIndex(LoopMiddleBlock) != -1;
    if (AlreadyFixed)
      continue;

    Value *Incoming = ExitPhi.getIncomingValue(0);

    // Only an instruction that is not uniform after vectorization needs its
    // last lane; anything else (including non-instruction values, which have
    // a single value) is read from the first lane.
    auto *IncomingInst = dyn_cast<Instruction>(Incoming);
    const bool NeedsLastLane =
        IncomingInst && !Cost->isUniformAfterVectorization(IncomingInst, VF);
    VPLane Lane = NeedsLastLane ? VPLane::getLastLaneForVF(VF)
                                : VPLane::getFirstLane();

    // The value is either loop invariant (reused as is) or the last scalar
    // value, which has to be extracted from the vectorized loop.
    // FIXME: Should not rely on getVPValue at this point.
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Value *MiddleValue = Incoming;
    if (!OrigLoop->isLoopInvariant(Incoming))
      MiddleValue = State.get(State.Plan->getVPValue(Incoming, true),
                              VPIteration(UF - 1, Lane));
    ExitPhi.addIncoming(MiddleValue, LoopMiddleBlock);
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
Expand Down Expand Up @@ -8716,6 +8697,25 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
EB->appendRecipe(BranchOnCount);
}

// Add exit values to \p Plan. A VPLiveOut is added for each LCSSA phi in the
// original loop's exit block, using the value the phi receives from the exit
// block's single predecessor as the live-out operand.
//
// Nothing is added when \p OrigLoop has no unique exit block or when that
// block has more than one predecessor. \p HeaderVPBB and \p MiddleVPBB are
// currently unused.
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
                                VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
                                VPlan &Plan) {
  BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
  // Only handle single-exit loops with unique exit blocks for now.
  if (!ExitBB)
    return;

  // The exit phis have exactly one incoming block: ExitBB's sole predecessor.
  // Query the phis with that block rather than with the loop latch, so the
  // lookup is valid even if the latch is not the block that branches to
  // ExitBB.
  BasicBlock *ExitingBB = ExitBB->getSinglePredecessor();
  if (!ExitingBB)
    return;

  // Introduce VPUsers modeling the exit values.
  for (PHINode &ExitPhi : ExitBB->phis()) {
    Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB);
    VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
    Plan.addLiveOut(&ExitPhi, V);
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const MapVector<Instruction *, Instruction *> &SinkAfter) {
Expand Down Expand Up @@ -8895,6 +8895,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// After here, VPBB should not be used.
VPBB = nullptr;

addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);

assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
"entry block must be set to a VPRegionBlock having a non-empty entry "
Expand Down
25 changes: 25 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Expand Up @@ -647,6 +647,15 @@ bool VPRecipeBase::mayHaveSideEffects() const {
}
}

/// Add the live-out operand as an incoming value of the wrapped phi, for the
/// block \p State's builder is currently inserting into. The value is taken
/// from the last unrolled part; the first lane is used when \p Plan reports
/// the operand as uniform after vectorization, the last lane otherwise.
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *LiveOutOp = getOperand(0);

  const bool IsUniform = Plan.isUniformAfterVectorization(LiveOutOp);
  VPLane ExtractLane = IsUniform ? VPLane::getFirstLane()
                                 : VPLane::getLastLaneForVF(State.VF);

  Value *Incoming =
      State.get(LiveOutOp, VPIteration(State.UF - 1, ExtractLane));
  Phi->addIncoming(Incoming, State.Builder.GetInsertBlock());
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
assert(!Parent && "Recipe already in some VPBasicBlock");
assert(InsertPos->getParent() &&
Expand Down Expand Up @@ -1065,6 +1074,17 @@ void VPlan::print(raw_ostream &O) const {
O << '\n';
Block->print(O, "", SlotTracker);
}

if (!LiveOuts.empty())
O << "\n";
for (auto &KV : LiveOuts) {
O << "Live-out ";
KV.second->getPhi()->printAsOperand(O);
O << " = ";
KV.second->getOperand(0)->printAsOperand(O, SlotTracker);
O << "\n";
}

O << "}\n";
}

Expand All @@ -1078,6 +1098,11 @@ LLVM_DUMP_METHOD
void VPlan::dump() const { print(dbgs()); }
#endif

/// Register \p V as the value leaving the plan through the exit phi \p PN.
/// The VPLiveOut created here is heap-allocated and stored in LiveOuts; at
/// most one live-out may be registered per phi.
void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
  assert(LiveOuts.find(PN) == LiveOuts.end() &&
         "an exit value for PN already exists");
  auto *LiveOut = new VPLiveOut(PN, V);
  LiveOuts.insert(std::make_pair(PN, LiveOut));
}

void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
BasicBlock *LoopLatchBB,
BasicBlock *LoopExitBB) {
Expand Down
48 changes: 48 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Expand Up @@ -664,6 +664,32 @@ class VPBlockBase {
#endif
};

/// Models a use, outside the VPlan, of a value defined inside it. A VPLiveOut
/// is a VPUser with a single operand (the live-out value) and wraps the LCSSA
/// phi node that has to receive that value.
class VPLiveOut : public VPUser {
public:
  VPLiveOut(PHINode *Phi, VPValue *Op)
      : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}

  /// Fix up the wrapped LCSSA phi node in the unique exit block by adding the
  /// appropriate incoming value from the middle block. Exiting edges from the
  /// scalar epilogue loop (if present) are already in place, and the vector
  /// loop is left exclusively through the middle block.
  void fixPhi(VPlan &Plan, VPTransformState &State);

  /// Returns true if the VPLiveOut uses scalars of operand \p Op, which is
  /// always the case. \p Op must be an operand of this live-out.
  bool usesScalars(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  /// The LCSSA phi node this live-out feeds.
  PHINode *getPhi() const { return Phi; }

private:
  /// Phi node outside the plan that consumes the operand.
  PHINode *Phi;
};

/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
Expand Down Expand Up @@ -2483,13 +2509,18 @@ class VPlan {
/// mapping cannot be used any longer, because it is stale.
bool Value2VPValueEnabled = true;

/// Values used outside the plan.
DenseMap<PHINode *, VPLiveOut *> LiveOuts;

public:
/// Create a plan whose entry block is \p Entry. A plan may be created without
/// an entry; when one is given, it is told that it now belongs to this plan.
VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
  if (Entry != nullptr)
    Entry->setPlan(this);
}

~VPlan() {
clearLiveOuts();

if (Entry) {
VPValue DummyValue;
for (VPBlockBase *Block : depth_first(Entry))
Expand Down Expand Up @@ -2658,6 +2689,23 @@ class VPlan {
return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
}

void addLiveOut(PHINode *PN, VPValue *V);

void clearLiveOuts() {
for (auto &KV : LiveOuts)
delete KV.second;
LiveOuts.clear();
}

/// Destroy the VPLiveOut registered for \p PN, if any, and drop its entry.
/// Calling this for a phi without a registered live-out is a no-op.
void removeLiveOut(PHINode *PN) {
  // Look the entry up once. Using operator[] here would first insert a null
  // entry for an unknown phi, only to erase it again with a second lookup.
  auto It = LiveOuts.find(PN);
  if (It == LiveOuts.end())
    return;
  delete It->second;
  LiveOuts.erase(It);
}

/// Read-only access to the live-outs currently registered with the plan,
/// keyed by the exit phi each one feeds.
// NOTE(review): the container is a DenseMap keyed by pointer, so its
// iteration order is not guaranteed to be stable between runs -- confirm that
// callers emitting IR while iterating do not depend on the order.
const DenseMap<PHINode *, VPLiveOut *> &getLiveOuts() const {
  return LiveOuts;
}

private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
Expand Down
41 changes: 3 additions & 38 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Expand Up @@ -363,50 +363,15 @@ void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
}
}

// Returns true if \p I has a live-out user that is currently not modeled in
// VPlan, i.e. a user outside \p OrigLoop, possibly reached through
// single-value phis inside the loop.
// Note that exit values of inductions are generated independent of
// the recipe. This means VPWidenIntOrFpInductionRecipe &
// VPScalarIVStepsRecipe can be removed, independent of uses outside
// the loop.
// TODO: Remove once live-outs are modeled in VPlan.
static bool hasOutsideUser(Instruction &I, Loop &OrigLoop) {
  for (User *U : I.users()) {
    // A user outside the loop settles the question immediately.
    if (!OrigLoop.contains(cast<Instruction>(U)))
      return true;

    // Single-value phis in the loop are not modeled in VPlan and may be used
    // outside the loop, so look through them.
    auto *PN = dyn_cast<PHINode>(U);
    if (PN && PN->getNumIncomingValues() == 1 &&
        hasOutsideUser(*PN, OrigLoop))
      return true;
  }
  return false;
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan, Loop &OrigLoop) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// Check if \p R is used outside the loop, if required.
// TODO: Remove once live-outs are modeled in VPlan.
auto HasUsersOutsideLoop = [&OrigLoop](VPRecipeBase &R) {
// Exit values for induction recipes are generated independent of the
// recipes, except for truncated inductions. Hence there is no need to check
// for users outside the loop for them.
if (isa<VPScalarIVStepsRecipe>(&R) ||
(isa<VPWidenIntOrFpInductionRecipe>(&R) &&
!isa<TruncInst>(R.getUnderlyingInstr())))
return false;
return R.getUnderlyingInstr() &&
hasOutsideUser(*R.getUnderlyingInstr(), OrigLoop);
};
// Remove dead recipes in header block. The recipes in the block are processed
// in reverse order, to catch chains of dead recipes.
// TODO: Remove dead recipes across whole plan.
for (VPRecipeBase &R : make_early_inc_range(reverse(*Header))) {
if (R.mayHaveSideEffects() ||
any_of(R.definedValues(),
[](VPValue *V) { return V->getNumUsers() > 0; }) ||
HasUsersOutsideLoop(R))
if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) {
return V->getNumUsers() > 0;
}))
continue;
R.eraseFromParent();
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanValue.h
Expand Up @@ -208,6 +208,7 @@ class VPUser {
/// Subclass identifier (for isa/dyn_cast).
enum class VPUserID {
Recipe,
LiveOut,
// TODO: Currently VPUsers are used in VPBlockBase, but in the future the
// only VPUsers should either be recipes or live-outs.
Block
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
Expand Up @@ -202,5 +202,12 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
return false;
}
}

for (auto &KV : Plan.getLiveOuts())
if (KV.second->getNumOperands() != 1) {
errs() << "live outs must have a single operand\n";
return false;
}

return true;
}
Expand Up @@ -52,11 +52,11 @@ define void @test_no_scalarization(i64* %a, i32 %idx, i32 %n) #0 {
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 2
; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i64*> [[TMP12]], i32 [[TMP22]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[L_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IDX]], [[L_ENTRY:%.*]] ]
Expand Down

0 comments on commit 3bebec6

Please sign in to comment.