Skip to content

Commit

Permalink
[LV] Remove redundant IV casts using VPlan (NFCI).
Browse files Browse the repository at this point in the history
This patch simplifies handling of redundant induction casts, by
removing dead cast instructions after initial VPlan construction.
This has the following benefits:

  1. fixes a crash
     (see @test_optimized_cast_induction_feeding_first_order_recurrence)
  2. Simplifies VPWidenIntOrFpInduction to a single-def recipes
  3. Retires recordVectorLoopValueForInductionCast.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D115112
  • Loading branch information
fhahn committed Dec 10, 2021
1 parent 2586c23 commit 505ad03
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 115 deletions.
109 changes: 15 additions & 94 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -510,7 +510,7 @@ class InnerLoopVectorizer {
/// the corresponding type.
void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
Value *Start, TruncInst *Trunc, VPValue *Def,
VPValue *CastDef, VPTransformState &State);
VPTransformState &State);

/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
Expand Down Expand Up @@ -621,7 +621,7 @@ class InnerLoopVectorizer {
/// can also be a truncate instruction.
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
const InductionDescriptor &ID, VPValue *Def,
VPValue *CastDef, VPTransformState &State);
VPTransformState &State);

/// Create a vector induction phi node based on an existing scalar one. \p
/// EntryVal is the value from the original loop that maps to the vector phi
Expand All @@ -631,7 +631,6 @@ class InnerLoopVectorizer {
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Value *Start,
Instruction *EntryVal, VPValue *Def,
VPValue *CastDef,
VPTransformState &State);

/// Returns true if an instruction \p I should be scalarized instead of
Expand All @@ -641,29 +640,6 @@ class InnerLoopVectorizer {
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;

/// If there is a cast involved in the induction variable \p ID, which should
/// be ignored in the vectorized loop body, this function records the
/// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
/// cast. We had already proved that the casted Phi is equal to the uncasted
/// Phi in the vectorized loop (under a runtime guard), and therefore
/// there is no need to vectorize the cast - the same value can be used in the
/// vector loop for both the Phi and the cast.
/// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
/// Otherwise, \p VectorLoopValue is a widened/vectorized value.
///
/// \p EntryVal is the value from the original loop that maps to the vector
/// phi node and is used to distinguish what is the IV currently being
/// processed - original one (if \p EntryVal is a phi corresponding to the
/// original IV) or the "newly-created" one based on the proof mentioned above
/// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
/// latter case \p EntryVal is a TruncInst and we must not record anything for
/// that IV, but it's error-prone to expect callers of this routine to care
/// about that, hence this explicit parameter.
void recordVectorLoopValueForInductionCast(
const InductionDescriptor &ID, const Instruction *EntryVal,
Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
unsigned Part, unsigned Lane = UINT_MAX);

/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);

Expand Down Expand Up @@ -2358,8 +2334,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {

void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
VPTransformState &State) {
Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");

Expand Down Expand Up @@ -2422,8 +2397,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(

if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
State, Part);

LastInduction = cast<Instruction>(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
Expand Down Expand Up @@ -2457,42 +2430,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
const InductionDescriptor &ID, const Instruction *EntryVal,
Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
unsigned Part, unsigned Lane) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");

// This induction variable is not the phi from the original loop but the
// newly-created IV based on the proof that casted Phi is equal to the
// uncasted Phi in the vectorized loop (under a runtime guard possibly). It
// re-uses the same InductionDescriptor that original IV uses but we don't
// have to do any recording in this case - that is done when original IV is
// processed.
if (isa<TruncInst>(EntryVal))
return;

if (!CastDef) {
assert(ID.getCastInsts().empty() &&
"there are casts for ID, but no CastDef");
return;
}
assert(!ID.getCastInsts().empty() &&
"there is a CastDef, but no casts for ID");
// Only the first Cast instruction in the Casts vector is of interest.
// The rest of the Casts (if exist) have no uses outside the
// induction update chain itself.
if (Lane < UINT_MAX)
State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
else
State.set(CastDef, VectorLoopVal, Part);
}

void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
const InductionDescriptor &ID,
Value *Start, TruncInst *Trunc,
VPValue *Def, VPValue *CastDef,
VPValue *Def,
VPTransformState &State) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
Expand Down Expand Up @@ -2558,8 +2499,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
State, Part);
}
};

Expand All @@ -2581,23 +2520,21 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// least one user in the loop that is not widened.
auto NeedsScalarIV = needsScalarInduction(EntryVal);
if (!NeedsScalarIV) {
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
State);
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
return;
}

// Try to create a new independent vector induction variable. If we can't
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (!shouldScalarizeInstruction(EntryVal)) {
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
State);
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
Value *ScalarIV = CreateScalarIV(Step);
// Create scalar steps that can be used by instructions we will later
// scalarize. Note that the addition of the scalar steps will not increase
// the number of instructions in the loop in the common case prior to
// InstCombine. We will be trading one vector extract for each scalar step.
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
return;
}

Expand All @@ -2607,7 +2544,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
Value *ScalarIV = CreateScalarIV(Step);
if (!Cost->isScalarEpilogueAllowed())
CreateSplatIV(ScalarIV, Step);
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}

Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
Expand Down Expand Up @@ -2661,7 +2598,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID,
VPValue *Def, VPValue *CastDef,
VPValue *Def,
VPTransformState &State) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF.isVector() && "VF should be greater than one");
Expand Down Expand Up @@ -2711,8 +2648,6 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
State.set(Def, Add, Part);
recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
Part);
// It's useful to record the lane values too for the known minimum number
// of elements so we do those below. This improves the code quality when
// trying to extract the first element, for example.
Expand All @@ -2732,8 +2667,6 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
State.set(Def, Add, VPIteration(Part, Lane));
recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
Part, Lane);
}
}
}
Expand Down Expand Up @@ -8063,18 +7996,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);

// We record as "Dead" also the type-casting instructions we had identified
// during induction analysis. We don't need any handling for them in the
// vectorized loop because we have proven that, under a proper runtime
// test guarding the vectorized loop, the value of the phi, and the casted
// value of the phi, are the same. The last instruction in this casting chain
// will get its scalar/vector/widened def from the scalar/vector/widened def
// of the respective phi node. Any other casts in the induction def-use chain
// have no other uses outside the phi update chain, and will be ignored.
const InductionDescriptor &IndDes = Induction.second;
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
DeadInstructions.insert(Casts.begin(), Casts.end());
}
}

Expand Down Expand Up @@ -8593,9 +8514,7 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
assert(II->getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
const SmallVectorImpl<Instruction *> &Casts = II->getCastInsts();
return new VPWidenIntOrFpInductionRecipe(
Phi, Operands[0], *II, Casts.empty() ? nullptr : Casts.front());
return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
}

return nullptr;
Expand Down Expand Up @@ -9235,6 +9154,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(

cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);

VPlanTransforms::removeRedundantInductionCasts(*Plan);

// Now that sink-after is done, move induction recipes for optimized truncates
// to the phi section of the header block.
for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
Expand Down Expand Up @@ -9715,9 +9636,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
State.ILV->widenIntOrFpInduction(
IV, getInductionDescriptor(), getStartValue()->getLiveInIRValue(),
getTruncInst(), getVPValue(0), getCastValue(), State);
State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
getStartValue()->getLiveInIRValue(),
getTruncInst(), getVPValue(0), State);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
Expand Down
27 changes: 6 additions & 21 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Expand Up @@ -1004,29 +1004,21 @@ class VPWidenGEPRecipe : public VPRecipeBase, public VPValue {

/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector and scalar values.
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
PHINode *IV;
const InductionDescriptor &IndDesc;

public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
const InductionDescriptor &IndDesc,
Instruction *Cast = nullptr)
: VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV),
IndDesc(IndDesc) {
new VPValue(IV, this);

if (Cast)
new VPValue(Cast, this);
}
const InductionDescriptor &IndDesc)
: VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
IV(IV), IndDesc(IndDesc) {}

VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
const InductionDescriptor &IndDesc,
TruncInst *Trunc)
: VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV),
IndDesc(IndDesc) {
new VPValue(Trunc, this);
}
: VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
IV(IV), IndDesc(IndDesc) {}

~VPWidenIntOrFpInductionRecipe() override = default;

Expand All @@ -1048,13 +1040,6 @@ class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
/// Returns the start value of the induction.
VPValue *getStartValue() { return getOperand(0); }

/// Returns the cast VPValue, if one is attached, or nullptr otherwise.
VPValue *getCastValue() {
if (getNumDefinedValues() != 2)
return nullptr;
return getVPValue(1);
}

/// Returns the first defined value as TruncInst, if it is one or nullptr
/// otherwise.
TruncInst *getTruncInst() {
Expand Down
32 changes: 32 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Expand Up @@ -294,3 +294,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
delete ToDelete;
return Changed;
}

void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove;
for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) {
auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
if (!IV || IV->getTruncInst())
continue;

// Visit all casts connected to IV and in Casts. Collect them.
// remember them for removal.
auto &Casts = IV->getInductionDescriptor().getCastInsts();
VPValue *FindMyCast = IV;
for (Instruction *IRCast : reverse(Casts)) {
VPRecipeBase *FoundUserCast = nullptr;
for (auto *U : FindMyCast->users()) {
auto *UserCast = cast<VPRecipeBase>(U);
if (UserCast->getNumDefinedValues() == 1 &&
UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) {
FoundUserCast = UserCast;
break;
}
}
assert(FoundUserCast && "Missing a cast to remove");
CastsToRemove.emplace_back(FoundUserCast, IV);
FindMyCast = FoundUserCast->getVPSingleValue();
}
}
for (auto &E : CastsToRemove) {
E.first->getVPSingleValue()->replaceAllUsesWith(E.second);
E.first->eraseFromParent();
}
}
8 changes: 8 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Expand Up @@ -37,6 +37,14 @@ struct VPlanTransforms {
static bool sinkScalarOperands(VPlan &Plan);

static bool mergeReplicateRegions(VPlan &Plan);

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan);
};

} // namespace llvm
Expand Down
38 changes: 38 additions & 0 deletions llvm/test/Transforms/LoopVectorize/induction.ll
Expand Up @@ -933,3 +933,41 @@ loop:
exit:
ret void
}

; Test case where %iv.2.ext and %iv.2.conv become redundant due to the SCEV
; predicates generated for the vector loop. They should be removed in the
; vector loop.
define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n, i32 %step, i32* %ptr) {
; CHECK-LABEL: @test_optimized_cast_induction_feeding_first_order_recurrence(
; CHECK-LABEL: vector.body:
; CHECK-NEXT: [[MAIN_IV:%.+]] = phi i64 [ 0, %vector.ph ], [ [[MAIN_IV_NEXT:%.+]], %vector.body ]
; CHECK-NEXT: [[VEC_RECUR:%.+]] = phi <2 x i32> [ <i32 poison, i32 0>, %vector.ph ], [ [[VEC_IV:%.+]], %vector.body ]
; CHECK-NEXT: [[VEC_IV]] = phi <2 x i32> [ %induction, %vector.ph ], [ [[VEC_IV_NEXT:%.+]], %vector.body ]
; CHECK-NEXT: [[MAIN_IV_0:%.+]] = add i64 [[MAIN_IV]], 0
; CHECK-NEXT: [[RECUR_SHUFFLE:%.+]] = shufflevector <2 x i32> [[VEC_RECUR]], <2 x i32> [[VEC_IV]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, i32* %ptr, i64 [[MAIN_IV_0]]
; CHECK-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 0
; CHECK-NEXT: [[GEP_CAST:%.+]] = bitcast i32* [[GEP1]] to <2 x i32>*
; CHECK-NEXT: store <2 x i32> [[RECUR_SHUFFLE]], <2 x i32>* [[GEP_CAST]], align 4
; CHECK-NEXT: [[MAIN_IV_NEXT]] = add nuw i64 [[MAIN_IV]], 2
; CHECK-NEXT: [[VEC_IV_NEXT]] = add <2 x i32> [[VEC_IV]],
;
entry:
br label %loop

loop:
%for = phi i32 [ 0, %entry ], [ %iv.2.conv, %loop ]
%iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ]
%iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
%iv.2.ext = shl i32 %iv.2, 24
%iv.2.conv = ashr exact i32 %iv.2.ext, 24
%gep = getelementptr inbounds i32, i32* %ptr, i64 %iv.1
store i32 %for, i32* %gep, align 4
%iv.2.next = add nsw i32 %iv.2.conv, %step
%iv.1.next = add nuw nsw i64 %iv.1, 1
%exitcond = icmp eq i64 %iv.1.next, %n
br i1 %exitcond, label %exit, label %loop

exit:
ret void
}

0 comments on commit 505ad03

Please sign in to comment.