-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Replace VPRecipeOrVPValue with VP2VP recipe simplification. #76090
Conversation
Move simplification of VPBlendRecipes from early VPlan construction to VPlan-to-VPlan based recipe simplification. This simplifies initial construction. Note that some in-loop reduction tests are failing at the moment, due to the reduction predicate being created after the reduction recipe. I will provide a patch for that soon.
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesMove simplification of VPBlendRecipes from early VPlan construction to VPlan-to-VPlan based recipe simplification. This simplifies initial construction. Note that some in-loop reduction tests are failing at the moment, due to the reduction predicate being created after the reduction recipe. I will provide a patch for that soon. Full diff: https://github.com/llvm/llvm-project/pull/76090.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f82e161fb846d1..609a927d23754b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8256,31 +8256,10 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
return nullptr;
}
-VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
- ArrayRef<VPValue *> Operands,
- VPlanPtr &Plan) {
- // If all incoming values are equal, the incoming VPValue can be used directly
- // instead of creating a new VPBlendRecipe.
- if (llvm::all_equal(Operands))
- return Operands[0];
-
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
+ ArrayRef<VPValue *> Operands,
+ VPlanPtr &Plan) {
unsigned NumIncoming = Phi->getNumIncomingValues();
- // For in-loop reductions, we do not need to create an additional select.
- VPValue *InLoopVal = nullptr;
- for (unsigned In = 0; In < NumIncoming; In++) {
- PHINode *PhiOp =
- dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
- if (PhiOp && CM.isInLoopReduction(PhiOp)) {
- assert(!InLoopVal && "Found more than one in-loop reduction!");
- InLoopVal = Operands[In];
- }
- }
-
- assert((!InLoopVal || NumIncoming == 2) &&
- "Found an in-loop reduction for PHI with unexpected number of "
- "incoming values");
- if (InLoopVal)
- return Operands[Operands[0] == InLoopVal ? 1 : 0];
// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
@@ -8292,13 +8271,13 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
for (unsigned In = 0; In < NumIncoming; In++) {
VPValue *EdgeMask =
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
- assert((EdgeMask || NumIncoming == 1) &&
+ assert((EdgeMask || NumIncoming == 1 || Operands[In] == Operands[0]) &&
"Multiple predecessors with one having a full mask");
OperandsWithMask.push_back(Operands[In]);
if (EdgeMask)
OperandsWithMask.push_back(EdgeMask);
}
- return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
+ return new VPBlendRecipe(Phi, OperandsWithMask);
}
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
@@ -8464,9 +8443,8 @@ void VPRecipeBuilder::fixHeaderPhis() {
}
}
-VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
- VFRange &Range,
- VPlan &Plan) {
+VPRecipeBase *VPRecipeBuilder::handleReplication(Instruction *I, VFRange &Range,
+ VPlan &Plan) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
@@ -8518,14 +8496,12 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
IsUniform, BlockInMask);
- return toVPRecipeResult(Recipe);
+ return Recipe;
}
-VPRecipeOrVPValueTy
-VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
- ArrayRef<VPValue *> Operands,
- VFRange &Range, VPBasicBlock *VPBB,
- VPlanPtr &Plan) {
+VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
+ Instruction *Instr, ArrayRef<VPValue *> Operands, VFRange &Range,
+ VPBasicBlock *VPBB, VPlanPtr &Plan) {
// First, check for specific widening recipes that deal with inductions, Phi
// nodes, calls and memory operations.
VPRecipeBase *Recipe;
@@ -8538,7 +8514,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
recordRecipeOf(Phi);
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
- return toVPRecipeResult(Recipe);
+ return Recipe;
VPHeaderPHIRecipe *PhiRecipe = nullptr;
assert((Legal->isReductionVariable(Phi) ||
@@ -8570,13 +8546,13 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
recordRecipeOf(Inc);
PhisToFix.push_back(PhiRecipe);
- return toVPRecipeResult(PhiRecipe);
+ return PhiRecipe;
}
if (isa<TruncInst>(Instr) &&
(Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
Range, *Plan)))
- return toVPRecipeResult(Recipe);
+ return Recipe;
// All widen recipes below deal only with VF > 1.
if (LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8584,29 +8560,29 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return nullptr;
if (auto *CI = dyn_cast<CallInst>(Instr))
- return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
+ return tryToWidenCall(CI, Operands, Range, Plan);
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+ return tryToWidenMemory(Instr, Operands, Range, Plan);
if (!shouldWiden(Instr, Range))
return nullptr;
if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
- return toVPRecipeResult(new VPWidenGEPRecipe(
- GEP, make_range(Operands.begin(), Operands.end())));
+ return new VPWidenGEPRecipe(GEP,
+ make_range(Operands.begin(), Operands.end()));
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
- return toVPRecipeResult(new VPWidenSelectRecipe(
- *SI, make_range(Operands.begin(), Operands.end())));
+ return new VPWidenSelectRecipe(
+ *SI, make_range(Operands.begin(), Operands.end()));
}
if (auto *CI = dyn_cast<CastInst>(Instr)) {
- return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
- CI->getType(), *CI));
+ return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
+ *CI);
}
- return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
+ return tryToWiden(Instr, Operands, VPBB, Plan);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8786,22 +8762,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
continue;
- auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
+ VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(
Instr, Operands, Range, VPBB, Plan);
- if (!RecipeOrValue)
- RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
- // If Instr can be simplified to an existing VPValue, use it.
- if (isa<VPValue *>(RecipeOrValue)) {
- auto *VPV = cast<VPValue *>(RecipeOrValue);
- Plan->addVPValue(Instr, VPV);
- // If the re-used value is a recipe, register the recipe for the
- // instruction, in case the recipe for Instr needs to be recorded.
- if (VPRecipeBase *R = VPV->getDefiningRecipe())
- RecipeBuilder.setRecipe(Instr, R);
- continue;
- }
- // Otherwise, add the new recipe.
- VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
+ if (!Recipe)
+ Recipe = RecipeBuilder.handleReplication(Instr, Range, *Plan);
for (auto *Def : Recipe->definedValues()) {
auto *UV = Def->getUnderlyingValue();
Plan->addVPValue(UV, Def);
@@ -8966,10 +8930,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
}
// Adjust the recipes for reductions. For in-loop reductions the chain of
-// instructions leading from the loop exit instr to the phi need to be converted
-// to reductions, with one operand being vector and the other being the scalar
-// reduction chain. For other reductions, a select is introduced between the phi
-// and live-out recipes when folding the tail.
+// instructions leading from the loop exit instr to the phi need to be
+// converted to reductions, with one operand being vector and the other
+// being the scalar reduction chain. For other reductions, a select is
+// introduced between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF) {
@@ -9079,6 +9043,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
VecOp = FMulRecipe;
} else {
+ if (PhiR->isInLoop() && isa<VPBlendRecipe>(CurrentLink))
+ continue;
+
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
if (isa<VPWidenRecipe>(CurrentLink)) {
assert(isa<CmpInst>(CurrentLinkI) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a09089e..0d8d7fc6f81770 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -21,8 +21,6 @@ class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class TargetLibraryInfo;
-using VPRecipeOrVPValueTy = PointerUnion<VPRecipeBase *, VPValue *>;
-
/// Helper class to create VPRecipies from IR instructions.
class VPRecipeBuilder {
/// The loop that we evaluate.
@@ -88,8 +86,8 @@ class VPRecipeBuilder {
/// or a new VPBlendRecipe otherwise. Currently all such phi nodes are turned
/// into a sequence of select instructions as the vectorizer currently
/// performs full if-conversion.
- VPRecipeOrVPValueTy tryToBlend(PHINode *Phi, ArrayRef<VPValue *> Operands,
- VPlanPtr &Plan);
+ VPBlendRecipe *tryToBlend(PHINode *Phi, ArrayRef<VPValue *> Operands,
+ VPlanPtr &Plan);
/// Handle call instructions. If \p CI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
@@ -103,9 +101,6 @@ class VPRecipeBuilder {
VPRecipeBase *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
VPBasicBlock *VPBB, VPlanPtr &Plan);
- /// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue.
- VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; }
-
public:
VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
LoopVectorizationLegality *Legal,
@@ -116,12 +111,11 @@ class VPRecipeBuilder {
/// Check if an existing VPValue can be used for \p Instr or a recipe can be
/// create for \p I withing the given VF \p Range. If an existing VPValue can
- /// be used or if a recipe can be created, return it. Otherwise return a
- /// VPRecipeOrVPValueTy with nullptr.
- VPRecipeOrVPValueTy tryToCreateWidenRecipe(Instruction *Instr,
- ArrayRef<VPValue *> Operands,
- VFRange &Range, VPBasicBlock *VPBB,
- VPlanPtr &Plan);
+ /// be used or if a recipe can be created, return it.
+ VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr,
+ ArrayRef<VPValue *> Operands,
+ VFRange &Range, VPBasicBlock *VPBB,
+ VPlanPtr &Plan);
/// Set the recipe created for given ingredient. This operation is a no-op for
/// ingredients that were not marked using a nullptr entry in the map.
@@ -165,8 +159,7 @@ class VPRecipeBuilder {
/// Build a VPReplicationRecipe for \p I. If it is predicated, add the mask as
/// last operand. Range.End may be decreased to ensure same recipe behavior
/// from \p Range.Start to \p Range.End.
- VPRecipeOrVPValueTy handleReplication(Instruction *I, VFRange &Range,
- VPlan &Plan);
+ VPRecipeBase *handleReplication(Instruction *I, VFRange &Range, VPlan &Plan);
/// Add the incoming values from the backedge to reduction & first-order
/// recurrence cross-iteration phis.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 33132880d5a444..317d7806bf5511 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -806,6 +806,38 @@ static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+ // Try to remove redundant blend recipes.
+ if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
+ if (Blend->getNumIncomingValues() == 1) {
+ Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
+ Blend->eraseFromParent();
+ return;
+ }
+
+ bool AllEqual = true;
+ for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
+ AllEqual &= Blend->getIncomingValue(0) == Blend->getIncomingValue(I);
+ if (AllEqual) {
+ Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
+ Blend->eraseFromParent();
+ return;
+ }
+ if (Blend->getNumIncomingValues() != 2)
+ return;
+ auto IsInLoopReduction = [](VPValue *VPV) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(VPV);
+ return PhiR && PhiR->isInLoop();
+ };
+ if (IsInLoopReduction(Blend->getIncomingValue(0))) {
+ Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
+ Blend->eraseFromParent();
+ } else if (IsInLoopReduction(Blend->getIncomingValue(1))) {
+ Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
+ Blend->eraseFromParent();
+ }
+ return;
+ }
+
switch (getOpcodeForRecipe(R)) {
case Instruction::Mul: {
VPValue *A = R.getOperand(0);
@@ -996,8 +1028,8 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
removeRedundantCanonicalIVs(Plan);
removeRedundantInductionCasts(Plan);
- optimizeInductions(Plan, SE);
simplifyRecipes(Plan, SE.getContext());
+ optimizeInductions(Plan, SE);
removeDeadRecipes(Plan);
createAndOptimizeReplicateRegions(Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3bf91115debb7d..7ce2e5974f60b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -112,7 +112,6 @@ struct VPlanTransforms {
/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan);
-
};
} // namespace llvm
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice cleanup, thanks for pursuing!
Wonder if tryTo*()
's should return more specific recipes than VPRecipeBase*
, as in tryToBlend()
.
if (Blend->getNumIncomingValues() == 1) { | ||
Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); | ||
Blend->eraseFromParent(); | ||
return; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (Blend->getNumIncomingValues() == 1) { | |
Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); | |
Blend->eraseFromParent(); | |
return; | |
} |
Single incoming is a corner case of all incomings being equal, can be left to be handled below.
Wonder if/where it actually occurs - lcssa/header phi's should remain intact.
All incomings being equal should be optimized at the scalar phi level - when such an initial VPlan will be made available.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
removed, thanks!
// instructions leading from the loop exit instr to the phi need to be | ||
// converted to reductions, with one operand being vector and the other | ||
// being the scalar reduction chain. For other reductions, a select is | ||
// introduced between the phi and live-out recipes when folding the tail. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: formatting change independent of this patch, best pushed separately.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
undone, thanks!
if (Blend->getNumIncomingValues() != 2) | ||
return; | ||
auto IsInLoopReduction = [](VPValue *VPV) { | ||
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(VPV); | ||
return PhiR && PhiR->isInLoop(); | ||
}; | ||
if (IsInLoopReduction(Blend->getIncomingValue(0))) { | ||
Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); | ||
Blend->eraseFromParent(); | ||
} else if (IsInLoopReduction(Blend->getIncomingValue(1))) { | ||
Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); | ||
Blend->eraseFromParent(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The mandatory removal of a blend for conditional in-loop reductions should probably better be placed alongside its folding into Reduction recipe, by adjustRecipesForReductions(), than as a standalone redundancy elimination optimization here.
I.e., folding
S = phi(init, S")
S' = S + a[i]
S" = Cond ? S' : S
into
S = phi(init, S') // in-loop Reduction Phi recipe
Inc = Cond ? a[i] : identity
S' = S + Inc // in-loop Reduction recipe
// S" = Cond ? S' : S // when in-loop vectorized, drop or use AnyOf(Cond)
(This refers to 12fb133)
Note that a similar blending-of-the-increment rather than of the post-incremented value, may also apply to non in-loop reductions - as an optional optimization. This could be useful, e.g., when a blend-of-load-with-identity can be replaced by setting the passthru operand of a masked load to identity
, as in the above example.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved as suggested, thanks!
@@ -112,7 +112,6 @@ struct VPlanTransforms { | |||
/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing | |||
/// them with already existing recipes expanding the same SCEV expression. | |||
static void removeRedundantExpandSCEVRecipes(VPlan &Plan); | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: unrelated empty line can be removed independently.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That was unintentional, undone, thanks!
simplifyRecipes(Plan, SE.getContext()); | ||
optimizeInductions(Plan, SE); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this phase ordering change still needed, if the mandatory removal of a conditional in-loop reduction blend is moved away from simplifyRecipes(), and into adjustRecipesForReductions()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Without the change, here's a function in lvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
where optimizeInductions
won't simplify an induction, because there's a wide blend use left.
@@ -8292,13 +8271,13 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, | |||
for (unsigned In = 0; In < NumIncoming; In++) { | |||
VPValue *EdgeMask = | |||
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); | |||
assert((EdgeMask || NumIncoming == 1) && | |||
assert((EdgeMask || NumIncoming == 1 || Operands[In] == Operands[0]) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A full mask is allowed when multiple predecessors are involved, provided all are with the same operand?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There can be degenerate cases like
loop.1.header:
%iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1.latch ]
%l = load i32, ptr %A
br i1 %c, label %loop.1.latch, label %loop.1.latch
loop.1.latch:
%p = phi i32 [ %l, %loop.1.header ], [ %l, %loop.1.header ]
%iv.1.next = add nuw nsw i32 %iv.1, 1
%ec.1 = icmp eq i32 %iv.1.next, 100
br i1 %ec.1, label %exit.1, label %loop.1.header
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, right. This empty edge-mask case is a bit subtle.
Blend recipe expects an even number of operands, referring to them as {incoming-value, non-null edge-mask} pairs, or else a single operand, referring to it as an incoming-value.
An empty edge-mask is formed when its predecessor has an empty block-mask and a single successor or two identical successors, as in the above degenerate example. In the latter case, however, OperandsWithMask will contain two entries, both being (the same) incoming-values, as their edge-masks are both null. Note that three or more incoming-values must (all) have non-null edge-masks, otherwise they must post/dominate each other.
So maybe clearer to have something like:
for (unsigned In = 0; In < NumIncoming; In++) {
OperandsWithMask.push_back(Operands[In]);
VPValue *EdgeMask =
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
if (!EdgeMask) {
assert(In == 0 && "Both null and non-null edge masks found");
assert(llvm::all_equal(Operands) && "Distinct incoming values with one having a full mask");
break;
}
OperandsWithMask.push_back(EdgeMask);
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adjusted, thanks!
if (PhiR->isInLoop() && isa<VPBlendRecipe>(CurrentLink)) | ||
continue; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Replace the blend here by its non-header-phi operand (possibly after the loop if needed, and/or updating PreviousLink), as mentioned below?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
At the moment, block and edge masks are created on demand, which means that they are inserted at the point where they are demanded and then cached. It is possible that the mask for a block is looked up later at a point that's not dominated by the point where the mask has been inserted. To avoid this, create masks up front on entry to the corresponding basic block and leave it to VPlan simplification to remove unneeded masks. Note that we need to create masks for all blocks, if any of the blocks in the loop needs predication, as computing the mask of a block depends on the masks of its predecessor. Needed for llvm#76090.
At the moment, block and edge masks are created on demand, which means that they are inserted at the point where they are demanded and then cached. It is possible that the mask for a block is looked up later at a point that's not dominated by the point where the mask has been inserted. To avoid this, create masks up front on entry to the corresponding basic block and leave it to VPlan simplification to remove unneeded masks. Note that we need to create masks for all blocks, if any of the blocks in the loop needs predication, as computing the mask of a block depends on the masks of its predecessor. Needed for #76090. #76635
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for taking a look, comments should be addressed!
Wonder if tryTo*()'s should return more specific recipes than VPRecipeBase*, as in tryToBlend().
That would probably be good now! Should I adjust the current patch or do as follow-up?
@@ -8292,13 +8271,13 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, | |||
for (unsigned In = 0; In < NumIncoming; In++) { | |||
VPValue *EdgeMask = | |||
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); | |||
assert((EdgeMask || NumIncoming == 1) && | |||
assert((EdgeMask || NumIncoming == 1 || Operands[In] == Operands[0]) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There can be degenerate cases like
loop.1.header:
%iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1.latch ]
%l = load i32, ptr %A
br i1 %c, label %loop.1.latch, label %loop.1.latch
loop.1.latch:
%p = phi i32 [ %l, %loop.1.header ], [ %l, %loop.1.header ]
%iv.1.next = add nuw nsw i32 %iv.1, 1
%ec.1 = icmp eq i32 %iv.1.next, 100
br i1 %ec.1, label %exit.1, label %loop.1.header
if (Blend->getNumIncomingValues() == 1) { | ||
Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); | ||
Blend->eraseFromParent(); | ||
return; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
removed, thanks!
if (PhiR->isInLoop() && isa<VPBlendRecipe>(CurrentLink)) | ||
continue; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
if (Blend->getNumIncomingValues() != 2) | ||
return; | ||
auto IsInLoopReduction = [](VPValue *VPV) { | ||
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(VPV); | ||
return PhiR && PhiR->isInLoop(); | ||
}; | ||
if (IsInLoopReduction(Blend->getIncomingValue(0))) { | ||
Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); | ||
Blend->eraseFromParent(); | ||
} else if (IsInLoopReduction(Blend->getIncomingValue(1))) { | ||
Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); | ||
Blend->eraseFromParent(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved as suggested, thanks!
simplifyRecipes(Plan, SE.getContext()); | ||
optimizeInductions(Plan, SE); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Without the change, here's a function in lvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
where optimizeInductions
won't simplify an induction, because there's a wide blend use left.
@@ -112,7 +112,6 @@ struct VPlanTransforms { | |||
/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing | |||
/// them with already existing recipes expanding the same SCEV expression. | |||
static void removeRedundantExpandSCEVRecipes(VPlan &Plan); | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That was unintentional, undone, thanks!
// instructions leading from the loop exit instr to the phi need to be | ||
// converted to reductions, with one operand being vector and the other | ||
// being the scalar reduction chain. For other reductions, a select is | ||
// introduced between the phi and live-out recipes when folding the tail. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
undone, thanks!
@@ -8292,13 +8271,13 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, | |||
for (unsigned In = 0; In < NumIncoming; In++) { | |||
VPValue *EdgeMask = | |||
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); | |||
assert((EdgeMask || NumIncoming == 1) && | |||
assert((EdgeMask || NumIncoming == 1 || Operands[In] == Operands[0]) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, right. This empty edge-mask case is a bit subtle.
Blend recipe expects an even number of operands, referring to them as {incoming-value, non-null edge-mask} pairs, or else a single operand, referring to it as an incoming-value.
An empty edge-mask is formed when its predecessor has an empty block-mask and a single successor or two identical successors, as in the above degenerate example. In the latter case, however, OperandsWithMask will contain two entries, both being (the same) incoming-values, as their edge-masks are both null. Note that three or more incoming-values must (all) have non-null edge-masks, otherwise they must post/dominate each other.
So maybe clearer to have something like:
for (unsigned In = 0; In < NumIncoming; In++) {
OperandsWithMask.push_back(Operands[In]);
VPValue *EdgeMask =
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
if (!EdgeMask) {
assert(In == 0 && "Both null and non-null edge masks found");
assert(llvm::all_equal(Operands) && "Distinct incoming values with one having a full mask");
break;
}
OperandsWithMask.push_back(EdgeMask);
}
VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, | ||
VFRange &Range, | ||
VPlan &Plan) { | ||
VPRecipeBase *VPRecipeBuilder::handleReplication(Instruction *I, VFRange &Range, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VPRecipeBase *VPRecipeBuilder::handleReplication(Instruction *I, VFRange &Range, | |
VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, VFRange &Range, |
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, also updated other helpers to return more specific types
@@ -8999,6 +8963,18 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( | |||
LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); | |||
VecOp = FMulRecipe; | |||
} else { | |||
auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink); | |||
if (PhiR->isInLoop() && Blend) { | |||
assert(Blend->getNumIncomingValues() == 2); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: assert message.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added, thanks!
[PhiR](VPValue *Op) { return Op == PhiR; })); | ||
if (Blend->getIncomingValue(0) == PhiR) | ||
Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); | ||
else |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
else | |
else { | |
assert(Blend->getIncomingValue(1) == PhiR && "..."); |
instead of above assert(any_of(...))?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done, thanks!
@@ -8999,6 +8963,18 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( | |||
LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); | |||
VecOp = FMulRecipe; | |||
} else { | |||
auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add folding of inloop blend to documentation above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
@@ -88,8 +86,8 @@ class VPRecipeBuilder { | |||
/// or a new VPBlendRecipe otherwise. Currently all such phi nodes are turned |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment needs updating.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
@@ -116,12 +111,11 @@ class VPRecipeBuilder { | |||
|
|||
/// Check if an existing VPValue can be used for \p Instr or a recipe can be | |||
/// create for \p I withing the given VF \p Range. If an existing VPValue can |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment needs updating. (Plus fixing "withing" typo).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { | ||
bool AllEqual = true; | ||
for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) | ||
AllEqual &= Blend->getIncomingValue(0) == Blend->getIncomingValue(I); | ||
if (AllEqual) { | ||
Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); | ||
Blend->eraseFromParent(); | ||
return; | ||
} | ||
return; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can be simplified?
if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { | |
bool AllEqual = true; | |
for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) | |
AllEqual &= Blend->getIncomingValue(0) == Blend->getIncomingValue(I); | |
if (AllEqual) { | |
Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); | |
Blend->eraseFromParent(); | |
return; | |
} | |
return; | |
} | |
if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { | |
for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) | |
if (Blend->getIncomingValue(0) != Blend->getIncomingValue(I)) | |
return; | |
Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); | |
Blend->eraseFromParent(); | |
return; | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Simplified thanks!
At the moment, block and edge masks are created on demand, which means that they are inserted at the point where they are demanded and then cached. It is possible that the mask for a block is looked up later at a point that's not dominated by the point where the mask has been inserted. To avoid this, create masks up front on entry to the corresponding basic block and leave it to VPlan simplification to remove unneeded masks. Note that we need to create masks for all blocks, if any of the blocks in the loop needs predication, as computing the mask of a block depends on the masks of its predecessor. Needed for llvm#76090. llvm#76635
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me, thanks for cleaning this up!
Adding a couple of minor nits.
@@ -9027,7 +8994,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( | |||
// the phi until LoopExitValue. We keep track of the previous item | |||
// (PreviousLink) to tell which of the two operands of a Link will remain | |||
// scalar and which will be reduced. For minmax by select(cmp), Link will be | |||
// the select instructions. | |||
// the select instructions. Blend recipes will get folded to their non-phi |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// the select instructions. Blend recipes will get folded to their non-phi | |
// the select instructions. Blend recipes of in-loop reduction phi's will get folded to their non-phi |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
assert(Blend->getIncomingValue(1) == PhiR && | ||
"PhiR must be an operand of the blend"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: may be slightly more consistent to have the assert first.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
@@ -100,11 +99,8 @@ class VPRecipeBuilder { | |||
/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Indeed back to returning VPWidenRecipe as documented...)
@@ -827,6 +827,16 @@ static unsigned getOpcodeForRecipe(VPRecipeBase &R) { | |||
|
|||
/// Try to simplify recipe \p R. | |||
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { | |||
// Try to remove redundant blend recipes. | |||
if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: may be worth defining and reusing IncomingValue0 = Blend->getIncomingValue(0);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
Similarly to how block masks are created up front and later only retrieved also make sure masks are created in cases where edge masks are needed, i.e. blend recipes. Creating block-in masks for all blocks in the loop also ensures edge masks for all relevant edges have been created. Later, the new getEdgeMask can be used to look up cached edge masks. This makes sure edge masks are available in all cases for #76090.
Move simplification of VPBlendRecipes from early VPlan construction to VPlan-to-VPlan based recipe simplification. This simplifies initial construction.
Note that some in-loop reduction tests are failing at the moment, due to the reduction predicate being created after the reduction recipe. I will provide a patch for that soon.