Skip to content

Commit

Permalink
[VPlan] Add active-lane-mask as VPlan-to-VPlan transformation.
Browse files Browse the repository at this point in the history
This patch updates the mask creation code to always create compares of
the form (ICMP_ULE, wide canonical IV, backedge-taken-count) up front
when tail folding, and introduces the active-lane-mask as a later
transformation.

This effectively makes (ICMP_ULE, wide canonical IV, backedge-taken-count)
the canonical form for tail-folding early on. Introducing more specific
active-lane-mask recipes is treated as a VPlan-to-VPlan optimization.

This has the advantage of keeping the logic (and complexity) of
introducing active-lane-mask recipes in a single place, instead of
spreading the logic out across multiple functions. It also simplifies
initial VPlan construction and enables treating the introduction of EVL
as a similar optimization.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D158779
  • Loading branch information
fhahn committed Sep 25, 2023
1 parent ef48e90 commit 97687b7
Show file tree
Hide file tree
Showing 15 changed files with 457 additions and 373 deletions.
8 changes: 8 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class VPBuilder {

public:
/// Create a builder with no insertion point set.
VPBuilder() = default;
/// Create a builder and set its insertion point to the end of \p InsertBB.
VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); }

/// Clear the insertion point: created instructions will not be inserted into
/// a block.
Expand Down Expand Up @@ -143,6 +144,13 @@ class VPBuilder {
return createInstruction(Opcode, Operands, DL, Name);
}

/// Create a new VPInstruction with opcode \p Opcode carrying the wrap flags
/// \p WrapFlags (NUW/NSW), and insert it at the current insertion point.
VPInstruction *createOverflowingOp(unsigned Opcode,
                                   std::initializer_list<VPValue *> Operands,
                                   VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
                                   DebugLoc DL, const Twine &Name = "") {
  auto *Inst = new VPInstruction(Opcode, Operands, WrapFlags, DL, Name);
  return tryInsertInstruction(Inst);
}
/// Create a VPInstruction::Not negating \p Operand and insert it at the
/// current insertion point.
VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") {
  VPValue *Negated = createInstruction(VPInstruction::Not, {Operand}, DL, Name);
  return Negated;
}
Expand Down
131 changes: 29 additions & 102 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8065,14 +8065,6 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
return;
}

// If we're using the active lane mask for control flow, then we get the
// mask from the active lane mask PHI that is cached in the VPlan.
TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
if (useActiveLaneMaskForControlFlow(TFStyle)) {
BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi();
return;
}

// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
// constructing the desired canonical IV in the header block as its first
Expand All @@ -8086,14 +8078,8 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
VPValue *BlockMask = nullptr;
if (useActiveLaneMask(TFStyle)) {
VPValue *TC = Plan.getTripCount();
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
nullptr, "active.lane.mask");
} else {
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
}
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
BlockMaskCache[Header] = BlockMask;
}

Expand Down Expand Up @@ -8631,8 +8617,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,

// Add the necessary canonical IV and branch recipes required to control the
// loop.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
TailFoldingStyle Style) {
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
DebugLoc DL) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);

Expand All @@ -8644,93 +8630,19 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,

// Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
// IV by VF * UF.
bool HasNUW = Style == TailFoldingStyle::None;
auto *CanonicalIVIncrement =
new VPInstruction(VPInstruction::CanonicalIVIncrement, {CanonicalIVPHI},
{HasNUW, false}, DL, "index.next");
CanonicalIVPHI->addOperand(CanonicalIVIncrement);

VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
if (useActiveLaneMaskForControlFlow(Style)) {
// Create the active lane mask instruction in the vplan preheader.
VPBasicBlock *VecPreheader =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());

// We can't use StartV directly in the ActiveLaneMask VPInstruction, since
// we have to take unrolling into account. Each part needs to start at
// Part * VF
auto *CanonicalIVIncrementParts =
new VPInstruction(VPInstruction::CanonicalIVIncrementForPart, {StartV},
{HasNUW, false}, DL, "index.part.next");
VecPreheader->appendRecipe(CanonicalIVIncrementParts);

// Create the ActiveLaneMask instruction using the correct start values.
VPValue *TC = Plan.getTripCount();

VPValue *TripCount, *IncrementValue;
if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
// When avoiding a runtime check, the active.lane.mask inside the loop
// uses a modified trip count and the induction variable increment is
// done after the active.lane.mask intrinsic is called.
auto *TCMinusVF =
new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL);
VecPreheader->appendRecipe(TCMinusVF);
IncrementValue = CanonicalIVPHI;
TripCount = TCMinusVF;
} else {
// When the loop is guarded by a runtime overflow check for the loop
// induction variable increment by VF, we can increment the value before
// the get.active.lane mask and use the unmodified tripcount.
EB->appendRecipe(CanonicalIVIncrement);
IncrementValue = CanonicalIVIncrement;
TripCount = TC;
}
EB->appendRecipe(CanonicalIVIncrement);

auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
{CanonicalIVIncrementParts, TC}, DL,
"active.lane.mask.entry");
VecPreheader->appendRecipe(EntryALM);

// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
Header->insert(LaneMaskPhi, Header->getFirstNonPhi());

// Create the active lane mask for the next iteration of the loop.
CanonicalIVIncrementParts =
new VPInstruction(VPInstruction::CanonicalIVIncrementForPart,
{IncrementValue}, {HasNUW, false}, DL);
EB->appendRecipe(CanonicalIVIncrementParts);

auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
{CanonicalIVIncrementParts, TripCount}, DL,
"active.lane.mask.next");
EB->appendRecipe(ALM);
LaneMaskPhi->addOperand(ALM);

if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
// Do the increment of the canonical IV after the active.lane.mask, because
// that value is still based off %CanonicalIVPHI
EB->appendRecipe(CanonicalIVIncrement);
}

// We have to invert the mask here because a true condition means jumping
// to the exit block.
auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
EB->appendRecipe(NotMask);

VPInstruction *BranchBack =
new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
EB->appendRecipe(BranchBack);
} else {
EB->appendRecipe(CanonicalIVIncrement);

// Add the BranchOnCount VPInstruction to the latch.
VPInstruction *BranchBack = new VPInstruction(
VPInstruction::BranchOnCount,
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
EB->appendRecipe(BranchBack);
}
// Add the BranchOnCount VPInstruction to the latch.
VPInstruction *BranchBack =
new VPInstruction(VPInstruction::BranchOnCount,
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
EB->appendRecipe(BranchBack);
}

// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
Expand Down Expand Up @@ -8817,8 +8729,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);

DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DL,
CM.getTailFoldingStyle(IVUpdateMayOverflow));
TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
// When not folding the tail, we know that the induction increment will not
// overflow.
bool HasNUW = Style == TailFoldingStyle::None;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);

// Proactively create header mask. Masks for other blocks are created on
// demand.
Expand Down Expand Up @@ -8987,6 +8902,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
return nullptr;

if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
bool WithoutRuntimeCheck =
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
WithoutRuntimeCheck);
}
return Plan;
}

Expand Down Expand Up @@ -9021,8 +8945,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
Term->eraseFromParent();

addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
CM.getTailFoldingStyle());
// Tail folding is not supported for outer loops, so the induction increment
// is guaranteed to not wrap.
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
DebugLoc());
return Plan;
}

Expand Down
9 changes: 0 additions & 9 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,15 +718,6 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
return Plan;
}

/// Scan the phi recipes in the vector-loop header for the (at most one)
/// VPActiveLaneMaskPHIRecipe; return it, or nullptr if there is none.
VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() {
  VPBasicBlock *HeaderVPBB = getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis())
    if (auto *ALMPhi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&Phi))
      return ALMPhi;
  return nullptr;
}

void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
Value *CanonicalIVStartValue,
VPTransformState &State,
Expand Down
4 changes: 0 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -2682,10 +2682,6 @@ class VPlan {
return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
}

/// Find and return the VPActiveLaneMaskPHIRecipe from the header - there
/// be only one at most. If there isn't one, then return nullptr.
VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi();

void addLiveOut(PHINode *PN, VPValue *V);

void removeLiveOut(PHINode *PN) {
Expand Down
150 changes: 150 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
Expand Down Expand Up @@ -855,3 +856,152 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
removeRedundantExpandSCEVRecipes(Plan);
mergeBlocksIntoPredecessors(Plan);
}

// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced, all other existing
// recipes/users remain unchanged, except for poison-generating flags being
// dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
//
// The function uses the following definitions:
//
// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
// calculate-trip-count-minus-VF (original TC) : original TC
// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
// CanonicalIVPhi : CanonicalIVIncrement
// %StartV is the canonical induction start value.
//
// The function adds the following recipes:
//
// vector.ph:
// %TripCount = calculate-trip-count-minus-VF (original TC)
// [if DataWithControlFlowWithoutRuntimeCheck]
// %EntryInc = canonical-iv-increment-for-part %StartV
// %EntryALM = active-lane-mask %EntryInc, %TripCount
//
// vector.body:
// ...
// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
// ...
// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
// %ALM = active-lane-mask %InLoopInc, TripCount
// %Negated = Not %ALM
// branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  auto *CanonicalIVPHI = Plan.getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
  // TODO: Check if dropping the flags is needed if
  // !DataAndControlFlowWithoutRuntimeCheck.
  CanonicalIVIncrement->dropPoisonGeneratingFlags();
  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
  // we have to take unrolling into account. Each part needs to start at
  //   Part * VF
  auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
  VPBuilder Builder(VecPreheader);

  // Create the ActiveLaneMask instruction using the correct start values.
  VPValue *TC = Plan.getTripCount();

  VPValue *TripCount, *IncrementValue;
  if (!DataAndControlFlowWithoutRuntimeCheck) {
    // When the loop is guarded by a runtime overflow check for the loop
    // induction variable increment by VF, we can increment the value before
    // the get.active.lane mask and use the unmodified tripcount.
    IncrementValue = CanonicalIVIncrement;
    TripCount = TC;
  } else {
    // When avoiding a runtime check, the active.lane.mask inside the loop
    // uses a modified trip count and the induction variable increment is
    // done after the active.lane.mask intrinsic is called.
    IncrementValue = CanonicalIVPHI;
    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
                                     {TC}, DL);
  }
  auto *EntryIncrement = Builder.createOverflowingOp(
      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
      "index.part.next");

  // Create the active lane mask instruction in the VPlan preheader.
  auto *EntryALM =
      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                           DL, "active.lane.mask.entry");

  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
  // preheader ActiveLaneMask instruction.
  // Note: use 'auto *' per LLVM style; plain 'auto' hid that this is a
  // pointer (clang-tidy: llvm-qualified-auto).
  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
  LaneMaskPhi->insertAfter(CanonicalIVPHI);

  // Create the active lane mask for the next iteration of the loop before the
  // original terminator.
  VPRecipeBase *OriginalTerminator = EB->getTerminator();
  Builder.setInsertPoint(OriginalTerminator);
  auto *InLoopIncrement =
      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                  {IncrementValue}, {false, false}, DL);
  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
                                   {InLoopIncrement, TripCount}, DL,
                                   "active.lane.mask.next");
  LaneMaskPhi->addOperand(ALM);

  // Replace the original terminator with BranchOnCond. We have to invert the
  // mask here because a true condition means jumping to the exit block.
  auto *NotMask = Builder.createNot(ALM, DL);
  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
  OriginalTerminator->eraseFromParent();
  return LaneMaskPhi;
}

// Introduce active-lane-mask recipes: either a plain active-lane-mask
// VPInstruction (data-only tail folding) or, when \p
// UseActiveLaneMaskForControlFlow is set, a lane-mask phi that also drives the
// loop exit branch. All (ICMP_ULE, wide canonical IV, backedge-taken-count)
// header-mask compares are then replaced by the lane mask.
void VPlanTransforms::addActiveLaneMask(
    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
    bool DataAndControlFlowWithoutRuntimeCheck) {
  // Parenthesize the condition so the assert message is &&'d with the whole
  // implication, not just with UseActiveLaneMaskForControlFlow
  // (-Wparentheses).
  assert((!DataAndControlFlowWithoutRuntimeCheck ||
          UseActiveLaneMaskForControlFlow) &&
         "DataAndControlFlowWithoutRuntimeCheck implies "
         "UseActiveLaneMaskForControlFlow");

  auto *CanonicalIV = Plan.getCanonicalIV();
  auto FoundWidenCanonicalIVUser =
      find_if(CanonicalIV->users(),
              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
  // Compare against end(): find_if returns a valid (truthy) iterator even when
  // nothing matches, so testing the iterator itself would never fire.
  assert(FoundWidenCanonicalIVUser != CanonicalIV->users().end() &&
         "Must have widened canonical IV when tail folding!");
  auto *WideCanonicalIV =
      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
  VPRecipeBase *LaneMask;
  if (UseActiveLaneMaskForControlFlow) {
    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
        Plan, DataAndControlFlowWithoutRuntimeCheck);
  } else {
    // Data-only tail folding: an active-lane-mask of the wide canonical IV
    // against the trip count, placed right after the wide canonical IV.
    LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
                                 {WideCanonicalIV, Plan.getTripCount()},
                                 nullptr, "active.lane.mask");
    LaneMask->insertAfter(WideCanonicalIV);
  }

  // Walk users of WideCanonicalIV and replace all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
  // active-lane-mask. Iterate over a copy, as replacing/erasing compares
  // mutates the user list.
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
    if (!CompareToReplace ||
        CompareToReplace->getOpcode() != Instruction::ICmp ||
        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
        CompareToReplace->getOperand(1) != BTC)
      continue;

    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
           "WidenCanonicalIV must be the first operand of the compare");
    CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue());
    CompareToReplace->eraseFromParent();
  }
}

0 comments on commit 97687b7

Please sign in to comment.