Skip to content

Commit

Permalink
[VPlan] Merge predicated-triangle regions, after sinking.
Browse files Browse the repository at this point in the history
Sinking scalar operands into predicated-triangle regions may allow
merging regions. This patch adds a VPlan-to-VPlan transform that tries
to merge predicated-triangle regions after sinking.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D100260
  • Loading branch information
fhahn committed Jun 28, 2021
1 parent 6942076 commit 80aa7e1
Show file tree
Hide file tree
Showing 12 changed files with 899 additions and 1,179 deletions.
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -9298,6 +9298,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}

VPlanTransforms::sinkScalarOperands(*Plan);
VPlanTransforms::mergeReplicateRegions(*Plan);

std::string PlanName;
raw_string_ostream RSO(PlanName);
Expand Down
135 changes: 135 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Expand Up @@ -148,3 +148,138 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
}
return Changed;
}

/// If the entry block of region \p R is a VPBasicBlock that contains exactly
/// one recipe and that recipe is a VPBranchOnMaskRecipe, return the first
/// operand of that recipe (the mask the region branches on). Otherwise return
/// nullptr.
///
/// This is a file-local helper; it is declared static so that it gets internal
/// linkage, matching getPredicatedThenBlock below.
static VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  // Reject entries that are not plain basic blocks or that hold anything other
  // than a single branch-on-mask recipe.
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// Return the 'then' block of \p R when \p R has a triangle shape, i.e. its
/// entry block has exactly two VPBasicBlock successors, one of which has the
/// other as its single successor while the other has no successors at all.
/// The block that jumps to its sibling is the 'then' block. Return nullptr for
/// any other shape.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *Entry = cast<VPBasicBlock>(R->getEntry());
  if (Entry->getNumSuccessors() != 2)
    return nullptr;

  const auto &EntrySuccs = Entry->getSuccessors();
  auto *First = dyn_cast<VPBasicBlock>(EntrySuccs[0]);
  auto *Second = dyn_cast<VPBasicBlock>(EntrySuccs[1]);
  if (!First || !Second)
    return nullptr;

  // Between them, the two successors must have exactly one outgoing edge: the
  // one from the 'then' block to the merge block.
  const auto NumOutgoingEdges =
      First->getNumSuccessors() + Second->getNumSuccessors();
  if (NumOutgoingEdges != 1)
    return nullptr;

  if (First->getSingleSuccessor() == Second)
    return First;
  return Second->getSingleSuccessor() == First ? Second : nullptr;
}

/// Try to merge pairs of predicated triangle regions in \p Plan that are
/// guarded by the same mask and are separated only by an empty basic block.
/// The recipes of the first region are moved into the second region and the
/// first region is then disconnected and deleted.
///
/// \returns true if at least one region was merged (and hence deleted).
bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
  SetVector<VPRegionBlock *> DeletedRegions;

  // Collect region blocks to process up-front, to avoid iterator invalidation
  // issues while merging regions.
  SmallVector<VPRegionBlock *, 8> CandidateRegions(
      VPBlockUtils::blocksOnly<VPRegionBlock>(depth_first(
          VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()))));

  // Check if Region1 is a predicated triangle, followed by an empty block,
  // followed by another predicated triangle (Region2). If that's the case,
  // move the recipes from the first to the second triangle.
  for (VPRegionBlock *Region1 : CandidateRegions) {
    // Regions merged away earlier are only deleted at the very end, so they
    // are still valid pointers here; just skip them.
    if (DeletedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2)
      continue;

    // Both regions must branch on the very same mask value.
    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;
    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    assert(Mask1 && Mask2 && "both region must have conditions");

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // If a recipe is used by a first-order recurrence phi, we cannot move it at
    // the moment: a recipe R feeding a first order recurrence phi must allow
    // for a *vector* shuffle to be inserted immediately after it, and therefore
    // if R is *scalarized and predicated* it must appear last in its basic
    // block. In addition, other recipes may need to "sink after" R, so best if
    // R not be moved at all.
    auto IsImmovableRecipe = [](VPRecipeBase &R) {
      assert(R.getNumDefinedValues() <= 1 &&
             "no multi-defs are expected in predicated blocks");
      for (VPUser *U : R.getVPValue()->users()) {
        auto *UI = dyn_cast<VPRecipeBase>(U);
        if (!UI)
          continue;
        // A widened phi without a recurrence descriptor is treated as the
        // first-order recurrence phi described above.
        auto *PhiR = dyn_cast<VPWidenPHIRecipe>(UI);
        if (PhiR && !PhiR->getRecurrenceDescriptor())
          return true;
      }
      return false;
    };
    if (any_of(*Then1, IsImmovableRecipe))
      continue;

    // Move recipes to the successor region. Walking Then1 in reverse while
    // always inserting at the first non-phi position of Then2 preserves the
    // original relative order of the moved recipes.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1Value = Phi1ToMove.getVPValue();

      // Iterate over a snapshot of the users: rewriting an operand below is
      // expected to edit the user list of Phi1Value (NOTE(review): confirm
      // against VPUser::setOperand), which must not happen while that same
      // list is being traversed.
      SmallVector<VPUser *, 4> PhiUsers(Phi1Value->user_begin(),
                                        Phi1Value->user_end());
      for (VPUser *U : PhiUsers) {
        auto *UI = dyn_cast<VPRecipeBase>(U);
        if (!UI || UI->getParent() != Then2)
          continue;
        for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) {
          if (Phi1Value != U->getOperand(I))
            continue;
          U->setOperand(I, PredInst1);
        }
      }

      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region: rewire its predecessors directly to
    // the (empty) middle block and detach it from the plan.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    DeletedRegions.insert(Region1);
  }

  // Deletion is deferred until here so that CandidateRegions never holds a
  // dangling pointer while it is being iterated.
  for (VPRegionBlock *ToDelete : DeletedRegions)
    delete ToDelete;

  // Every successful merge deletes exactly one region, so the plan changed iff
  // something was scheduled for deletion.
  return !DeletedRegions.empty();
}
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Expand Up @@ -30,6 +30,8 @@ struct VPlanTransforms {
SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE);

static bool sinkScalarOperands(VPlan &Plan);

static bool mergeReplicateRegions(VPlan &Plan);
};

} // namespace llvm
Expand Down
26 changes: 8 additions & 18 deletions llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
Expand Up @@ -89,38 +89,28 @@ attributes #0 = { "target-cpu"="knl" }
; FORCE: vector.body:
; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FORCE: pred.store.if:
; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1
; FORCE-NEXT: br label [[PRED_STORE_CONTINUE]]
; FORCE: pred.store.continue:
; FORCE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; FORCE-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; FORCE: pred.store.if1:
; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1
; FORCE-NEXT: br label [[PRED_STORE_CONTINUE2]]
; FORCE: pred.store.continue2:
; FORCE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; FORCE-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; FORCE: pred.load.if:
; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1
; FORCE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]]
; FORCE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1
; FORCE-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE]]
; FORCE: pred.load.continue:
; FORCE-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[PRED_STORE_CONTINUE2]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
; FORCE-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
; FORCE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; FORCE-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
; FORCE: pred.load.if3:
; FORCE: pred.load.if1:
; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1
; FORCE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]]
; FORCE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1
; FORCE-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; FORCE: pred.load.continue4:
; FORCE: pred.load.continue2:
; FORCE-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
; FORCE-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
Expand Down

0 comments on commit 80aa7e1

Please sign in to comment.