87 changes: 84 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include <vector>

using namespace llvm;
extern cl::opt<bool> EnableVPlanNativePath;

#define DEBUG_TYPE "vplan"

Expand Down Expand Up @@ -124,6 +125,20 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
auto &PredVPSuccessors = PredVPBB->getSuccessors();
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];

// In the outer loop vectorization scenario, the predecessor basic block may
// not have been visited yet (backedge). Mark the predecessor VPBasicBlock
// for fixup at the end of vectorization. We do not encounter this case in
// inner loop vectorization as we start out by building a loop skeleton with
// the vector loop header and latch blocks. As a result, we never enter this
// function for the header block in the non-VPlan-native path.
if (!PredBB) {
assert(EnableVPlanNativePath &&
"Unexpected null predecessor in non VPlan-native path");
CFG.VPBBsToFix.push_back(PredVPBB);
continue;
}

assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
Expand Down Expand Up @@ -185,6 +200,35 @@ void VPBasicBlock::execute(VPTransformState *State) {
for (VPRecipeBase &Recipe : Recipes)
Recipe.execute(*State);

VPValue *CBV;
if (EnableVPlanNativePath && (CBV = getCondBit())) {
Value *IRCBV = CBV->getUnderlyingValue();
assert(IRCBV && "Unexpected null underlying value for condition bit");

// Delete the condition bit at this point - it should no longer be needed.
delete CBV;
setCondBit(nullptr);

// Condition bit value in a VPBasicBlock is used as the branch selector. In
// the VPlan-native path case, since all branches are uniform we generate a
// branch instruction using the condition value from vector lane 0 and dummy
// successors. The successors are fixed later when the successor blocks are
// visited.
Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0);
NewCond = State->Builder.CreateExtractElement(NewCond,
State->Builder.getInt32(0));

// Replace the temporary unreachable terminator with the new conditional
// branch.
auto *CurrentTerminator = NewBB->getTerminator();
assert(isa<UnreachableInst>(CurrentTerminator) &&
"Expected to replace unreachable terminator with conditional "
"branch.");
auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
CondBr->setSuccessor(0, nullptr);
ReplaceInstWithInst(CurrentTerminator, CondBr);
}

LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
}

Expand All @@ -194,6 +238,20 @@ void VPRegionBlock::execute(VPTransformState *State) {
if (!isReplicator()) {
// Visit the VPBlocks connected to "this", starting from it.
for (VPBlockBase *Block : RPOT) {
if (EnableVPlanNativePath) {
// The inner loop vectorization path does not represent loop preheader
// and exit blocks as part of the VPlan. In the VPlan-native path, skip
// vectorizing the loop preheader block. In the future, we may replace
// this check with a check for the loop preheader.
if (Block->getNumPredecessors() == 0)
continue;

// Skip vectorizing the loop exit block. In the future, we may replace
// this check with a check for the loop exit.
if (Block->getNumSuccessors() == 0)
continue;
}

LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
Expand Down Expand Up @@ -319,11 +377,32 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : depth_first(Entry))
Block->execute(State);

// Setup branch terminator successors for VPBBs in VPBBsToFix based on
// VPBB's successors.
for (auto VPBB : State->CFG.VPBBsToFix) {
assert(EnableVPlanNativePath &&
"Unexpected VPBBsToFix in non VPlan-native path");
BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
assert(BB && "Unexpected null basic block for VPBB");

unsigned Idx = 0;
auto *BBTerminator = BB->getTerminator();

for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
++Idx;
}
}

// 3. Merge the temporary latch created with the last basic-block filled.
BasicBlock *LastBB = State->CFG.PrevBB;
// Connect LastBB to VectorLatchBB to facilitate their merge.
assert(isa<UnreachableInst>(LastBB->getTerminator()) &&
"Expected VPlan CFG to terminate with unreachable");
assert((EnableVPlanNativePath ||
isa<UnreachableInst>(LastBB->getTerminator())) &&
"Expected InnerLoop VPlan CFG to terminate with unreachable");
assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
"Expected VPlan CFG to terminate with branch in NativePath");
LastBB->getTerminator()->eraseFromParent();
BranchInst::Create(VectorLatchBB, LastBB);

Expand All @@ -333,7 +412,9 @@ void VPlan::execute(VPTransformState *State) {
assert(Merged && "Could not merge last basic block with latch.");
VectorLatchBB = LastBB;

updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB);
// We do not attempt to preserve DT for outer loop vectorization currently.
if (!EnableVPlanNativePath)
updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB);
}

void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,10 @@ struct VPTransformState {
/// of replication, maps the BasicBlock of the last replica created.
SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;

/// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
/// up at the end of vector code generation.
SmallVector<VPBasicBlock *, 8> VPBBsToFix;

CFGState() = default;
} CFG;

Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,17 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes(

VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry());
ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());

// Condition bit VPValues get deleted during transformation to VPRecipes.
// Create new VPValues and save them away as condition bits. These will be
// deleted after finalizing the vector IR basic blocks.
for (VPBlockBase *Base : RPOT) {
VPBasicBlock *VPBB = Base->getEntryBasicBlock();
if (auto *CondBit = VPBB->getCondBit()) {
auto *NCondBit = new VPValue(CondBit->getUnderlyingValue());
VPBB->setCondBit(NCondBit);
}
}
for (VPBlockBase *Base : RPOT) {
// Do not widen instructions in pre-header and exit blocks.
if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanValue.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ class VPUser;
// and live-outs which the VPlan will need to fix accordingly.
class VPValue {
friend class VPBuilder;
friend class VPlanHCFGTransforms;
friend class VPBasicBlock;

private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).

Expand Down
82 changes: 82 additions & 0 deletions llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
; extern int arr[8][8];
; extern int arr2[8];
;
; void foo(int n)
; {
; int i1, i2;
;
; #pragma clang loop vectorize(enable) vectorize_width(4)
; for (i1 = 0; i1 < 8; i1++) {
; arr2[i1] = i1;
; for (i2 = 0; i2 < 8; i2++)
; arr[i2][i1] = i1 + n;
; }
; }
;
; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
; CHECK-LABEL: vector.ph:
; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer

; CHECK-LABEL: vector.body:
; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
; CHECK: br label %[[InnerLoop:.+]]

; CHECK: [[InnerLoop]]:
; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true
; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]

; CHECK: [[ForInc]]:
; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body

@arr2 = external global [8 x i32], align 16
@arr = external global [8 x [8 x i32]], align 16

; Function Attrs: norecurse nounwind uwtable
; Test input: a two-deep loop nest. The outer loop (for.body .. for.inc8)
; carries the !llvm.loop metadata !1 on its latch branch, which enables
; vectorization with a width of 4. Both loops count from 0 up to 8 in steps
; of 1, so neither loop needs a zero-trip guard.
define void @foo(i32 %n) {
entry:
br label %for.body

; Outer loop header: stores the truncated outer induction variable to
; @arr2[outer] and computes %add = outer + %n, the value stored by the
; inner loop.
for.body: ; preds = %for.inc8, %entry
%indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
%arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
%0 = trunc i64 %indvars.iv21 to i32
store i32 %0, i32* %arrayidx, align 4
%1 = trunc i64 %indvars.iv21 to i32
%add = add nsw i32 %1, %n
br label %for.body3

; Inner loop (single block, header and latch): stores %add to
; @arr[inner][outer], so the outer induction variable indexes the innermost
; array dimension.
for.body3: ; preds = %for.body3, %for.body
%indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
%arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
store i32 %add, i32* %arrayidx7, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 8
br i1 %exitcond, label %for.inc8, label %for.body3

; Outer loop latch: increments the outer induction variable and leaves the
; loop nest once it reaches 8.
for.inc8: ; preds = %for.body3
%indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
%exitcond23 = icmp eq i64 %indvars.iv.next22, 8
br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1

for.end10: ; preds = %for.inc8
ret void
}

!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}
112 changes: 112 additions & 0 deletions llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
; int A[1024], B[1024];
;
; void foo(int iCount, int c, int jCount)
; {
;
; int i, j;
;
; #pragma clang loop vectorize(enable) vectorize_width(4)
; for (i = 0; i < iCount; i++) {
; A[i] = c;
; for (j = 0; j < jCount; j++) {
; A[i] += B[j] + i;
; }
; }
; }
; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
; CHECK: %[[ZeroTripChk:.*]] = icmp sgt i32 %jCount, 0
; CHECK-LABEL: vector.ph:
; CHECK: %[[CVal0:.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
; CHECK-NEXT: %[[CSplat:.*]] = shufflevector <4 x i32> %[[CVal0]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK: %[[ZVal0:.*]] = insertelement <4 x i1> undef, i1 %[[ZeroTripChk]], i32 0
; CHECK-NEXT: %[[ZSplat:.*]] = shufflevector <4 x i1> %[[ZVal0]], <4 x i1> undef, <4 x i32> zeroinitializer

; CHECK-LABEL: vector.body:
; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
; CHECK: %[[AAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, <4 x i64> %[[VecInd]]
; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[CSplat]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
; CHECK: %[[ZCmpExtr:.*]] = extractelement <4 x i1> %[[ZSplat]], i32 0
; CHECK: br i1 %[[ZCmpExtr]], label %[[InnerForPh:.*]], label %[[OuterInc:.*]]

; CHECK: [[InnerForPh]]:
; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
; CHECK: br label %[[InnerForBody:.*]]

; CHECK: [[InnerForBody]]:
; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ %[[InnerIndNext:.*]], %[[InnerForBody]] ], [ zeroinitializer, %[[InnerForPh]] ]
; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ], [ %[[WideAVal]], %[[InnerForPh]] ]
; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, <4 x i64> %[[InnerInd]]
; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[BAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]]
; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]]
; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]], <i64 1, i64 1, i64 1, i64 1>
; CHECK: %[[InnerVecCond:.*]] = icmp eq <4 x i64> %[[InnerIndNext]], {{.*}}
; CHECK: %[[InnerCond:.+]] = extractelement <4 x i1> %[[InnerVecCond]], i32 0
; CHECK: br i1 %[[InnerCond]], label %[[InnerCrit:.*]], label %[[InnerForBody]]

; CHECK: [[InnerCrit]]:
; CHECK: %[[StorePhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext]], %[[InnerForBody]] ]
; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StorePhi]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
; CHECK: br label %[[ForInc]]

; CHECK: [[ForInc]]:
; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], {{.*}}
; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body

@A = common global [1024 x i32] zeroinitializer, align 16
@B = common global [1024 x i32] zeroinitializer, align 16

; Function Attrs: norecurse nounwind uwtable
; Test input: a two-deep loop nest with runtime trip counts. The outer loop
; (for.body .. for.inc9) carries the !llvm.loop metadata !1 on its latch
; branch, which enables vectorization with a width of 4. The inner loop is
; guarded by a loop-invariant zero-trip check (%cmp220) evaluated inside the
; outer loop body.
define void @foo(i32 %iCount, i32 %c, i32 %jCount) {
entry:
; Skip the whole loop nest when the outer trip count is not positive.
%cmp22 = icmp sgt i32 %iCount, 0
br i1 %cmp22, label %for.body.lr.ph, label %for.end11

; Outer loop preheader: computes the inner-loop zero-trip check and both
; zero-extended trip counts once, outside the loops.
for.body.lr.ph: ; preds = %entry
%cmp220 = icmp sgt i32 %jCount, 0
%wide.trip.count = zext i32 %jCount to i64
%wide.trip.count27 = zext i32 %iCount to i64
br label %for.body

; Outer loop header: stores %c to @A[outer], then either enters the inner
; loop or jumps straight to the outer latch depending on %cmp220.
for.body: ; preds = %for.inc9, %for.body.lr.ph
%indvars.iv25 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next26, %for.inc9 ]
%arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv25
store i32 %c, i32* %arrayidx, align 4
br i1 %cmp220, label %for.body3.lr.ph, label %for.inc9

; Inner loop preheader: reloads @A[outer] as the initial accumulator value
; and truncates the outer induction variable for use in the inner loop.
for.body3.lr.ph: ; preds = %for.body
%arrayidx.promoted = load i32, i32* %arrayidx, align 4
%0 = trunc i64 %indvars.iv25 to i32
br label %for.body3

; Inner loop (single block, header and latch): accumulates
; @B[inner] + outer into the running value carried by the phi %1.
for.body3: ; preds = %for.body3, %for.body3.lr.ph
%indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
%1 = phi i32 [ %arrayidx.promoted, %for.body3.lr.ph ], [ %add8, %for.body3 ]
%arrayidx5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
%2 = load i32, i32* %arrayidx5, align 4
%add = add nsw i32 %2, %0
%add8 = add nsw i32 %add, %1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond1.for.inc9_crit_edge, label %for.body3

; Inner loop exit: writes the final accumulated value back to @A[outer].
for.cond1.for.inc9_crit_edge: ; preds = %for.body3
store i32 %add8, i32* %arrayidx, align 4
br label %for.inc9

; Outer loop latch: increments the outer induction variable and leaves the
; loop nest once it equals the zero-extended %iCount.
for.inc9: ; preds = %for.cond1.for.inc9_crit_edge, %for.body
%indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
%exitcond28 = icmp eq i64 %indvars.iv.next26, %wide.trip.count27
br i1 %exitcond28, label %for.end11, label %for.body, !llvm.loop !1

for.end11: ; preds = %for.inc9, %entry
ret void
}

!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}