Skip to content

Commit

Permalink
[LV] Epilogue Vectorization with Optimal Control Flow
Browse files Browse the repository at this point in the history
This is yet another attempt at providing support for epilogue
vectorization following discussions raised in RFC http://llvm.1065342.n5.nabble.com/llvm-dev-Proposal-RFC-Epilog-loop-vectorization-tt106322.html#none
and reviews D30247 and D88819.

Similar to D88819, this patch achieves epilogue vectorization by
executing a single vplan twice: once on the main loop and a second
time on the epilogue loop (using a different VF). However it's able
to handle more loops, and generates more optimal control flow for
cases where the trip count is too small to execute any code in vector
form.

Reviewed By: SjoerdMeijer

Differential Revision: https://reviews.llvm.org/D89566
  • Loading branch information
bmahjour committed Dec 1, 2020
1 parent 843c2b2 commit 9c5504a
Show file tree
Hide file tree
Showing 12 changed files with 2,076 additions and 72 deletions.
19 changes: 19 additions & 0 deletions llvm/docs/Vectorizers.rst
Expand Up @@ -370,6 +370,25 @@ to be used simultaneously.
The Loop Vectorizer uses a cost model to decide when it is profitable to unroll loops.
The decision to unroll the loop depends on the register pressure and the generated code size.

Epilogue Vectorization
^^^^^^^^^^^^^^^^^^^^^^

When vectorizing a loop, often a scalar remainder (epilogue) loop is necessary
to execute tail iterations of the loop if the loop trip count is unknown or it
does not evenly divide the vectorization and unroll factors. When the
vectorization and unroll factors are large, it's possible for loops with smaller
trip counts to end up spending most of their time in the scalar (rather than
the vector) code. In order to address this issue, the inner loop vectorizer is
enhanced with a feature that allows it to vectorize epilogue loops with a
vectorization and unroll factor combination that makes it more likely for small
trip count loops to still execute in vectorized code. The diagram below shows
the CFG for a typical epilogue vectorized loop with runtime checks. As
illustrated, the control flow is structured in a way that avoids duplicating the
runtime pointer checks and optimizes the path length for loops that have very
small trip counts.

.. image:: epilogue-vectorization-cfg.png

Performance
-----------

Expand Down
Binary file added llvm/docs/epilogue-vectorization-cfg.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 16 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Expand Up @@ -184,6 +184,10 @@ struct VectorizationFactor {
/// Two factors compare equal only when both their Width and their Cost match.
bool operator==(const VectorizationFactor &rhs) const {
  if (!(Width == rhs.Width))
    return false;
  return Cost == rhs.Cost;
}

/// Inequality is defined as the exact negation of operator==.
bool operator!=(const VectorizationFactor &rhs) const {
  return !operator==(rhs);
}
};

/// Planner drives the vectorization process after having passed
Expand Down Expand Up @@ -265,6 +269,18 @@ class LoopVectorizationPlanner {
O << *Plan;
}

/// Look through the existing plans and return true if we have one with all
/// the vectorization factors in question.
bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const {
return any_of(VPlans, [&](const VPlanPtr &Plan) {
return all_of(VFs, [&](const ElementCount &VF) {
if (Plan->hasVF(VF))
return true;
return false;
});
});
}

/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
/// \p Predicate on Range.Start, possibly decreasing Range.End such that the
/// returned value holds for the entire \p Range.
Expand Down
646 changes: 631 additions & 15 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Large diffs are not rendered by default.

@@ -0,0 +1,133 @@
; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s

; TODO: For now test for the `-epilogue-vectorization-minimum-VF` option. In
; the future we need to replace this with a more meaningful test of the
; epilogue vectorization cost-model.
; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-minimum-VF=4 -S | FileCheck %s --check-prefix=CHECK-MIN-4
; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s --check-prefix=CHECK-MIN-D

target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"

; Do not vectorize epilogues for loops with minsize attribute
; CHECK-LABEL: @f1
; CHECK-NOT: vector.main.loop.iter.check
; CHECK-NOT: vec.epilog.iter.check
; CHECK-NOT: vec.epilog.ph
; CHECK-NOT: vec.epilog.vector.body
; CHECK-NOT: vec.epilog.middle.block

; Element-wise float add: aa[i] = bb[i] + cc[i] for i in [0, N).
; The function carries attribute group #0.
define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #0 {
entry:
; Skip the loop entirely when N is not positive.
%cmp1 = icmp sgt i32 %N, 0
br i1 %cmp1, label %for.body.preheader, label %for.end

for.body.preheader: ; preds = %entry
; Widen the i32 trip count to i64 to match the induction variable.
%wide.trip.count = zext i32 %N to i64
br label %for.body

for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
%1 = load float, float* %arrayidx2, align 4
%add = fadd fast float %0, %1
%arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
store float %add, float* %arrayidx4, align 4
; Advance the index by one and keep looping until it reaches the trip count.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.body, label %for.end.loopexit

for.end.loopexit: ; preds = %for.body
br label %for.end

for.end: ; preds = %for.end.loopexit, %entry
ret void
}

; Do not vectorize epilogues for loops with optsize attribute
; CHECK-LABEL: @f2
; CHECK-NOT: vector.main.loop.iter.check
; CHECK-NOT: vec.epilog.iter.check
; CHECK-NOT: vec.epilog.ph
; CHECK-NOT: vec.epilog.vector.body
; CHECK-NOT: vec.epilog.middle.block

; Element-wise float add: aa[i] = bb[i] + cc[i] for i in [0, N).
; The function carries attribute group #1.
define dso_local void @f2(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #1 {
entry:
; Skip the loop entirely when N is not positive.
%cmp1 = icmp sgt i32 %N, 0
br i1 %cmp1, label %for.body.preheader, label %for.end

for.body.preheader: ; preds = %entry
; Widen the i32 trip count to i64 to match the induction variable.
%wide.trip.count = zext i32 %N to i64
br label %for.body

for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
%1 = load float, float* %arrayidx2, align 4
%add = fadd fast float %0, %1
%arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
store float %add, float* %arrayidx4, align 4
; Advance the index by one and keep looping until it reaches the trip count.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.body, label %for.end.loopexit

for.end.loopexit: ; preds = %for.body
br label %for.end

for.end: ; preds = %for.end.loopexit, %entry
ret void
}

; Do not vectorize the epilogue for loops with VF less than the default -epilogue-vectorization-minimum-VF of 16.
; CHECK-MIN-D-LABEL: @f3
; CHECK-MIN-D-NOT: vector.main.loop.iter.check
; CHECK-MIN-D-NOT: vec.epilog.iter.check
; CHECK-MIN-D-NOT: vec.epilog.ph
; CHECK-MIN-D-NOT: vec.epilog.vector.body
; CHECK-MIN-D-NOT: vec.epilog.middle.block

; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and
; make sure the epilogue gets vectorized in that case.
; CHECK-MIN-4-LABEL: @f3
; CHECK-MIN-4: vector.main.loop.iter.check
; CHECK-MIN-4: vec.epilog.iter.check
; CHECK-MIN-4: vec.epilog.ph
; CHECK-MIN-4: vec.epilog.vector.body
; CHECK-MIN-4: vec.epilog.middle.block

; Element-wise float add: aa[i] = bb[i] + cc[i] for i in [0, N).
; Unlike @f1 and @f2, this function references no attribute group.
define dso_local void @f3(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) {
entry:
; Skip the loop entirely when N is not positive.
%cmp1 = icmp sgt i32 %N, 0
br i1 %cmp1, label %for.body.preheader, label %for.end

for.body.preheader: ; preds = %entry
; Widen the i32 trip count to i64 to match the induction variable.
%wide.trip.count = zext i32 %N to i64
br label %for.body

for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
%1 = load float, float* %arrayidx2, align 4
%add = fadd fast float %0, %1
%arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
store float %add, float* %arrayidx4, align 4
; Advance the index by one and keep looping until it reaches the trip count.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.body, label %for.end.loopexit

for.end.loopexit: ; preds = %for.body
br label %for.end

for.end: ; preds = %for.end.loopexit, %entry
ret void
}

attributes #0 = { minsize }
attributes #1 = { optsize }

0 comments on commit 9c5504a

Please sign in to comment.