[LV] Epilogue Vectorization with Optimal Control Flow

This is yet another attempt at providing support for epilogue vectorization following discussions raised in RFC http://llvm.1065342.n5.nabble.com/llvm-dev-Proposal-RFC-Epilog-loop-vectorization-tt106322.html#none and reviews D30247 and D88819. Similar to D88819, this patch achieves epilogue vectorization by executing a single vplan twice: once on the main loop and a second time on the epilogue loop (using a different VF). However, it's able to handle more loops, and generates more optimal control flow for cases where the trip count is too small to execute any code in vector form. Reviewed By: SjoerdMeijer Differential Revision: https://reviews.llvm.org/D89566
llvm · Dec 1, 2020 · 9c5504a · 9c5504a
1 parent 843c2b2
commit 9c5504a
Show file tree

Hide file tree

Showing 12 changed files with 2,076 additions and 72 deletions.
diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst
@@ -370,6 +370,25 @@ to be used simultaneously.
 The Loop Vectorizer uses a cost model to decide when it is profitable to unroll loops.
 The decision to unroll the loop depends on the register pressure and the generated code size. 
 
+Epilogue Vectorization
+^^^^^^^^^^^^^^^^^^^^^^
+
+When vectorizing a loop, often a scalar remainder (epilogue) loop is necessary
+to execute tail iterations of the loop if the loop trip count is unknown or is
+not evenly divisible by the vectorization and unroll factors. When the
+vectorization and unroll factors are large, it's possible for loops with smaller
+trip counts to end up spending most of their time in the scalar (rather than
+the vector) code. In order to address this issue, the inner loop vectorizer is
+enhanced with a feature that allows it to vectorize epilogue loops with a
+vectorization and unroll factor combination that makes it more likely for small
+trip count loops to still execute in vectorized code. The diagram below shows
+the CFG for a typical epilogue vectorized loop with runtime checks. As
+illustrated, the control flow is structured in a way that avoids duplicating the
+runtime pointer checks and optimizes the path length for loops that have very
+small trip counts.
+
+.. image:: epilogue-vectorization-cfg.png
+
 Performance
 -----------
 

diff --git a/llvm/docs/epilogue-vectorization-cfg.png b/llvm/docs/epilogue-vectorization-cfg.png
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,6 +184,10 @@ struct VectorizationFactor {
   bool operator==(const VectorizationFactor &rhs) const {
     return Width == rhs.Width && Cost == rhs.Cost;
   }
+
+  /// Inequality, defined as the exact negation of operator==.
+  bool operator!=(const VectorizationFactor &rhs) const {
+    return !operator==(rhs);
+  }
 };
 
 /// Planner drives the vectorization process after having passed
@@ -265,6 +269,18 @@ class LoopVectorizationPlanner {
       O << *Plan;
   }
 
+  /// Look through the existing plans and return true if a single plan
+  /// supports every vectorization factor in \p VFs.
+  ///
+  /// \param VFs the vectorization factors that must all be present in the
+  ///            same plan.
+  /// \returns true if at least one plan in VPlans reports hasVF() for each
+  ///          element of \p VFs. Note that for an empty \p VFs the inner
+  ///          all_of is vacuously true, so the result is true whenever
+  ///          VPlans is non-empty.
+  bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const {
+    return any_of(VPlans, [&](const VPlanPtr &Plan) {
+      return all_of(
+          VFs, [&](const ElementCount &VF) { return Plan->hasVF(VF); });
+    });
+  }
+
   /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
   /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
   /// returned value holds for the entire \p Range.

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
@@ -0,0 +1,133 @@
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s
+
+; TODO: For now test for the `-epilogue-vectorization-minimum-VF` option. In
+; the future we need to replace this with a more meaningful test of the
+; epilogue vectorization cost-model.
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-minimum-VF=4 -S | FileCheck %s --check-prefix=CHECK-MIN-4
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s --check-prefix=CHECK-MIN-D
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Do not vectorize epilogues for loops with minsize attribute
+; CHECK-LABEL: @f1
+; CHECK-NOT: vector.main.loop.iter.check
+; CHECK-NOT: vec.epilog.iter.check
+; CHECK-NOT: vec.epilog.ph
+; CHECK-NOT: vec.epilog.vector.body
+; CHECK-NOT: vec.epilog.middle.block
+
+define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #0 { ; attribute group #0 is minsize
+entry:
+  %cmp1 = icmp sgt i32 %N, 0 ; signed test: is N > 0?
+  br i1 %cmp1, label %for.body.preheader, label %for.end ; bypass the loop when N <= 0
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64 ; trip count N widened to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; induction variable i, starting at 0
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4 ; bb[i]
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4 ; cc[i]
+  %add = fadd fast float %0, %1 ; bb[i] + cc[i] with fast-math flags
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4 ; aa[i] = bb[i] + cc[i]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; i + 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count ; true while i + 1 != trip count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit ; loop back edge or exit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Do not vectorize epilogues for loops with optsize attribute
+; CHECK-LABEL: @f2
+; CHECK-NOT: vector.main.loop.iter.check
+; CHECK-NOT: vec.epilog.iter.check
+; CHECK-NOT: vec.epilog.ph
+; CHECK-NOT: vec.epilog.vector.body
+; CHECK-NOT: vec.epilog.middle.block
+
+define dso_local void @f2(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #1 { ; attribute group #1 is optsize
+entry:
+  %cmp1 = icmp sgt i32 %N, 0 ; signed test: is N > 0?
+  br i1 %cmp1, label %for.body.preheader, label %for.end ; bypass the loop when N <= 0
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64 ; trip count N widened to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; induction variable i, starting at 0
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4 ; bb[i]
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4 ; cc[i]
+  %add = fadd fast float %0, %1 ; bb[i] + cc[i] with fast-math flags
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4 ; aa[i] = bb[i] + cc[i]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; i + 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count ; true while i + 1 != trip count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit ; loop back edge or exit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Do not vectorize the epilogue for loops with VF less than the default -epilogue-vectorization-minimum-VF of 16.
+; CHECK-MIN-D-LABEL: @f3
+; CHECK-MIN-D-NOT: vector.main.loop.iter.check
+; CHECK-MIN-D-NOT: vec.epilog.iter.check
+; CHECK-MIN-D-NOT: vec.epilog.ph
+; CHECK-MIN-D-NOT: vec.epilog.vector.body
+; CHECK-MIN-D-NOT: vec.epilog.middle.block
+
+; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and
+; make sure the epilogue gets vectorized in that case.
+; CHECK-MIN-4-LABEL: @f3
+; CHECK-MIN-4: vector.main.loop.iter.check
+; CHECK-MIN-4: vec.epilog.iter.check
+; CHECK-MIN-4: vec.epilog.ph
+; CHECK-MIN-4: vec.epilog.vector.body
+; CHECK-MIN-4: vec.epilog.middle.block
+
+define dso_local void @f3(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) { ; no attribute group (neither minsize nor optsize)
+entry:
+  %cmp1 = icmp sgt i32 %N, 0 ; signed test: is N > 0?
+  br i1 %cmp1, label %for.body.preheader, label %for.end ; bypass the loop when N <= 0
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64 ; trip count N widened to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; induction variable i, starting at 0
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4 ; bb[i]
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4 ; cc[i]
+  %add = fadd fast float %0, %1 ; bb[i] + cc[i] with fast-math flags
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4 ; aa[i] = bb[i] + cc[i]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; i + 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count ; true while i + 1 != trip count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit ; loop back edge or exit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { minsize }
+attributes #1 = { optsize }