Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LV] Epilogue Vectorization with Optimal Control Flow
This is yet another attempt at providing support for epilogue vectorization following discussions raised in RFC http://llvm.1065342.n5.nabble.com/llvm-dev-Proposal-RFC-Epilog-loop-vectorization-tt106322.html#none and reviews D30247 and D88819. Similar to D88819, this patch achieves epilogue vectorization by executing a single vplan twice: once on the main loop and a second time on the epilogue loop (using a different VF). However, it's able to handle more loops, and generates more optimal control flow for cases where the trip count is too small to execute any code in vector form. Reviewed By: SjoerdMeijer Differential Revision: https://reviews.llvm.org/D89566
- Loading branch information
Showing
12 changed files
with
2,076 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
133 changes: 133 additions & 0 deletions
133
llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s | ||
|
||
; TODO: For now test for the `-epilogue-vectorization-minimum-VF` option. In | ||
; the future we need to replace this with a more meaningful test of the | ||
; epilogue vectorization cost-model. | ||
; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-minimum-VF=4 -S | FileCheck %s --check-prefix=CHECK-MIN-4 | ||
; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s --check-prefix=CHECK-MIN-D | ||
|
||
target datalayout = "e-m:e-i64:64-n32:64" | ||
target triple = "powerpc64le-unknown-linux-gnu" | ||
|
||
; Do not vectorize epilogues for loops with minsize attribute | ||
; CHECK-LABEL: @f1 | ||
; CHECK-NOT: vector.main.loop.iter.check | ||
; CHECK-NOT: vec.epilog.iter.check | ||
; CHECK-NOT: vec.epilog.ph | ||
; CHECK-NOT: vec.epilog.vector.body | ||
; CHECK-NOT: vec.epilog.middle.block | ||
|
||
; @f1: scalar loop that stores aa[i] = bb[i] + cc[i] (fast-math fadd) for i in [0, N). | ||
; It references attribute group #0, which this file defines as { minsize }. | ||
define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #0 { | ||
entry: | ||
; Guard: only enter the loop when N > 0 (signed compare); otherwise return. | ||
%cmp1 = icmp sgt i32 %N, 0 | ||
br i1 %cmp1, label %for.body.preheader, label %for.end | ||
|
||
for.body.preheader: ; preds = %entry | ||
; Zero-extend N to i64 so it can be compared with the i64 induction variable. | ||
%wide.trip.count = zext i32 %N to i64 | ||
br label %for.body | ||
|
||
for.body: ; preds = %for.body.preheader, %for.body | ||
; Induction variable: starts at 0 and advances by 1 each iteration. | ||
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] | ||
%arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv | ||
%0 = load float, float* %arrayidx, align 4 | ||
%arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv | ||
%1 = load float, float* %arrayidx2, align 4 | ||
%add = fadd fast float %0, %1 | ||
%arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv | ||
store float %add, float* %arrayidx4, align 4 | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
; Keep looping while the incremented index differs from the trip count. | ||
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count | ||
br i1 %exitcond, label %for.body, label %for.end.loopexit | ||
|
||
for.end.loopexit: ; preds = %for.body | ||
br label %for.end | ||
|
||
for.end: ; preds = %for.end.loopexit, %entry | ||
ret void | ||
} | ||
|
||
; Do not vectorize epilogues for loops with optsize attribute | ||
; CHECK-LABEL: @f2 | ||
; CHECK-NOT: vector.main.loop.iter.check | ||
; CHECK-NOT: vec.epilog.iter.check | ||
; CHECK-NOT: vec.epilog.ph | ||
; CHECK-NOT: vec.epilog.vector.body | ||
; CHECK-NOT: vec.epilog.middle.block | ||
|
||
; @f2: scalar loop that stores aa[i] = bb[i] + cc[i] (fast-math fadd) for i in [0, N). | ||
; It references attribute group #1, which this file defines as { optsize }. | ||
define dso_local void @f2(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #1 { | ||
entry: | ||
; Guard: only enter the loop when N > 0 (signed compare); otherwise return. | ||
%cmp1 = icmp sgt i32 %N, 0 | ||
br i1 %cmp1, label %for.body.preheader, label %for.end | ||
|
||
for.body.preheader: ; preds = %entry | ||
; Zero-extend N to i64 so it can be compared with the i64 induction variable. | ||
%wide.trip.count = zext i32 %N to i64 | ||
br label %for.body | ||
|
||
for.body: ; preds = %for.body.preheader, %for.body | ||
; Induction variable: starts at 0 and advances by 1 each iteration. | ||
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] | ||
%arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv | ||
%0 = load float, float* %arrayidx, align 4 | ||
%arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv | ||
%1 = load float, float* %arrayidx2, align 4 | ||
%add = fadd fast float %0, %1 | ||
%arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv | ||
store float %add, float* %arrayidx4, align 4 | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
; Keep looping while the incremented index differs from the trip count. | ||
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count | ||
br i1 %exitcond, label %for.body, label %for.end.loopexit | ||
|
||
for.end.loopexit: ; preds = %for.body | ||
br label %for.end | ||
|
||
for.end: ; preds = %for.end.loopexit, %entry | ||
ret void | ||
} | ||
|
||
; Do not vectorize the epilogue for loops with VF less than the default -epilogue-vectorization-minimum-VF of 16. | ||
; CHECK-MIN-D-LABEL: @f3 | ||
; CHECK-MIN-D-NOT: vector.main.loop.iter.check | ||
; CHECK-MIN-D-NOT: vec.epilog.iter.check | ||
; CHECK-MIN-D-NOT: vec.epilog.ph | ||
; CHECK-MIN-D-NOT: vec.epilog.vector.body | ||
; CHECK-MIN-D-NOT: vec.epilog.middle.block | ||
|
||
; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and | ||
; make sure the epilogue gets vectorized in that case. | ||
; CHECK-MIN-4-LABEL: @f3 | ||
; CHECK-MIN-4: vector.main.loop.iter.check | ||
; CHECK-MIN-4: vec.epilog.iter.check | ||
; CHECK-MIN-4: vec.epilog.ph | ||
; CHECK-MIN-4: vec.epilog.vector.body | ||
; CHECK-MIN-4: vec.epilog.middle.block | ||
|
||
; @f3: scalar loop that stores aa[i] = bb[i] + cc[i] (fast-math fadd) for i in [0, N). | ||
; The definition references no attribute group. | ||
define dso_local void @f3(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) { | ||
entry: | ||
; Guard: only enter the loop when N > 0 (signed compare); otherwise return. | ||
%cmp1 = icmp sgt i32 %N, 0 | ||
br i1 %cmp1, label %for.body.preheader, label %for.end | ||
|
||
for.body.preheader: ; preds = %entry | ||
; Zero-extend N to i64 so it can be compared with the i64 induction variable. | ||
%wide.trip.count = zext i32 %N to i64 | ||
br label %for.body | ||
|
||
for.body: ; preds = %for.body.preheader, %for.body | ||
; Induction variable: starts at 0 and advances by 1 each iteration. | ||
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] | ||
%arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv | ||
%0 = load float, float* %arrayidx, align 4 | ||
%arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv | ||
%1 = load float, float* %arrayidx2, align 4 | ||
%add = fadd fast float %0, %1 | ||
%arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv | ||
store float %add, float* %arrayidx4, align 4 | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
; Keep looping while the incremented index differs from the trip count. | ||
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count | ||
br i1 %exitcond, label %for.body, label %for.end.loopexit | ||
|
||
for.end.loopexit: ; preds = %for.body | ||
br label %for.end | ||
|
||
for.end: ; preds = %for.end.loopexit, %entry | ||
ret void | ||
} | ||
|
||
attributes #0 = { minsize } | ||
attributes #1 = { optsize } |
Oops, something went wrong.