Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LoopVectorize] Make interleaved-accesses analysis less conservative …
…about possible pointer-wrap-around concerns, in some cases. Before this patch, collectConstStridedAccesses (part of interleaved-accesses analysis) called getPtrStride with [Assume=false, ShouldCheckWrap=true] when examining all candidate pointers. This is too conservative. Instead, this patch makes collectConstStridedAccesses use an optimistic approach, calling getPtrStride with [Assume=true, ShouldCheckWrap=false], and then, once the candidate interleave groups have been formed, revisits the pointer-wrapping analysis but only where it matters: namely, in groups that have gaps, and where the gaps are not at the very end of the group (in which case the loop is peeled). This second time getPtrStride is called with [Assume=false, ShouldCheckWrap=true], but this could further be improved to using Assume=true, once we also add the logic to track that we are not going to meet the scev runtime checks threshold. Differential Revision: https://reviews.llvm.org/D25276 llvm-svn: 285517
- Loading branch information
Showing
7 changed files
with
255 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
78 changes: 78 additions & 0 deletions
78
llvm/test/Transforms/LoopVectorize/interleaved-accesses-1.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s | ||
|
||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
|
||
; Check that the interleaved-mem-access analysis identifies the access | ||
; to array 'in' as interleaved, despite the possibly wrapping unsigned | ||
; 'out_ix' index. | ||
; | ||
; In this test the interleave-groups are full (have no gaps), so no wrapping | ||
; checks are necessary. We can call getPtrStride with Assume=false and | ||
; ShouldCheckWrap=false to safely figure out that the stride is 2. | ||
|
||
; #include <stdlib.h> | ||
; class Complex { | ||
; private: | ||
; float real_; | ||
; float imaginary_; | ||
; | ||
;public: | ||
; Complex() : real_(0), imaginary_(0) { } | ||
; Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { } | ||
; Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { } | ||
; | ||
; inline float real() const { return real_; } | ||
; inline float imaginary() const { return imaginary_; } | ||
;}; | ||
; | ||
;void test(Complex * __restrict__ out, Complex * __restrict__ in, size_t out_start, size_t size) | ||
;{ | ||
; for (size_t out_offset = 0; out_offset < size; ++out_offset) | ||
; { | ||
; size_t out_ix = out_start + out_offset; | ||
; Complex t0 = in[out_ix]; | ||
; out[out_ix] = t0; | ||
; } | ||
;} | ||
|
||
; CHECK: vector.body: | ||
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 | ||
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> | ||
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> | ||
|
||
%class.Complex = type { float, float } | ||
|
||
; Test input: a scalar loop that copies one %class.Complex element per | ||
; iteration from %in to %out, at element index %out_start + %out_offset. | ||
define void @_Z4testP7ComplexS0_mm(%class.Complex* noalias nocapture %out, %class.Complex* noalias nocapture readonly %in, i64 %out_start, i64 %size) local_unnamed_addr { | ||
entry: | ||
; Skip the loop entirely when %size is zero. | ||
%cmp9 = icmp eq i64 %size, 0 | ||
br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader | ||
|
||
for.body.preheader: | ||
br label %for.body | ||
|
||
for.cond.cleanup.loopexit: | ||
br label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
%out_offset.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] | ||
; Element index shared by the loads and the store; note that this add | ||
; carries no nuw/nsw flags. | ||
%add = add i64 %out_offset.010, %out_start | ||
; Load field 0 and field 1 of in[%add], each as an i32. | ||
%arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %in, i64 %add | ||
%0 = bitcast %class.Complex* %arrayidx to i32* | ||
%1 = load i32, i32* %0, align 4 | ||
%imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %in, i64 %add, i32 1 | ||
%2 = bitcast float* %imaginary_.i.i to i32* | ||
%3 = load i32, i32* %2, align 4 | ||
; Pack the two 32-bit values into one i64 (field 1 in the upper 32 bits, | ||
; field 0 in the lower 32 bits) and write out[%add] with a single i64 store. | ||
%arrayidx1 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add | ||
%4 = bitcast %class.Complex* %arrayidx1 to i64* | ||
%t0.sroa.4.0.insert.ext = zext i32 %3 to i64 | ||
%t0.sroa.4.0.insert.shift = shl nuw i64 %t0.sroa.4.0.insert.ext, 32 | ||
%t0.sroa.0.0.insert.ext = zext i32 %1 to i64 | ||
%t0.sroa.0.0.insert.insert = or i64 %t0.sroa.4.0.insert.shift, %t0.sroa.0.0.insert.ext | ||
store i64 %t0.sroa.0.0.insert.insert, i64* %4, align 4 | ||
; Advance the offset and leave the loop once it equals %size. | ||
%inc = add nuw i64 %out_offset.010, 1 | ||
%exitcond = icmp eq i64 %inc, %size | ||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body | ||
} |
58 changes: 58 additions & 0 deletions
58
llvm/test/Transforms/LoopVectorize/interleaved-accesses-2.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s | ||
|
||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
|
||
; Check that the interleaved-mem-access analysis currently does not create an | ||
; interleave group for the access to array 'in' due to the possibly wrapping | ||
; unsigned '2*out_offset' index. | ||
; | ||
; In this test the interleave-group of the loads is not full (has gaps), so | ||
; the wrapping checks are necessary. Here this cannot be done statically so | ||
; runtime checks are needed, but with Assume=false getPtrStride cannot add | ||
; runtime checks and as a result we can't create the interleave-group. | ||
; | ||
; FIXME: This is currently a missed optimization until we can use Assume=true | ||
; with proper threshold checks. Once we do that the candidate interleave-group | ||
; will not be invalidated by the wrapping checks. | ||
|
||
; #include <stdlib.h> | ||
; void test(float * __restrict__ out, float * __restrict__ in, size_t size) | ||
; { | ||
; for (size_t out_offset = 0; out_offset < size; ++out_offset) | ||
; { | ||
; float t0 = in[2*out_offset]; | ||
; out[out_offset] = t0; | ||
; } | ||
; } | ||
|
||
; CHECK: vector.body: | ||
; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 | ||
; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> | ||
|
||
; Test input: a scalar loop that copies in[2 * i] to out[i] for each i from 0 | ||
; up to %size, moving every float element as a raw i32. | ||
define void @_Z4testPfS_m(float* noalias nocapture %out, float* noalias nocapture readonly %in, i64 %size) local_unnamed_addr { | ||
entry: | ||
; Skip the loop entirely when %size is zero. | ||
%cmp7 = icmp eq i64 %size, 0 | ||
br i1 %cmp7, label %for.cond.cleanup, label %for.body.preheader | ||
|
||
for.body.preheader: | ||
br label %for.body | ||
|
||
for.cond.cleanup.loopexit: | ||
br label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
%out_offset.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] | ||
; Load index is the offset shifted left by one (offset * 2), so only every | ||
; other element of %in is read; note that the shl carries no nuw/nsw flags. | ||
%mul = shl i64 %out_offset.08, 1 | ||
%arrayidx = getelementptr inbounds float, float* %in, i64 %mul | ||
%0 = bitcast float* %arrayidx to i32* | ||
%1 = load i32, i32* %0, align 4 | ||
; The store uses the unscaled offset, so %out is written contiguously. | ||
%arrayidx1 = getelementptr inbounds float, float* %out, i64 %out_offset.08 | ||
%2 = bitcast float* %arrayidx1 to i32* | ||
store i32 %1, i32* %2, align 4 | ||
; Advance the offset and leave the loop once it equals %size. | ||
%inc = add nuw i64 %out_offset.08, 1 | ||
%exitcond = icmp eq i64 %inc, %size | ||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body | ||
} |
57 changes: 57 additions & 0 deletions
57
llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s | ||
|
||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
|
||
; Check that the interleaved-mem-access analysis currently does not create an | ||
; interleave group for access 'a' due to the possible pointer wrap-around. | ||
; | ||
; To begin with, in this test the candidate interleave group can be created | ||
; only when getPtrStride is called with Assume=true. Next, because | ||
; the interleave-group of the loads is not full (has gaps), we also need to check | ||
; for possible pointer wrapping. Here we currently use Assume=false and as a | ||
; result cannot prove the transformation is safe and therefore invalidate the | ||
; candidate interleave group. | ||
; | ||
; FIXME: This is a missed optimization. Once we use Assume=true here, we will | ||
; not have to invalidate the group. | ||
|
||
; void func(unsigned * __restrict a, unsigned * __restrict b, unsigned char x, unsigned char y) { | ||
; int i = 0; | ||
; for (unsigned char index = x; i < y; index +=2, ++i) | ||
; b[i] = a[index] * 2; | ||
; | ||
; } | ||
|
||
; CHECK: vector.body: | ||
; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 | ||
; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> | ||
|
||
; Test input: a scalar loop that stores a[index] * 2 to b[i]. The 8-bit index | ||
; starts at %x and advances by 2 per iteration, while the 64-bit counter i | ||
; runs from 0 up to %y. | ||
define void @_Z4funcPjS_hh(i32* noalias nocapture readonly %a, i32* noalias nocapture %b, i8 zeroext %x, i8 zeroext %y) local_unnamed_addr { | ||
entry: | ||
; Skip the loop entirely when %y is zero. | ||
%cmp9 = icmp eq i8 %y, 0 | ||
br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader | ||
|
||
for.body.preheader: | ||
; Trip count is %y zero-extended to i64. | ||
%wide.trip.count = zext i8 %y to i64 | ||
br label %for.body | ||
|
||
for.cond.cleanup.loopexit: | ||
br label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
; Two induction variables: the i64 store index/loop counter and the i8 load index. | ||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] | ||
%index.011 = phi i8 [ %add, %for.body ], [ %x, %for.body.preheader ] | ||
; The i8 index is zero-extended to i64 before it is used to address %a. | ||
%idxprom = zext i8 %index.011 to i64 | ||
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom | ||
%0 = load i32, i32* %arrayidx, align 4 | ||
; Shifting left by one doubles the loaded value. | ||
%mul = shl i32 %0, 1 | ||
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv | ||
store i32 %mul, i32* %arrayidx2, align 4 | ||
; The i8 index is incremented by 2 with a plain add (no nuw/nsw flags), | ||
; whereas the i64 counter increment is marked nuw nsw. | ||
%add = add i8 %index.011, 2 | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count | ||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body | ||
} |