Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LV/LoopAccess] Check statically if an unknown dependence distance can be proven larger than the loop-count
This fixes PR31098: Try to resolve statically data-dependences whose compile-time-unknown distance can be proven larger than the loop-count, instead of resorting to runtime dependence checking (which is not always possible). For vectorization it is sufficient to prove that the dependence distance is >= VF; but in some cases we can prune unknown dependence distances early, even before selecting the VF, and without a runtime test, by comparing the distance against the loop iteration count. Since the vectorized code will be executed only if LoopCount >= VF, proving distance >= LoopCount also guarantees that distance >= VF. This check is also equivalent to the Strong SIV Test. Reviewers: mkuper, anemet, sanjoy Differential Revision: https://reviews.llvm.org/D28044 llvm-svn: 294892
- Loading branch information
Showing
5 changed files
with
285 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
; RUN: opt -loop-accesses -analyze < %s | FileCheck %s | ||
; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s | ||
|
||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
|
||
; Check that the compile-time-unknown dependence distance is resolved | ||
; statically. Due to the non-unit stride of the accesses in this testcase | ||
; we are currently not able to create runtime dependence checks, and therefore | ||
; if we don't resolve the dependence statically we cannot vectorize the loop. | ||
; | ||
; Specifically in this example, during dependence analysis we get 6 unknown | ||
; dependence distances between the 8 real/imaginary accesses below: | ||
; dist = 8*D, 4+8*D, -4+8*D, -8*D, 4-8*D, -4-8*D. | ||
; At compile time we can prove for all of the above that |dist|>loopBound*step | ||
; (where the step is 8 bytes, and the loopBound is D-1), and thereby conclude | ||
; that there are no dependencies (without runtime tests): | ||
; |8*D|>8*D-8, |4+8*D|>8*D-8, |-4+8*D|>8*D-8, etc. | ||
|
||
; #include <stdlib.h> | ||
; class Complex { | ||
; private: | ||
; float real_; | ||
; float imaginary_; | ||
; | ||
; public: | ||
; Complex() : real_(0), imaginary_(0) { } | ||
; Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { } | ||
; Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { } | ||
; | ||
; inline float real() const { return real_; } | ||
; inline float imaginary() const { return imaginary_; } | ||
; | ||
; Complex operator+(const Complex& rhs) const | ||
; { | ||
; return Complex(real_ + rhs.real_, imaginary_ + rhs.imaginary_); | ||
; } | ||
; | ||
; Complex operator-(const Complex& rhs) const | ||
; { | ||
; return Complex(real_ - rhs.real_, imaginary_ - rhs.imaginary_); | ||
; } | ||
; }; | ||
; | ||
; void Test(Complex *out, size_t size) | ||
; { | ||
; size_t D = size / 2; | ||
; for (size_t offset = 0; offset < D; ++offset) | ||
; { | ||
; Complex t0 = out[offset]; | ||
; Complex t1 = out[offset + D]; | ||
; out[offset] = t1 + t0; | ||
; out[offset + D] = t0 - t1; | ||
; } | ||
; } | ||
|
||
; CHECK-LABEL: Test | ||
; CHECK: Memory dependences are safe | ||
|
||
|
||
%class.Complex = type { float, float } | ||
|
||
; @Test(out, size): with D = size >> 1, iterates offset = 0 .. D-1. Each | ||
; iteration loads the {real, imaginary} floats of out[offset] and of | ||
; out[offset + D], then stores their sum to out[offset] and the difference | ||
; (out[offset] - out[offset + D]) to out[offset + D]. | ||
define void @Test(%class.Complex* nocapture %out, i64 %size) local_unnamed_addr { | ||
entry: | ||
; D = size / 2, expressed as a logical shift right by one. | ||
%div = lshr i64 %size, 1 | ||
; Guard: when D == 0 the loop is bypassed entirely. | ||
%cmp47 = icmp eq i64 %div, 0 | ||
br i1 %cmp47, label %for.cond.cleanup, label %for.body.preheader | ||
|
||
for.body.preheader: | ||
br label %for.body | ||
|
||
for.cond.cleanup.loopexit: | ||
br label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
; Induction variable: 0 on entry from the preheader, %inc on the back edge. | ||
%offset.048 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] | ||
; Load field 0 (real) and field 1 (imaginary) of out[offset]. | ||
%0 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 0 | ||
%1 = load float, float* %0, align 4 | ||
%imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 1 | ||
%2 = load float, float* %imaginary_.i.i, align 4 | ||
; Index of the second element: offset + D (flagged nuw: no unsigned wrap). | ||
%add = add nuw i64 %offset.048, %div | ||
; Load field 0 (real) and field 1 (imaginary) of out[offset + D]. | ||
%3 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 0 | ||
%4 = load float, float* %3, align 4 | ||
%imaginary_.i.i28 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 1 | ||
%5 = load float, float* %imaginary_.i.i28, align 4 | ||
; out[offset] = out[offset + D] + out[offset], component-wise. | ||
%add.i = fadd fast float %4, %1 | ||
%add4.i = fadd fast float %5, %2 | ||
store float %add.i, float* %0, align 4 | ||
store float %add4.i, float* %imaginary_.i.i, align 4 | ||
; out[offset + D] = out[offset] - out[offset + D], using the values loaded | ||
; before the stores above. | ||
%sub.i = fsub fast float %1, %4 | ||
%sub4.i = fsub fast float %2, %5 | ||
store float %sub.i, float* %3, align 4 | ||
store float %sub4.i, float* %imaginary_.i.i28, align 4 | ||
; Advance the induction variable and exit once it reaches D. | ||
%inc = add nuw nsw i64 %offset.048, 1 | ||
%exitcond = icmp eq i64 %inc, %div | ||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
; REQUIRES: asserts | ||
; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -debug-only=loop-accesses < %s 2>&1 | FileCheck %s | ||
|
||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
|
||
; Check that the compile-time-unknown dependence distance is resolved | ||
; statically. Due to the non-unit stride of the accesses in this testcase | ||
; we are currently not able to create runtime dependence checks, and therefore | ||
; if we don't resolve the dependence statically we cannot vectorize the loop. | ||
; | ||
; Specifically in this example, during dependence analysis we get 6 unknown | ||
; dependence distances between the 8 real/imaginary accesses below: | ||
; dist = 8*D, 4+8*D, -4+8*D, -8*D, 4-8*D, -4-8*D. | ||
; At compile time we can prove for all of the above that |dist|>loopBound*step | ||
; (where the step is 8 bytes, and the loopBound is D-1), and thereby conclude | ||
; that there are no dependencies (without runtime tests): | ||
; |8*D|>8*D-8, |4+8*D|>8*D-8, |-4+8*D|>8*D-8, etc. | ||
|
||
; #include <stdlib.h> | ||
; class Complex { | ||
; private: | ||
; float real_; | ||
; float imaginary_; | ||
; | ||
; public: | ||
; Complex() : real_(0), imaginary_(0) { } | ||
; Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { } | ||
; Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { } | ||
; | ||
; inline float real() const { return real_; } | ||
; inline float imaginary() const { return imaginary_; } | ||
; | ||
; Complex operator+(const Complex& rhs) const | ||
; { | ||
; return Complex(real_ + rhs.real_, imaginary_ + rhs.imaginary_); | ||
; } | ||
; | ||
; Complex operator-(const Complex& rhs) const | ||
; { | ||
; return Complex(real_ - rhs.real_, imaginary_ - rhs.imaginary_); | ||
; } | ||
; }; | ||
; | ||
; void Test(Complex *out, size_t size) | ||
; { | ||
; size_t D = size / 2; | ||
; for (size_t offset = 0; offset < D; ++offset) | ||
; { | ||
; Complex t0 = out[offset]; | ||
; Complex t1 = out[offset + D]; | ||
; out[offset] = t1 + t0; | ||
; out[offset + D] = t0 - t1; | ||
; } | ||
; } | ||
|
||
; CHECK-LABEL: Test | ||
; CHECK: LAA: No unsafe dependent memory operations in loop. We don't need runtime memory checks. | ||
; CHECK: vector.body: | ||
; CHECK: <4 x i32> | ||
|
||
%class.Complex = type { float, float } | ||
|
||
; @Test(out, size): with D = size >> 1, iterates offset = 0 .. D-1. Each | ||
; iteration loads the {real, imaginary} floats of out[offset] and of | ||
; out[offset + D], then stores their sum to out[offset] and the difference | ||
; (out[offset] - out[offset + D]) to out[offset + D]. | ||
define void @Test(%class.Complex* nocapture %out, i64 %size) local_unnamed_addr { | ||
entry: | ||
; D = size / 2, expressed as a logical shift right by one. | ||
%div = lshr i64 %size, 1 | ||
; Guard: when D == 0 the loop is bypassed entirely. | ||
%cmp47 = icmp eq i64 %div, 0 | ||
br i1 %cmp47, label %for.cond.cleanup, label %for.body.preheader | ||
|
||
for.body.preheader: | ||
br label %for.body | ||
|
||
for.cond.cleanup.loopexit: | ||
br label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
|
||
for.body: | ||
; Induction variable: 0 on entry from the preheader, %inc on the back edge. | ||
%offset.048 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] | ||
; Load field 0 (real) and field 1 (imaginary) of out[offset]. | ||
%0 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 0 | ||
%1 = load float, float* %0, align 4 | ||
%imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 1 | ||
%2 = load float, float* %imaginary_.i.i, align 4 | ||
; Index of the second element: offset + D (flagged nuw: no unsigned wrap). | ||
%add = add nuw i64 %offset.048, %div | ||
; Load field 0 (real) and field 1 (imaginary) of out[offset + D]. | ||
%3 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 0 | ||
%4 = load float, float* %3, align 4 | ||
%imaginary_.i.i28 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 1 | ||
%5 = load float, float* %imaginary_.i.i28, align 4 | ||
; out[offset] = out[offset + D] + out[offset], component-wise. | ||
%add.i = fadd fast float %4, %1 | ||
%add4.i = fadd fast float %5, %2 | ||
store float %add.i, float* %0, align 4 | ||
store float %add4.i, float* %imaginary_.i.i, align 4 | ||
; out[offset + D] = out[offset] - out[offset + D], using the values loaded | ||
; before the stores above. | ||
%sub.i = fsub fast float %1, %4 | ||
%sub4.i = fsub fast float %2, %5 | ||
store float %sub.i, float* %3, align 4 | ||
store float %sub4.i, float* %imaginary_.i.i28, align 4 | ||
; Advance the induction variable and exit once it reaches D. | ||
%inc = add nuw nsw i64 %offset.048, 1 | ||
%exitcond = icmp eq i64 %inc, %div | ||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body | ||
} |