Skip to content

Commit

Permalink
Merging r279930:
Browse files Browse the repository at this point in the history
------------------------------------------------------------------------
r279930 | elena.demikhovsky | 2016-08-28 01:53:53 -0700 (Sun, 28 Aug 2016) | 7 lines

[Loop Vectorizer] Fixed memory confilict checks.

Fixed a bug in run-time checks for possible memory conflicts inside loop.
The bug is in Low <-> High boundaries calculation. The High boundary
should be calculated as "last memory access pointer + element size".

Differential revision: https://reviews.llvm.org/D23176

------------------------------------------------------------------------

llvm-svn: 287779
  • Loading branch information
tstellarAMD committed Nov 23, 2016
1 parent 38f95bb commit 6f9836e
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 30 deletions.
6 changes: 4 additions & 2 deletions llvm/include/llvm/Analysis/LoopAccessAnalysis.h
Expand Up @@ -334,9 +334,11 @@ class RuntimePointerChecking {
struct PointerInfo {
/// Holds the pointer value that we need to check.
TrackingVH<Value> PointerValue;
/// Holds the pointer value at the beginning of the loop.
/// Holds the smallest byte address accessed by the pointer throughout all
/// iterations of the loop.
const SCEV *Start;
/// Holds the pointer value at the end of the loop.
/// Holds the largest byte address accessed by the pointer throughout all
/// iterations of the loop, plus 1.
const SCEV *End;
/// Holds the information if this pointer is used for writing to memory.
bool IsWritePtr;
Expand Down
32 changes: 29 additions & 3 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
Expand Up @@ -148,6 +148,19 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
return OrigSCEV;
}

/// Calculate Start and End points of memory access.
/// Let's assume A is the first access and B is a memory access on N-th loop
/// iteration. Then B is calculated as:
/// B = A + Step*N .
/// Step value may be positive or negative.
/// N is a calculated back-edge taken count:
/// N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
/// Start and End points are calculated in the following way:
/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
/// where SizeOfElt is the size of single memory access in bytes.
///
/// There is no conflict when the intervals are disjoint:
/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
unsigned DepSetId, unsigned ASId,
const ValueToValueMap &Strides,
Expand Down Expand Up @@ -176,12 +189,17 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
if (CStep->getValue()->isNegative())
std::swap(ScStart, ScEnd);
} else {
// Fallback case: the step is not constant, but the we can still
// Fallback case: the step is not constant, but we can still
// get the upper and lower bounds of the interval by using min/max
// expressions.
ScStart = SE->getUMinExpr(ScStart, ScEnd);
ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
}
// Add the size of the pointed element to ScEnd.
unsigned EltSize =
Ptr->getType()->getPointerElementType()->getScalarSizeInBits() / 8;
const SCEV *EltSizeSCEV = SE->getConstant(ScEnd->getType(), EltSize);
ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
}

Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
Expand Down Expand Up @@ -1863,9 +1881,17 @@ std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");

Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
// [A|B].Start points to the first accessed byte under base [A|B].
// [A|B].End points to the last accessed byte, plus one.
// There is no conflict when the intervals are disjoint:
// NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
//
// bound0 = (B.Start < A.End)
// bound1 = (A.Start < B.End)
// IsConflict = bound0 & bound1
Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
Expand Down
51 changes: 51 additions & 0 deletions llvm/test/Analysis/LoopAccessAnalysis/memcheck-off-by-one-error.ll
@@ -0,0 +1,51 @@
; RUN: opt -analyze --loop-accesses %s | FileCheck %s

; This test verifies run-time boundary check of memory accesses.
; The original loop:
; void fastCopy(const char* src, char* op) {
; int len = 32;
; while (len > 0) {
; *(reinterpret_cast<long long*>(op)) = *(reinterpret_cast<const long long*>(src));
; src += 8;
; op += 8;
; len -= 8;
; }
; }
; Boundaries calculations before this patch:
; (Low: %src High: (24 + %src))
; and the actual distance between two pointers was 31, (%op - %src = 31)
; IsConflict = (24 > 31) = false -> execution is directed to the vectorized loop.
; The loop was vectorized to 4, 32 byte memory access ( <4 x i64> ),
; store a value at *%op touched memory under *%src.

;CHECK: Printing analysis 'Loop Access Analysis' for function 'fastCopy'
;CHECK: (Low: %op High: (32 + %op))
;CHECK: (Low: %src High: (32 + %src))

define void @fastCopy(i8* nocapture readonly %src, i8* nocapture %op) {
entry:
br label %while.body.preheader

while.body.preheader: ; preds = %entry
br label %while.body

while.body: ; preds = %while.body.preheader, %while.body
%len.addr.07 = phi i32 [ %sub, %while.body ], [ 32, %while.body.preheader ]
%op.addr.06 = phi i8* [ %add.ptr1, %while.body ], [ %op, %while.body.preheader ]
%src.addr.05 = phi i8* [ %add.ptr, %while.body ], [ %src, %while.body.preheader ]
%0 = bitcast i8* %src.addr.05 to i64*
%1 = load i64, i64* %0, align 8
%2 = bitcast i8* %op.addr.06 to i64*
store i64 %1, i64* %2, align 8
%add.ptr = getelementptr inbounds i8, i8* %src.addr.05, i64 8
%add.ptr1 = getelementptr inbounds i8, i8* %op.addr.06, i64 8
%sub = add nsw i32 %len.addr.07, -8
%cmp = icmp sgt i32 %len.addr.07, 8
br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit: ; preds = %while.body
br label %while.end

while.end: ; preds = %while.end.loopexit, %entry
ret void
}
18 changes: 9 additions & 9 deletions llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
Expand Up @@ -96,15 +96,15 @@ for.end: ; preds = %for.body
; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %ind
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group {{.*}}[[ZERO]]:
; CHECK-NEXT: (Low: %c High: (78 + %c))
; CHECK-NEXT: (Low: %c High: (80 + %c))
; CHECK-NEXT: Member: {(2 + %c)<nsw>,+,4}
; CHECK-NEXT: Member: {%c,+,4}
; CHECK-NEXT: Group {{.*}}[[ONE]]:
; CHECK-NEXT: (Low: %a High: (40 + %a))
; CHECK-NEXT: (Low: %a High: (42 + %a))
; CHECK-NEXT: Member: {(2 + %a)<nsw>,+,2}
; CHECK-NEXT: Member: {%a,+,2}
; CHECK-NEXT: Group {{.*}}[[TWO]]:
; CHECK-NEXT: (Low: %b High: (38 + %b))
; CHECK-NEXT: (Low: %b High: (40 + %b))
; CHECK-NEXT: Member: {%b,+,2}

define void @testg(i16* %a,
Expand Down Expand Up @@ -168,15 +168,15 @@ for.end: ; preds = %for.body
; CHECK-NEXT: %arrayidxB = getelementptr i16, i16* %b, i64 %ind
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group {{.*}}[[ZERO]]:
; CHECK-NEXT: (Low: %c High: (78 + %c))
; CHECK-NEXT: (Low: %c High: (80 + %c))
; CHECK-NEXT: Member: {(2 + %c)<nsw>,+,4}
; CHECK-NEXT: Member: {%c,+,4}
; CHECK-NEXT: Group {{.*}}[[ONE]]:
; CHECK-NEXT: (Low: %a High: (40 + %a))
; CHECK-NEXT: (Low: %a High: (42 + %a))
; CHECK-NEXT: Member: {(2 + %a),+,2}
; CHECK-NEXT: Member: {%a,+,2}
; CHECK-NEXT: Group {{.*}}[[TWO]]:
; CHECK-NEXT: (Low: %b High: (38 + %b))
; CHECK-NEXT: (Low: %b High: (40 + %b))
; CHECK-NEXT: Member: {%b,+,2}

define void @testh(i16* %a,
Expand Down Expand Up @@ -247,13 +247,13 @@ for.end: ; preds = %for.body
; CHECK-NEXT: %arrayidxA2 = getelementptr i16, i16* %a, i64 %ind2
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group {{.*}}[[ZERO]]:
; CHECK-NEXT: (Low: ((2 * %offset) + %a)<nsw> High: (9998 + (2 * %offset) + %a))
; CHECK-NEXT: (Low: ((2 * %offset) + %a)<nsw> High: (10000 + (2 * %offset) + %a))
; CHECK-NEXT: Member: {((2 * %offset) + %a)<nsw>,+,2}<nsw><%for.body>
; CHECK-NEXT: Group {{.*}}[[ONE]]:
; CHECK-NEXT: (Low: %a High: (9998 + %a))
; CHECK-NEXT: (Low: %a High: (10000 + %a))
; CHECK-NEXT: Member: {%a,+,2}<%for.body>
; CHECK-NEXT: Group {{.*}}[[TWO]]:
; CHECK-NEXT: (Low: (20000 + %a) High: (29998 + %a))
; CHECK-NEXT: (Low: (20000 + %a) High: (30000 + %a))
; CHECK-NEXT: Member: {(20000 + %a),+,2}<%for.body>

define void @testi(i16* %a,
Expand Down
Expand Up @@ -16,7 +16,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnueabi"

; CHECK: function 'f':
; CHECK: (Low: (20000 + %a) High: (60000 + %a)<nsw>)
; CHECK: (Low: (20000 + %a) High: (60004 + %a))

@B = common global i32* null, align 8
@A = common global i32* null, align 8
Expand Down Expand Up @@ -59,7 +59,7 @@ for.end: ; preds = %for.body
; Here it is not obvious what the limits are, since 'step' could be negative.

; CHECK: Low: (-1 + (-1 * ((-60001 + (-1 * %a)) umax (-60001 + (40000 * %step) + (-1 * %a)))))
; CHECK: High: ((60000 + %a)<nsw> umax (60000 + (-40000 * %step) + %a))
; CHECK: High: (4 + ((60000 + %a)<nsw> umax (60000 + (-40000 * %step) + %a)))

define void @g(i64 %step) {
entry:
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
Expand Up @@ -8,10 +8,10 @@ target triple = "x86_64-apple-macosx10.8.0"
;CHECK: br
;CHECK: getelementptr
;CHECK-DAG: getelementptr
;CHECK-DAG: icmp uge
;CHECK-DAG: icmp uge
;CHECK-DAG: icmp uge
;CHECK-DAG: icmp uge
;CHECK-DAG: icmp ugt
;CHECK-DAG: icmp ugt
;CHECK-DAG: icmp ugt
;CHECK-DAG: icmp ugt
;CHECK-DAG: and
;CHECK-DAG: and
;CHECK: br
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
Expand Up @@ -36,7 +36,7 @@ for.end: ; preds = %for.body
; CHECK: ret i32 0

; CHECK-NOTBAA-LABEL: @test1
; CHECK-NOTBAA: icmp uge i32*
; CHECK-NOTBAA: icmp ugt i32*

; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
Expand Down Expand Up @@ -70,8 +70,8 @@ for.end: ; preds = %for.body
; required. Without TBAA, however, two checks are required.

; CHECK-LABEL: @test2
; CHECK: icmp uge float*
; CHECK: icmp uge float*
; CHECK: icmp ugt float*
; CHECK: icmp ugt float*
; CHECK-NOT: icmp uge i32*

; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
Expand All @@ -80,10 +80,10 @@ for.end: ; preds = %for.body
; CHECK: ret i32 0

; CHECK-NOTBAA-LABEL: @test2
; CHECK-NOTBAA: icmp uge float*
; CHECK-NOTBAA: icmp uge float*
; CHECK-NOTBAA-DAG: icmp uge float*
; CHECK-NOTBAA-DAG: icmp uge i32*
; CHECK-NOTBAA: icmp ugt float*
; CHECK-NOTBAA: icmp ugt float*
; CHECK-NOTBAA-DAG: icmp ugt float*
; CHECK-NOTBAA-DAG: icmp ugt i32*

; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
Expand Down
Expand Up @@ -8,15 +8,15 @@
; CHECK-NEXT: Loop Versioning found to be beneficial
;
; CHECK: for.body3:
; CHECK-NEXT: %add86 = phi i32 [ %arrayidx7.promoted, %for.body3.ph ], [ %add8, %for.body3 ]
; CHECK-NEXT: %[[induction:.*]] = phi i32 [ %arrayidx7.promoted, %for.body3.ph ], [ %add8, %for.body3 ]
; CHECK-NEXT: %j.113 = phi i32 [ %j.016, %for.body3.ph ], [ %inc, %for.body3 ]
; CHECK-NEXT: %idxprom = zext i32 %j.113 to i64
; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %var1, i64 %idxprom
; CHECK-NEXT: store i32 %add, i32* %arrayidx, align 4, !alias.scope !6, !noalias !6
; CHECK-NEXT: %add8 = add nsw i32 %add86, %add
; CHECK-NEXT: %add8 = add nsw i32 %[[induction]], %add
; CHECK-NEXT: %inc = add nuw i32 %j.113, 1
; CHECK-NEXT: %cmp2 = icmp ult i32 %inc, %itr
; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit5, !llvm.loop !7
; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit6, !llvm.loop !7
define i32 @foo(i32* nocapture %var1, i32* nocapture readnone %var2, i32* nocapture %var3, i32 %itr) #0 {
entry:
%cmp14 = icmp eq i32 %itr, 0
Expand Down

0 comments on commit 6f9836e

Please sign in to comment.