Skip to content

Commit

Permalink
[LV] Avoid vectorizing loops under opt for size that involve SCEV checks
Browse files Browse the repository at this point in the history
Fix PR39417, PR39497

The loop vectorizer may generate runtime SCEV checks for overflow and stride==1
cases, which can lead to executing the original scalar loop. Keeping that scalar
loop is forbidden when optimizing for size. An assert introduced in r344743
triggered in the above PRs, showing that this does happen. This patch fixes the
behavior by preventing vectorization in such cases.

Differential Revision: https://reviews.llvm.org/D53612

llvm-svn: 345959
  • Loading branch information
azaks committed Nov 2, 2018
1 parent 5497f65 commit 45a3ca7
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 1 deletion.
26 changes: 25 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -2557,7 +2557,8 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
if (C->isZero())
return;

assert(!Cost->foldTailByMasking() && "Cannot check stride when folding tail");
assert(!Cost->foldTailByMasking() &&
"Cannot SCEV check stride or overflow when folding tail");
// Create a new block containing the stride check.
BB->setName("vector.scevcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
Expand Down Expand Up @@ -4637,6 +4638,29 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return None;
}

if (!PSE.getUnionPredicate().getPredicates().empty()) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
<< "runtime SCEV checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
LLVM_DEBUG(
dbgs()
<< "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
return None;
}

// FIXME: Avoid specializing for stride==1 instead of bailing out.
if (!Legal->getLAI()->getSymbolicStrides().empty()) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
<< "runtime stride == 1 checks needed. Enable vectorization of "
"this loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
LLVM_DEBUG(
dbgs()
<< "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
return None;
}

// If we optimize the program for size, avoid creating the tail loop.
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

Expand Down
60 changes: 60 additions & 0 deletions llvm/test/Transforms/LoopVectorize/X86/optsize.ll
Expand Up @@ -3,6 +3,7 @@
; will produce a tail loop with the optimize for size or the minimize size
; attributes. This is a target-dependent version of the test.
; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s
; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF

target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"

Expand Down Expand Up @@ -136,3 +137,62 @@ for.end: ; preds = %for.body

attributes #1 = { minsize }


; We can't vectorize this one because it would require versioning for
; stride==1, even though TC is a multiple of VF.
; CHECK-LABEL: @scev4stride1
; CHECK-NOT: vector.scevcheck
; CHECK-NOT: vector.body:
; CHECK-LABEL: for.body:
; AUTOVF-LABEL: @scev4stride1
; AUTOVF-NOT: vector.scevcheck
; AUTOVF-NOT: vector.body:
; AUTOVF-LABEL: for.body:
; Test input: for i in [0, 256), loads b[i * %k] and stores the value to a[i].
; The load index is the induction variable multiplied by the argument %k, so
; the access stride into %b is not a compile-time constant. The function is
; tagged with attribute group #2.
define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
for.body.preheader:
br label %for.body

for.body: ; preds = %for.body.preheader, %for.body
; Induction variable: starts at 0 and advances by 1 per iteration.
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
; Index into %b is scaled by the loop-invariant argument %k.
%mul = mul nsw i32 %i.07, %k
%arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
%0 = load i32, i32* %arrayidx, align 4
; The store into %a is indexed directly by the induction variable.
%arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
store i32 %0, i32* %arrayidx1, align 4
%inc = add nuw nsw i32 %i.07, 1
; Constant trip count: the loop exits once %inc reaches 256.
%exitcond = icmp eq i32 %inc, 256
br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit: ; preds = %for.body
ret void
}

attributes #2 = { optsize }


; PR39497
; We can't vectorize this one because it requires versioning for an overflow
; check, and its tiny trip count implies opt-for-size (which otherwise could
; fold the tail by masking).
; CHECK-LABEL: @main
; CHECK-NOT: vector.scevcheck
; CHECK-NOT: vector.body:
; CHECK-LABEL: for.cond:
; AUTOVF-LABEL: @main
; AUTOVF-NOT: vector.scevcheck
; AUTOVF-NOT: vector.body:
; AUTOVF-LABEL: for.cond:
; Test input: a single-block loop whose counter is masked to its low 16 bits on
; every iteration. The loop keeps running while the masked value is below 4
; (unsigned) and then returns 0. The function carries no attribute group.
define i32 @main() local_unnamed_addr {
while.cond:
br label %for.cond

for.cond:
; Counter: starts at 0 and takes %add on the back edge.
%d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ]
; Keep only the low 16 bits of the counter before comparing and incrementing.
%conv = and i32 %d.0, 65535
%cmp = icmp ult i32 %conv, 4
%add = add nuw nsw i32 %conv, 1
; Stay in the loop while the masked counter is < 4.
br i1 %cmp, label %for.cond, label %while.cond.loopexit

while.cond.loopexit:
ret i32 0
}
54 changes: 54 additions & 0 deletions llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -0,0 +1,54 @@
; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; PR39417
; Check that the need for overflow check prevents vectorizing a loop with tiny
; trip count (which implies opt for size).
; CHECK-LABEL: @func_34
; CHECK-NOT: vector.scevcheck
; CHECK-NOT: vector.body:
; CHECK-LABEL: bb67:
; Test input: a single-block loop whose i32 counter is sign-extended from its
; low 16 bits, incremented, and then compared as an i16 against 3. The loop
; body has no memory accesses, and the function carries no attribute group.
define void @func_34() {
bb1:
br label %bb67

bb67:
; Counter: starts at 0 and takes the incremented value on the back edge.
%storemerge2 = phi i32 [ 0, %bb1 ], [ %_tmp2300, %bb67 ]
; shl by 16 followed by ashr by 16 sign-extends the low 16 bits of the counter.
%sext = shl i32 %storemerge2, 16
%_tmp2299 = ashr exact i32 %sext, 16
%_tmp2300 = add nsw i32 %_tmp2299, 1
; The exit test is done on the incremented value truncated to i16.
%_tmp2310 = trunc i32 %_tmp2300 to i16
%_tmp2312 = icmp slt i16 %_tmp2310, 3
; Stay in the loop while the truncated value is < 3 (signed).
br i1 %_tmp2312, label %bb67, label %bb68

bb68:
ret void
}

; Check that the need for stride==1 check prevents vectorizing a loop under opt
; for size.
; CHECK-LABEL: @scev4stride1
; CHECK-NOT: vector.scevcheck
; CHECK-NOT: vector.body:
; CHECK-LABEL: for.body:
; Test input: for i in [0, 1024), loads b[i * %k] and stores the value to a[i].
; The load index is the induction variable multiplied by the argument %k, so
; the access stride into %b is not a compile-time constant. The function is
; tagged with attribute group #0.
define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 {
for.body.preheader:
br label %for.body

for.body:
; Induction variable: starts at 0 and advances by 1 per iteration.
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
; Index into %b is scaled by the loop-invariant argument %k.
%mul = mul nsw i32 %i.07, %k
%arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
%0 = load i32, i32* %arrayidx, align 4
; The store into %a is indexed directly by the induction variable.
%arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
store i32 %0, i32* %arrayidx1, align 4
%inc = add nuw nsw i32 %i.07, 1
; Constant trip count: the loop exits once %inc reaches 1024.
%exitcond = icmp eq i32 %inc, 1024
br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:
ret void
}

attributes #0 = { optsize }

0 comments on commit 45a3ca7

Please sign in to comment.