diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 2d2355d6be68a..7dfa21696201e 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -575,10 +575,13 @@ LLVM_ABI Loop *cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, /// Add code that checks at runtime if the accessed arrays in \p PointerChecks /// overlap. Returns the final comparator value or NULL if no check is needed. +/// Sets \p AllChecksHoisted to true iff \p HoistRuntimeChecks, \p TheLoop has +/// a parent loop and every check is invariant in that parent; else to false. LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl &PointerChecks, - SCEVExpander &Expander, bool HoistRuntimeChecks = false); + SCEVExpander &Expander, bool HoistRuntimeChecks, + bool &AllChecksHoisted); LLVM_ABI Value *addDiffRuntimeChecks( Instruction *Loc, ArrayRef Checks, SCEVExpander &Expander, diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b6ba82288aeb4..41b0bb0b2b318 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -2038,13 +2038,16 @@ expandBounds(const SmallVectorImpl &PointerChecks, Loop *L, Value *llvm::addRuntimeChecks( Instruction *Loc, Loop *TheLoop, const SmallVectorImpl &PointerChecks, - SCEVExpander &Exp, bool HoistRuntimeChecks) { + SCEVExpander &Exp, bool HoistRuntimeChecks, bool &AllChecksHoisted) { // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible. 
// TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp, HoistRuntimeChecks); LLVMContext &Ctx = Loc->getContext(); + auto *SE = Exp.getSE(); + auto *OuterLoop = TheLoop->getParentLoop(); + AllChecksHoisted = HoistRuntimeChecks && OuterLoop != nullptr; IRBuilder ChkBuilder(Ctx, InstSimplifyFolder(Loc->getDataLayout())); ChkBuilder.SetInsertPoint(Loc); // Our instructions might fold to a constant. @@ -2083,6 +2086,20 @@ Value *llvm::addRuntimeChecks( "stride.check"); IsConflict = ChkBuilder.CreateOr(IsConflict, IsNegativeStride); } + + if (AllChecksHoisted) { + AllChecksHoisted &= SE->isLoopInvariant(SE->getSCEV(A.Start), OuterLoop); + AllChecksHoisted &= SE->isLoopInvariant(SE->getSCEV(B.Start), OuterLoop); + AllChecksHoisted &= SE->isLoopInvariant(SE->getSCEV(A.End), OuterLoop); + AllChecksHoisted &= SE->isLoopInvariant(SE->getSCEV(B.End), OuterLoop); + if (A.StrideToCheck) + AllChecksHoisted &= + SE->isLoopInvariant(SE->getSCEV(A.StrideToCheck), OuterLoop); + if (B.StrideToCheck) + AllChecksHoisted &= + SE->isLoopInvariant(SE->getSCEV(B.StrideToCheck), OuterLoop); + } + if (MemoryRuntimeCheck) { IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index ec2e6c1ab796b..a844d7847fa22 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -63,8 +63,10 @@ void LoopVersioning::versionLoop( SCEVExpander Exp2(*RtPtrChecking.getSE(), VersionedLoop->getHeader()->getDataLayout(), "induction"); - MemRuntimeCheck = addRuntimeChecks(RuntimeCheckBB->getTerminator(), - VersionedLoop, AliasChecks, Exp2); + bool AllChecksHoisted; + MemRuntimeCheck = + addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop, + AliasChecks, Exp2, false, AllChecksHoisted); SCEVExpander Exp(*SE, 
RuntimeCheckBB->getDataLayout(), "scev.check"); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ca092dcfcb492..32a528c4328d0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1763,6 +1763,10 @@ class GeneratedRTChecks { /// If it is nullptr no memory runtime checks have been generated. Value *MemRuntimeCheckCond = nullptr; + /// True if memory checks are outer-loop invariant (hoistable). + /// Used to discount check cost for inner loops. + bool AllChecksHoisted = false; + DominatorTree *DT; LoopInfo *LI; TargetTransformInfo *TTI; @@ -1845,7 +1849,8 @@ class GeneratedRTChecks { } else { MemRuntimeCheckCond = addRuntimeChecks( MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), - MemCheckExp, VectorizerParams::HoistRuntimeChecks); + MemCheckExp, VectorizerParams::HoistRuntimeChecks, + AllChecksHoisted); } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " @@ -1928,13 +1933,11 @@ class GeneratedRTChecks { // the checks will likely be hoisted out and so the effective cost will // reduce according to the outer loop trip count. if (OuterLoop) { - ScalarEvolution *SE = MemCheckExp.getSE(); // TODO: If profitable, we could refine this further by analysing every // individual memory check, since there could be a mixture of loop // variant and invariant checks that mean the final condition is // variant. - const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); - if (SE->isLoopInvariant(Cond, OuterLoop)) { + if (AllChecksHoisted) { // It seems reasonable to assume that we can reduce the effective // cost of the checks even when we know nothing about the trip // count. Assume that the outer loop executes at least twice. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll index 611b980999bfe..636efe5a94056 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll @@ -250,6 +250,44 @@ outer.exit: ret void } +define void @outer_cannot_hoist(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, ptr nocapture noundef readonly %offsets, i64 noundef %n) { +; CHECK-LABEL: LV: Checking a loop in 'outer_cannot_hoist' +; CHECK: Calculating cost of runtime checks: +; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. +; CHECK: Total cost of runtime checks: 7 +; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8 +entry: + br label %outer.loop + +outer.loop: + %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ] + %offset.ptr = getelementptr inbounds i64, ptr %offsets, i64 %outer.iv + %offset = load i64, ptr %offset.ptr, align 8 + %0 = mul nsw i64 %offset, %n + br label %inner.loop + +inner.loop: + %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] + %1 = add nuw nsw i64 %iv.inner, %0 + %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1 + %2 = load i32, ptr %arrayidx.us, align 4 + %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1 + %3 = load i32, ptr %arrayidx8.us, align 4 + %add9.us = add nsw i32 %3, %2 + store i32 %add9.us, ptr %arrayidx8.us, align 4 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %inner.exit.cond = icmp eq i64 %iv.inner.next, %n + br i1 %inner.exit.cond, label %inner.exit, label %inner.loop + +inner.exit: + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %outer.exit.cond = icmp eq i64 %outer.iv.next, 3 + br i1 %outer.exit.cond, label %outer.exit, label %outer.loop + +outer.exit: + ret void +} + !0 = !{!"branch_weights", i32 10, i32 20} !1 = !{!"branch_weights", 
i32 1, i32 -1} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll index 9ace6be64b69a..e7b0db4269da9 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll @@ -20,8 +20,7 @@ define void @expand(ptr %src, ptr %dst, i64 %0) { ; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP6]], i64 1000) ; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[SMAX6]], 4 ; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]] -; CHECK-NEXT: [[SMAX8:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP6]], i64 1000) -; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[SMAX8]], [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[SMAX6]], [[TMP0]] ; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] ; CHECK: [[OUTER_HEADER]]: ; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]