[LoopVectorize] Allow inner loop runtime checks to be hoisted above an outer loop

Suppose we have a nested loop like this:

  void foo(int32_t *dst, int32_t *src, int m, int n) {
    for (int i = 0; i < m; i++) {
      for (int j = 0; j < n; j++) {
        dst[(i * n) + j] += src[(i * n) + j];
      }
    }
  }

We currently generate runtime memory checks as a precondition for
entering the vectorised version of the inner loop. However, if the
runtime-determined trip count of the inner loop is small, the cost
of these checks becomes significant. This patch attempts to mitigate
that cost by adding a new option to expand the memory ranges being
checked to include the outer loop as well. This leads to runtime
checks that can then be hoisted above the outer loop. For example,
rather than looking for a conflict between the memory ranges:

1. &dst[(i * n)] -> &dst[(i * n) + n]
2. &src[(i * n)] -> &src[(i * n) + n]

we can instead look at the expanded ranges:

1. &dst[0] -> &dst[((m - 1) * n) + n]
2. &src[0] -> &src[((m - 1) * n) + n]

which are outer-loop-invariant. As with many optimisations there is
a trade-off here: with the expanded ranges there is a danger that we
never enter the vectorised inner loop at all, whereas with the
smaller ranges we might have entered it at least once.
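To make the trade-off concrete, here is a hand-written C++ sketch (not the vectorizer's actual output) of what the hoisted guard amounts to for the example above. With the expanded ranges the overlap check is evaluated once, before the outer loop; if it fails anywhere in the nest, the whole nest falls back to scalar code:

```cpp
#include <cstdint>

// Hand-written sketch of the hoisted form: one overlap check over the
// expanded ranges [dst, dst + m*n) and [src, src + m*n), evaluated once
// before the outer loop instead of once per outer iteration.
void foo_hoisted(int32_t *dst, int32_t *src, int m, int n) {
  int64_t total = (int64_t)m * n;
  bool conflict = (uintptr_t)dst < (uintptr_t)(src + total) &&
                  (uintptr_t)src < (uintptr_t)(dst + total);
  if (!conflict) {
    // No aliasing anywhere in the nest: the inner loop can be
    // vectorised with no further per-iteration checks.
    for (int i = 0; i < m; i++)
      for (int j = 0; j < n; j++)
        dst[(i * n) + j] += src[(i * n) + j];
  } else {
    // Possible overlap somewhere in the nest: scalar fallback for
    // the entire nest, even if some outer iterations were safe.
    for (int i = 0; i < m; i++)
      for (int j = 0; j < n; j++)
        dst[(i * n) + j] += src[(i * n) + j];
  }
}
```

In the current (unhoisted) form the equivalent check uses `&dst[i * n]` and `&src[i * n]`, so it must be re-evaluated on every outer iteration, which is what this patch avoids.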

I have added a HoistRuntimeChecks option that is turned off by
default, but can be enabled for workloads where we know this is
guaranteed to be of real benefit. In future, we can also use
PGO to determine if this is worthwhile by using the inner loop
trip count information.
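The option is registered as the hidden cl::opt `hoist-runtime-checks` in LoopAccessAnalysis.cpp (see the diff below). One way to enable it is via `-mllvm` option forwarding through the clang driver, or directly with opt; exact invocations will vary with your toolchain setup:

```shell
# Opt in to hoisting of inner-loop runtime checks (default: off).
clang -Ofast -mllvm -hoist-runtime-checks -c foo.c -o foo.o

# Equivalently, when running the pass directly on IR with opt:
opt -passes=loop-vectorize -hoist-runtime-checks -S foo.ll -o foo.vec.ll
```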

When enabling this option for SPEC2017 on neoverse-v1 with the
flags "-Ofast -mcpu=native -flto" I see an overall geomean
improvement of ~0.5%:

SPEC2017 results (+ is an improvement, - is a regression):
520.omnetpp: +2%
525.x264: +2%
557.xz: +1.2%
...
GEOMEAN: +0.5%

I didn't investigate all the differences to see if they are
genuine or noise, but I know the x264 improvement is real because
it has some hot nested loops with low trip counts where I can
see this hoisting is beneficial.

Tests have been added here:

  Transforms/LoopVectorize/runtime-checks-hoist.ll

Differential Revision: https://reviews.llvm.org/D152366
david-arm committed Aug 24, 2023
1 parent d86a7d6 commit c02184f
Showing 6 changed files with 426 additions and 297 deletions.
7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -47,6 +47,13 @@ struct VectorizerParams {
/// \When performing memory disambiguation checks at runtime do not
/// make more than this number of comparisons.
static unsigned RuntimeMemoryCheckThreshold;

// When creating runtime checks for nested loops, where possible try to
// write the checks in a form that allows them to be easily hoisted out of
// the outermost loop. For example, we can do this by expanding the range of
// addresses considered to include the entire nested loop so that they are
// loop invariant.
static bool HoistRuntimeChecks;
};

/// Checks memory dependences among accesses to the same underlying
2 changes: 1 addition & 1 deletion llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -523,7 +523,7 @@ Loop *cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
Value *
addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
SCEVExpander &Expander);
SCEVExpander &Expander, bool HoistRuntimeChecks = false);

Value *addDiffRuntimeChecks(
Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
30 changes: 30 additions & 0 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -140,6 +140,13 @@ static cl::opt<bool> SpeculateUnitStride(
cl::desc("Speculate that non-constant strides are unit in LAA"),
cl::init(true));

static cl::opt<bool, true> HoistRuntimeChecks(
"hoist-runtime-checks", cl::Hidden,
cl::desc(
"Hoist inner loop runtime memory checks to outer loop if possible"),
cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(false));
bool VectorizerParams::HoistRuntimeChecks;

bool VectorizerParams::isInterleaveForced() {
return ::VectorizationInterleave.getNumOccurrences() > 0;
}
@@ -329,6 +336,29 @@ void RuntimePointerChecking::tryToCreateDiffCheck(
CanUseDiffCheck = false;
return;
}

const Loop *InnerLoop = SrcAR->getLoop();
// If the start values for both Src and Sink also vary according to an outer
// loop, then it's probably better to avoid creating diff checks because
// they may not be hoisted. We should instead let llvm::addRuntimeChecks
// do the expanded full range overlap checks, which can be hoisted.
if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
const Loop *StartARLoop = SrcStartAR->getLoop();
if (StartARLoop == SinkStartAR->getLoop() &&
StartARLoop == InnerLoop->getParentLoop()) {
LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
"cannot be hoisted out of the outer loop\n");
CanUseDiffCheck = false;
return;
}
}

LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
<< "SrcStart: " << *SrcStartInt << '\n'
<< "SinkStartInt: " << *SinkStartInt << '\n');
DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
Src->NeedsFreeze || Sink->NeedsFreeze);
}
83 changes: 73 additions & 10 deletions llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1628,42 +1628,92 @@ Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
struct PointerBounds {
TrackingVH<Value> Start;
TrackingVH<Value> End;
Value *StrideToCheck;
};

/// Expand code for the lower and upper bound of the pointer group \p CG
/// in \p TheLoop. \return the values for the bounds.
static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG,
Loop *TheLoop, Instruction *Loc,
SCEVExpander &Exp) {
SCEVExpander &Exp, bool HoistRuntimeChecks) {
LLVMContext &Ctx = Loc->getContext();
Type *PtrArithTy = PointerType::get(Ctx, CG->AddressSpace);

Value *Start = nullptr, *End = nullptr;
LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
const SCEV *Low = CG->Low, *High = CG->High, *Stride = nullptr;

// If the Low and High values are themselves loop-variant, then we may want
// to expand the range to include those covered by the outer loop as well.
// There is a trade-off here with the advantage being that creating checks
// using the expanded range permits the runtime memory checks to be hoisted
// out of the outer loop. This reduces the cost of entering the inner loop,
// which can be significant for low trip counts. The disadvantage is that
// there is a chance we may now never enter the vectorized inner loop,
// whereas using a restricted range check could have allowed us to enter at
// least once. This is why the behaviour is not currently the default and is
// controlled by the parameter 'HoistRuntimeChecks'.
if (HoistRuntimeChecks && TheLoop->getParentLoop() &&
isa<SCEVAddRecExpr>(High) && isa<SCEVAddRecExpr>(Low)) {
auto *HighAR = cast<SCEVAddRecExpr>(High);
auto *LowAR = cast<SCEVAddRecExpr>(Low);
const Loop *OuterLoop = TheLoop->getParentLoop();
const SCEV *Recur = LowAR->getStepRecurrence(*Exp.getSE());
if (Recur == HighAR->getStepRecurrence(*Exp.getSE()) &&
HighAR->getLoop() == OuterLoop && LowAR->getLoop() == OuterLoop) {
BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
const SCEV *OuterExitCount =
Exp.getSE()->getExitCount(OuterLoop, OuterLoopLatch);
if (!isa<SCEVCouldNotCompute>(OuterExitCount) &&
OuterExitCount->getType()->isIntegerTy()) {
const SCEV *NewHigh = cast<SCEVAddRecExpr>(High)->evaluateAtIteration(
OuterExitCount, *Exp.getSE());
if (!isa<SCEVCouldNotCompute>(NewHigh)) {
LLVM_DEBUG(dbgs() << "LAA: Expanded RT check for range to include "
"outer loop in order to permit hoisting\n");
High = NewHigh;
Low = cast<SCEVAddRecExpr>(Low)->getStart();
// If there is a possibility that the stride is negative then we have
// to generate extra checks to ensure the stride is positive.
if (!Exp.getSE()->isKnownNonNegative(Recur)) {
Stride = Recur;
LLVM_DEBUG(dbgs() << "LAA: ... but need to check stride is "
"positive: "
<< *Stride << '\n');
}
}
}
}
}

Start = Exp.expandCodeFor(Low, PtrArithTy, Loc);
End = Exp.expandCodeFor(High, PtrArithTy, Loc);
if (CG->NeedsFreeze) {
IRBuilder<> Builder(Loc);
Start = Builder.CreateFreeze(Start, Start->getName() + ".fr");
End = Builder.CreateFreeze(End, End->getName() + ".fr");
}
LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High << "\n");
return {Start, End};
Value *StrideVal =
Stride ? Exp.expandCodeFor(Stride, Type::getInt64Ty(Ctx), Loc) : nullptr;
LLVM_DEBUG(dbgs() << "Start: " << *Low << " End: " << *High << "\n");
return {Start, End, StrideVal};
}

/// Turns a collection of checks into a collection of expanded upper and
/// lower bounds for both pointers in the check.
static SmallVector<std::pair<PointerBounds, PointerBounds>, 4>
expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
Instruction *Loc, SCEVExpander &Exp) {
Instruction *Loc, SCEVExpander &Exp, bool HoistRuntimeChecks) {
SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;

// Here we're relying on the SCEV Expander's cache to only emit code for the
// same bounds once.
transform(PointerChecks, std::back_inserter(ChecksWithBounds),
[&](const RuntimePointerCheck &Check) {
PointerBounds First = expandBounds(Check.first, L, Loc, Exp),
Second = expandBounds(Check.second, L, Loc, Exp);
PointerBounds First = expandBounds(Check.first, L, Loc, Exp,
HoistRuntimeChecks),
Second = expandBounds(Check.second, L, Loc, Exp,
HoistRuntimeChecks);
return std::make_pair(First, Second);
});

@@ -1673,10 +1723,11 @@ expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
Value *llvm::addRuntimeChecks(
Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
SCEVExpander &Exp) {
SCEVExpander &Exp, bool HoistRuntimeChecks) {
// TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible.
// TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible
auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp);
auto ExpandedChecks =
expandBounds(PointerChecks, TheLoop, Loc, Exp, HoistRuntimeChecks);

LLVMContext &Ctx = Loc->getContext();
IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx,
@@ -1707,6 +1758,18 @@ Value *llvm::addRuntimeChecks(
Value *Cmp0 = ChkBuilder.CreateICmpULT(A.Start, B.End, "bound0");
Value *Cmp1 = ChkBuilder.CreateICmpULT(B.Start, A.End, "bound1");
Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
if (A.StrideToCheck) {
Value *IsNegativeStride = ChkBuilder.CreateICmpSLT(
A.StrideToCheck, ConstantInt::get(A.StrideToCheck->getType(), 0),
"stride.check");
IsConflict = ChkBuilder.CreateOr(IsConflict, IsNegativeStride);
}
if (B.StrideToCheck) {
Value *IsNegativeStride = ChkBuilder.CreateICmpSLT(
B.StrideToCheck, ConstantInt::get(B.StrideToCheck->getType(), 0),
"stride.check");
IsConflict = ChkBuilder.CreateOr(IsConflict, IsNegativeStride);
}
if (MemoryRuntimeCheck) {
IsConflict =
ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
6 changes: 3 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1974,9 +1974,9 @@ class GeneratedRTChecks {
},
IC);
} else {
MemRuntimeCheckCond =
addRuntimeChecks(MemCheckBlock->getTerminator(), L,
RtPtrChecking.getChecks(), MemCheckExp);
MemRuntimeCheckCond = addRuntimeChecks(
MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
MemCheckExp, VectorizerParams::HoistRuntimeChecks);
}
assert(MemRuntimeCheckCond &&
"no RT checks generated although RtPtrChecking "
