
[LoopFlatten] Use loop versioning when overflow can't be disproven #78576

Merged: 2 commits into llvm:main, Jan 25, 2024

Conversation

john-brawn-arm (Collaborator)

Implement the TODO in loop flattening to version the loop when we can't prove that the trip count calculation won't overflow.
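Conceptually, the versioned output looks like the following source-level sketch (my illustration, not code from the patch; uint32_t trip counts and the __builtin_mul_overflow builtin stand in for the IR-level llvm.umul.with.overflow check):

  #include <cstdint>

  // Hypothetical source-level view of the flattening-with-versioning result.
  void flatten_example(uint32_t *A, uint32_t N, uint32_t M) {
    uint32_t Flat;
    // Runtime check corresponding to llvm.umul.with.overflow.i32.
    if (__builtin_mul_overflow(N, M, &Flat)) {
      // Product overflows: fall back to the original loop nest.
      for (uint32_t I = 0; I < N; ++I)
        for (uint32_t J = 0; J < M; ++J)
          A[I * M + J] = 0;
    } else {
      // Product fits: run the flattened loop with trip count N * M.
      for (uint32_t K = 0; K < Flat; ++K)
        A[K] = 0;
    }
  }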

@llvmbot (Collaborator) commented on Jan 18, 2024

@llvm/pr-subscribers-llvm-transforms

Author: John Brawn (john-brawn-arm)

Changes

Implement the TODO in loop flattening to version the loop when we can't prove that the trip count calculation won't overflow.


Patch is 34.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78576.diff

4 Files Affected:

  • (modified) llvm/lib/Transforms/Scalar/LoopFlatten.cpp (+60-13)
  • (modified) llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll (-114)
  • (added) llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll (+418)
  • (modified) llvm/test/Transforms/LoopFlatten/widen-iv.ll (+2-2)
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 533cefaf106133..697304f513a085 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -70,6 +70,7 @@
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
 #include <optional>
@@ -97,6 +98,10 @@ static cl::opt<bool>
             cl::desc("Widen the loop induction variables, if possible, so "
                      "overflow checks won't reject flattening"));
 
+static cl::opt<bool>
+    VersionLoops("loop-flatten-version-loops", cl::Hidden, cl::init(true),
+                 cl::desc("Version loops if flattened loop could overflow"));
+
 namespace {
 // We require all uses of both induction variables to match this pattern:
 //
@@ -141,6 +146,8 @@ struct FlattenInfo {
                                               // has been applied. Used to skip
                                               // checks on phi nodes.
 
+  Value *NewTripCount = nullptr; // The tripcount of the flattened loop.
+
   FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL){};
 
   bool isNarrowInductionPhi(PHINode *Phi) {
@@ -752,11 +759,13 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
     ORE.emit(Remark);
   }
 
-  Value *NewTripCount = BinaryOperator::CreateMul(
-      FI.InnerTripCount, FI.OuterTripCount, "flatten.tripcount",
-      FI.OuterLoop->getLoopPreheader()->getTerminator());
-  LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
-             NewTripCount->dump());
+  if (!FI.NewTripCount) {
+    FI.NewTripCount = BinaryOperator::CreateMul(
+        FI.InnerTripCount, FI.OuterTripCount, "flatten.tripcount",
+        FI.OuterLoop->getLoopPreheader()->getTerminator());
+    LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
+               FI.NewTripCount->dump());
+  }
 
   // Fix up PHI nodes that take values from the inner loop back-edge, which
   // we are about to remove.
@@ -769,7 +778,7 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
 
   // Modify the trip count of the outer loop to be the product of the two
   // trip counts.
-  cast<User>(FI.OuterBranch->getCondition())->setOperand(1, NewTripCount);
+  cast<User>(FI.OuterBranch->getCondition())->setOperand(1, FI.NewTripCount);
 
   // Replace the inner loop backedge with an unconditional branch to the exit.
   BasicBlock *InnerExitBlock = FI.InnerLoop->getExitBlock();
@@ -891,7 +900,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
 static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
                             ScalarEvolution *SE, AssumptionCache *AC,
                             const TargetTransformInfo *TTI, LPMUpdater *U,
-                            MemorySSAUpdater *MSSAU) {
+                            MemorySSAUpdater *MSSAU,
+                            const LoopAccessInfo &LAI) {
   LLVM_DEBUG(
       dbgs() << "Loop flattening running on outer loop "
              << FI.OuterLoop->getHeader()->getName() << " and inner loop "
@@ -926,18 +936,53 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
   // variable might overflow. In this case, we need to version the loop, and
   // select the original version at runtime if the iteration space is too
   // large.
-  // TODO: We currently don't version the loop.
   OverflowResult OR = checkOverflow(FI, DT, AC);
   if (OR == OverflowResult::AlwaysOverflowsHigh ||
       OR == OverflowResult::AlwaysOverflowsLow) {
     LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
     return false;
   } else if (OR == OverflowResult::MayOverflow) {
-    LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
-    return false;
+    Module *M = FI.OuterLoop->getHeader()->getParent()->getParent();
+    const DataLayout &DL = M->getDataLayout();
+    if (!VersionLoops) {
+      LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
+      return false;
+    } else if (!DL.isLegalInteger(
+                   FI.OuterTripCount->getType()->getScalarSizeInBits())) {
+      // If the trip count type isn't legal then it won't be possible to check
+      // for overflow using only a single multiply instruction, so don't
+      // flatten.
+      LLVM_DEBUG(
+          dbgs() << "Can't check overflow efficiently, not flattening\n");
+      return false;
+    }
+    LLVM_DEBUG(dbgs() << "Multiply might overflow, versioning loop\n");
+
+    // Version the loop. The overflow check isn't a runtime pointer check, so we
+    // pass an empty list of runtime pointer checks and add our own check
+    // afterwards.
+    BasicBlock *CheckBlock = FI.OuterLoop->getLoopPreheader();
+    ArrayRef<RuntimePointerCheck> Checks(nullptr, nullptr);
+    LoopVersioning LVer(LAI, Checks, FI.OuterLoop, LI, DT, SE);
+    LVer.versionLoop();
+
+    // Check for overflow by calculating the new tripcount using
+    // umul_with_overflow and then checking if it overflowed.
+    BranchInst *Br = cast<BranchInst>(CheckBlock->getTerminator());
+    assert(Br->isConditional() &&
+           "Expected LoopVersioning to generate a conditional branch");
+    IRBuilder<> Builder(Br);
+    Function *F = Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow,
+                                            FI.OuterTripCount->getType());
+    Value *Call = Builder.CreateCall(F, {FI.OuterTripCount, FI.InnerTripCount},
+                                     "flatten.mul");
+    FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount");
+    Value *Overflow = Builder.CreateExtractValue(Call, 1, "flatten.overflow");
+    Br->setCondition(Overflow);
+  } else {
+    LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
   }
 
-  LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
   return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU);
 }
 
@@ -958,13 +1003,15 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
   // in simplified form, and also needs LCSSA. Running
   // this pass will simplify all loops that contain inner loops,
   // regardless of whether anything ends up being flattened.
+  LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, nullptr);
   for (Loop *InnerLoop : LN.getLoops()) {
     auto *OuterLoop = InnerLoop->getParentLoop();
     if (!OuterLoop)
       continue;
     FlattenInfo FI(OuterLoop, InnerLoop);
-    Changed |= FlattenLoopPair(FI, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U,
-                               MSSAU ? &*MSSAU : nullptr);
+    Changed |=
+        FlattenLoopPair(FI, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U,
+                        MSSAU ? &*MSSAU : nullptr, LAIM.getInfo(*OuterLoop));
   }
 
   if (!Changed)
diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
index 23ea09747cee74..479b5c3388f89c 100644
--- a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
+++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
@@ -568,72 +568,6 @@ for.cond.cleanup:
   ret void
 }
 
-; A 3d loop corresponding to:
-;
-;   for (int k = 0; k < N; ++k)
-;    for (int i = 0; i < N; ++i)
-;      for (int j = 0; j < M; ++j)
-;        f(&A[i*M+j]);
-;
-; This could be supported, but isn't at the moment.
-;
-define void @d3_2(i32* %A, i32 %N, i32 %M) {
-entry:
-  %cmp30 = icmp sgt i32 %N, 0
-  br i1 %cmp30, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
-
-for.cond1.preheader.lr.ph:
-  %cmp625 = icmp sgt i32 %M, 0
-  br label %for.cond1.preheader.us
-
-for.cond1.preheader.us:
-  %k.031.us = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ]
-  br i1 %cmp625, label %for.cond5.preheader.us.us.preheader, label %for.cond5.preheader.us43.preheader
-
-for.cond5.preheader.us43.preheader:
-  br label %for.cond1.for.cond.cleanup3_crit_edge.us.loopexit50
-
-for.cond5.preheader.us.us.preheader:
-  br label %for.cond5.preheader.us.us
-
-for.cond1.for.cond.cleanup3_crit_edge.us.loopexit:
-  br label %for.cond1.for.cond.cleanup3_crit_edge.us
-
-for.cond1.for.cond.cleanup3_crit_edge.us.loopexit50:
-  br label %for.cond1.for.cond.cleanup3_crit_edge.us
-
-for.cond1.for.cond.cleanup3_crit_edge.us:
-  %inc13.us = add nuw nsw i32 %k.031.us, 1
-  %exitcond52 = icmp ne i32 %inc13.us, %N
-  br i1 %exitcond52, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit
-
-for.cond5.preheader.us.us:
-  %i.028.us.us = phi i32 [ %inc10.us.us, %for.cond5.for.cond.cleanup7_crit_edge.us.us ], [ 0, %for.cond5.preheader.us.us.preheader ]
-  %mul.us.us = mul nsw i32 %i.028.us.us, %M
-  br label %for.body8.us.us
-
-for.cond5.for.cond.cleanup7_crit_edge.us.us:
-  %inc10.us.us = add nuw nsw i32 %i.028.us.us, 1
-  %exitcond51 = icmp ne i32 %inc10.us.us, %N
-  br i1 %exitcond51, label %for.cond5.preheader.us.us, label %for.cond1.for.cond.cleanup3_crit_edge.us.loopexit
-
-for.body8.us.us:
-  %j.026.us.us = phi i32 [ 0, %for.cond5.preheader.us.us ], [ %inc.us.us, %for.body8.us.us ]
-  %add.us.us = add nsw i32 %j.026.us.us, %mul.us.us
-  %idxprom.us.us = sext i32 %add.us.us to i64
-  %arrayidx.us.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us.us
-  tail call void @f(ptr %arrayidx.us.us) #2
-  %inc.us.us = add nuw nsw i32 %j.026.us.us, 1
-  %exitcond = icmp ne i32 %inc.us.us, %M
-  br i1 %exitcond, label %for.body8.us.us, label %for.cond5.for.cond.cleanup7_crit_edge.us.us
-
-for.cond.cleanup.loopexit:
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
 ; A 3d loop corresponding to:
 ;
 ;   for (int i = 0; i < N; ++i)
@@ -785,54 +719,6 @@ for.empty:
   ret void
 }
 
-; GEP doesn't dominate the loop latch so can't guarantee N*M won't overflow.
-@first = global i32 1, align 4
-@a = external global [0 x i8], align 1
-define void @overflow(i32 %lim, ptr %a) {
-entry:
-  %cmp17.not = icmp eq i32 %lim, 0
-  br i1 %cmp17.not, label %for.cond.cleanup, label %for.cond1.preheader.preheader
-
-for.cond1.preheader.preheader:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:
-  %i.018 = phi i32 [ %inc6, %for.cond.cleanup3 ], [ 0, %for.cond1.preheader.preheader ]
-  %mul = mul i32 %i.018, 100000
-  br label %for.body4
-
-for.cond.cleanup.loopexit:
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-
-for.cond.cleanup3:
-  %inc6 = add i32 %i.018, 1
-  %cmp = icmp ult i32 %inc6, %lim
-  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup.loopexit
-
-for.body4:
-  %j.016 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %if.end ]
-  %add = add i32 %j.016, %mul
-  %0 = load i32, ptr @first, align 4
-  %tobool.not = icmp eq i32 %0, 0
-  br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:
-  %arrayidx = getelementptr inbounds [0 x i8], ptr @a, i32 0, i32 %add
-  %1 = load i8, ptr %arrayidx, align 1
-  tail call void asm sideeffect "", "r"(i8 %1)
-  store i32 0, ptr @first, align 4
-  br label %if.end
-
-if.end:
-  tail call void asm sideeffect "", "r"(i32 %add)
-  %inc = add nuw nsw i32 %j.016, 1
-  %cmp2 = icmp ult i32 %j.016, 99999
-  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
-}
-
 declare void @objc_enumerationMutation(ptr)
 declare dso_local void @f(ptr)
 declare dso_local void @g(...)
diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll
new file mode 100644
index 00000000000000..dec323d135f35b
--- /dev/null
+++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll
@@ -0,0 +1,418 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt %s -S -passes='loop(loop-flatten),verify' -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; We need to version the loop as the GEPs are not inbounds
+define void @noinbounds_gep(i32 %N, ptr %A) {
+; CHECK-LABEL: define void @noinbounds_gep(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i32 0, [[N]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INNER_PREHEADER_LVER_CHECK:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.inner.preheader.lver.check:
+; CHECK-NEXT:    [[FLATTEN_MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[N]], i32 [[N]])
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 0
+; CHECK-NEXT:    [[FLATTEN_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 1
+; CHECK-NEXT:    br i1 [[FLATTEN_OVERFLOW]], label [[FOR_INNER_PREHEADER_PH_LVER_ORIG:%.*]], label [[FOR_INNER_PREHEADER_PH:%.*]]
+; CHECK:       for.inner.preheader.ph.lver.orig:
+; CHECK-NEXT:    br label [[FOR_INNER_PREHEADER_LVER_ORIG:%.*]]
+; CHECK:       for.inner.preheader.lver.orig:
+; CHECK-NEXT:    [[I_LVER_ORIG:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_PH_LVER_ORIG]] ], [ [[INC2_LVER_ORIG:%.*]], [[FOR_OUTER_LVER_ORIG:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_INNER_LVER_ORIG:%.*]]
+; CHECK:       for.inner.lver.orig:
+; CHECK-NEXT:    [[J_LVER_ORIG:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_LVER_ORIG]] ], [ [[INC1_LVER_ORIG:%.*]], [[FOR_INNER_LVER_ORIG]] ]
+; CHECK-NEXT:    [[MUL_LVER_ORIG:%.*]] = mul i32 [[I_LVER_ORIG]], [[N]]
+; CHECK-NEXT:    [[GEP_LVER_ORIG:%.*]] = getelementptr i32, ptr [[A]], i32 [[MUL_LVER_ORIG]]
+; CHECK-NEXT:    [[ARRAYIDX_LVER_ORIG:%.*]] = getelementptr i32, ptr [[GEP_LVER_ORIG]], i32 [[J_LVER_ORIG]]
+; CHECK-NEXT:    store i32 0, ptr [[ARRAYIDX_LVER_ORIG]], align 4
+; CHECK-NEXT:    [[INC1_LVER_ORIG]] = add nuw i32 [[J_LVER_ORIG]], 1
+; CHECK-NEXT:    [[CMP2_LVER_ORIG:%.*]] = icmp ult i32 [[INC1_LVER_ORIG]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP2_LVER_ORIG]], label [[FOR_INNER_LVER_ORIG]], label [[FOR_OUTER_LVER_ORIG]]
+; CHECK:       for.outer.lver.orig:
+; CHECK-NEXT:    [[INC2_LVER_ORIG]] = add i32 [[I_LVER_ORIG]], 1
+; CHECK-NEXT:    [[CMP1_LVER_ORIG:%.*]] = icmp ult i32 [[INC2_LVER_ORIG]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP1_LVER_ORIG]], label [[FOR_INNER_PREHEADER_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]]
+; CHECK:       for.inner.preheader.ph:
+; CHECK-NEXT:    br label [[FOR_INNER_PREHEADER:%.*]]
+; CHECK:       for.inner.preheader:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_PH]] ], [ [[INC2:%.*]], [[FOR_OUTER:%.*]] ]
+; CHECK-NEXT:    [[FLATTEN_ARRAYIDX:%.*]] = getelementptr i32, ptr [[A]], i32 [[I]]
+; CHECK-NEXT:    br label [[FOR_INNER:%.*]]
+; CHECK:       for.inner:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[N]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[A]], i32 [[MUL]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[GEP]], i32 [[J]]
+; CHECK-NEXT:    store i32 0, ptr [[FLATTEN_ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC1:%.*]] = add nuw i32 [[J]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[INC1]], [[N]]
+; CHECK-NEXT:    br label [[FOR_OUTER]]
+; CHECK:       for.outer:
+; CHECK-NEXT:    [[INC2]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[INC2]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_INNER_PREHEADER]], label [[FOR_END_LOOPEXIT_LOOPEXIT1:%.*]]
+; CHECK:       for.end.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit.loopexit1:
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp3 = icmp ult i32 0, %N
+  br i1 %cmp3, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.inner.preheader
+
+for.inner.preheader:
+  %i = phi i32 [ 0, %for.outer.preheader ], [ %inc2, %for.outer ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.inner.preheader ], [ %inc1, %for.inner ]
+  %mul = mul i32 %i, %N
+  %gep = getelementptr i32, ptr %A, i32 %mul
+  %arrayidx = getelementptr i32, ptr %gep, i32 %j
+  store i32 0, ptr %arrayidx, align 4
+  %inc1 = add nuw i32 %j, 1
+  %cmp2 = icmp ult i32 %inc1, %N
+  br i1 %cmp2, label %for.inner, label %for.outer
+
+for.outer:
+  %inc2 = add i32 %i, 1
+  %cmp1 = icmp ult i32 %inc2, %N
+  br i1 %cmp1, label %for.inner.preheader, label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; We shouldn't version the loop here as the multiply would use an illegal type.
+define void @noinbounds_gep_too_large_mul(i64 %N, ptr %A) {
+; CHECK-LABEL: define void @noinbounds_gep_too_large_mul(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i64 0, [[N]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.outer.preheader:
+; CHECK-NEXT:    br label [[FOR_INNER_PREHEADER:%.*]]
+; CHECK:       for.inner.preheader:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[INC2:%.*]], [[FOR_OUTER:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_INNER:%.*]]
+; CHECK:       for.inner:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER]] ], [ [[INC1:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[I]], [[N]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[A]], i64 [[MUL]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[GEP]], i64 [[J]]
+; CHECK-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC1]] = add nuw i64 [[J]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INC1]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_INNER]], label [[FOR_OUTER]]
+; CHECK:       for.outer:
+; CHECK-NEXT:    [[INC2]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[INC2]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_INNER_PREHEADER]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp3 = icmp ult i64 0, %N
+  br i1 %cmp3, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.inner.preheader
+
+for.inner.preheader:
+  %i = phi i64 [ 0, %for.outer.preheader ], [ %inc2, %for.outer ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i64 [ 0, %for.inner.preheader ], [ %inc1, %for.inner ]
+  %mul = mul i64 %i, %N
+  %gep = getelementptr i32, ptr %A, i64 %mul
+  %arrayidx = getelementptr i32, ptr %gep, i64 %j
+  store i32 0, ptr %arrayidx, align 4
+  %inc1 = add nuw i64 %j, 1
+  %cmp2 = icmp ult i64 %inc1, %N
+  br i1 %cmp2, label %for.inner, label %for.outer
+
+for.outer:
+  %inc2 = add i64 %i, 1
+  %cmp1 = icmp ult i64 %inc2, %N
+  br i1 %cmp1, label %for.inner.preheader, label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; A 3d loop corresponding to:
+;
+;   for (int k = 0; k < N; ++k)
+;    for (int i = 0; i < N; ++i)
+;      for (int j = 0; j < M; ++j)
+;        f(&A[i*M+j]);
+;
+define void @d3_2(i32* %A, i32 %N, i32 %M) {
+; CHECK-LABEL: define void @d3_2(
+; CHECK-SAME: ptr [[A:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP30]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.lr.ph:
+; CHECK-NEXT:    [[CMP625:%.*]] = icmp sgt i32 [[M]], 0
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK:       for.cond1.preheader.us:
+; CHECK-NEXT:    [[K_031_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INC13_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ]
+; CHECK-NEXT:    br i1 [[CMP625]], label [[FOR_COND5_PREHEADER_US_US_LVER_CHECK:%.*]], label [[FOR_COND5_PREHEADER_US43_PREHEADER:%.*]]
+; CHECK:       for.cond5.preheader.us43.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LOOPEXIT50:%.*]]
+; CHECK:       for.cond5.prehead...
[truncated]

@sjoerdmeijer (Collaborator)

Looks like a very good addition.
Just a quick question: the pass is off by default, but have you benchmarked this? I just wanted to check that there are no unexpected results (e.g. the extra runtime checks regressing things).

@john-brawn-arm (Collaborator, Author)

Looks like a very good addition. Just a quick question: the pass is off by default, but have you benchmarked this? I just wanted to check that there are no unexpected results (e.g. the extra runtime checks regressing things).

This improves several EEMBC benchmarks on Cortex-M processors (or rather, it undoes the regression caused by e13bed4 canonicalizing in a way that removes the inbounds qualifier from GEP instructions), and doesn't cause any noticeable regressions in those benchmarks.

Running the llvm-test-suite benchmarks on an Intel desktop, this patch causes no significant changes to the results. In fact it has no effect at all when compiling for a 64-bit target, as there overflow is handled by widening the induction variable from 32 to 64 bits. Compiling with -m32 it does cause changes to benchmarks, but they're all in initialization code that contributes a negligible amount to the runtime.
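For intuition on why widening sidesteps the overflow problem (a sketch of my own, not code from the patch): the product of two 32-bit trip counts always fits in 64 bits.

  #include <cstdint>

  // (2^32 - 1) * (2^32 - 1) < 2^64, so once the induction variable is
  // widened to 64 bits the flattened trip count can never wrap.
  uint64_t flatTripCount(uint32_t InnerTC, uint32_t OuterTC) {
    return static_cast<uint64_t>(InnerTC) * static_cast<uint64_t>(OuterTC);
  }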

@sjoerdmeijer (Collaborator) left a comment:

LGTM

On these lines from llvm/lib/Transforms/Scalar/LoopFlatten.cpp:

}
LLVM_DEBUG(dbgs() << "Multiply might overflow, versioning loop\n");

// Version the loop. The overflow check isn't a runtime pointer check, so we

A collaborator commented:

There is an assertion in LoopVersioning::versionLoop to make sure that some runtime checks are emitted. There is also some code in there which generates checks using SCEV; I'm guessing that's what allows this to work? If so, could you expand this comment to explain why that is guaranteed?

@john-brawn-arm (Collaborator, Author) replied:
I've expanded on the comment and also added an assert checking that the branch condition is false as expected.
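A minimal sketch of what such an assert could look like (hypothetical, using LLVM's PatternMatch helpers; the committed code may differ):

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm::PatternMatch;

  // 'Br' and 'Overflow' are the values from the patch's versioning code.
  // With no runtime pointer checks, the branch condition LoopVersioning
  // generates folds to 'false' (always take the versioned loop), so assert
  // that before overwriting it with the overflow flag.
  assert(match(Br->getCondition(), m_Zero()) &&
         "Expected branch condition to be false");
  Br->setCondition(Overflow);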

@ostannard (Collaborator) left a comment:

LGTM

@john-brawn-arm merged commit a04d4a0 into llvm:main on Jan 25, 2024 (3 of 4 checks passed).