diff --git a/llvm/include/llvm/Support/BranchProbability.h b/llvm/include/llvm/Support/BranchProbability.h index 0b0d4343a9fcb..2f2224891ab30 100644 --- a/llvm/include/llvm/Support/BranchProbability.h +++ b/llvm/include/llvm/Support/BranchProbability.h @@ -46,6 +46,7 @@ class BranchProbability { LLVM_ABI BranchProbability(uint32_t Numerator, uint32_t Denominator); bool isZero() const { return N == 0; } + bool isOne() const { return N == D; } bool isUnknown() const { return N == UnknownN; } static BranchProbability getZero() { return BranchProbability(0); } diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index ab35d217f0d93..2dbb92d3877e0 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -1004,7 +1004,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, }; // Fold branches for iterations where we know that they will exit or not - // exit. + // exit. In the case of an iteration's latch, if we thus find + // *OriginalLoopProb is incorrect, set ProbUpdateRequired to true. + bool ProbUpdateRequired = false; for (auto &Pair : ExitInfos) { ExitInfo &Info = Pair.second; for (unsigned i = 0, e = Info.ExitingBlocks.size(); i != e; ++i) { @@ -1029,6 +1031,14 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, continue; } + // For a latch, record any OriginalLoopProb contradiction. + if (!OriginalLoopProb.isUnknown() && IsLatch) { + BranchProbability ActualProb = *KnownWillExit + ? 
BranchProbability::getZero() + : BranchProbability::getOne(); + ProbUpdateRequired |= OriginalLoopProb != ActualProb; + } + SetDest(Info.ExitingBlocks[i], *KnownWillExit, Info.ExitOnTrue); } } @@ -1064,8 +1074,30 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, changeToUnreachable(Latches.back()->getTerminator(), PreserveLCSSA); } + // After merging adjacent blocks in Latches below: + // - CondLatches will list the blocks from Latches that are still terminated + // with conditional branches. + // - For 1 <= I < CondLatches.size(), IterCounts[I] will store the number of + // the original loop iterations through which control flows from + // CondLatches[I-1] to CondLatches[I]. + // - For I == 0 or I == CondLatches.size(), IterCounts[I] will store the + // number of the original loop iterations through which control can flow + // before CondLatches.front() or after CondLatches.back(), respectively, + // without taking the unrolled loop's backedge, if any. + // - CondLatchNexts[I] will store the CondLatches[I] branch target for the + // next of the original loop's iterations (as opposed to the exit target). + assert(ULO.Count == Latches.size() && + "Expected one latch block per unrolled iteration"); + std::vector IterCounts(1, 0); + std::vector CondLatches; + std::vector CondLatchNexts; + IterCounts.reserve(Latches.size() + 1); + CondLatches.reserve(Latches.size()); + CondLatchNexts.reserve(Latches.size()); + // Merge adjacent basic blocks, if possible. 
- for (BasicBlock *Latch : Latches) { + for (auto [I, Latch] : enumerate(Latches)) { + ++IterCounts.back(); assert((isa(Latch->getTerminator()) || (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) && "Need a branch as terminator, except when fully unrolling with " @@ -1081,6 +1113,10 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, llvm::replace(Latches, Dest, Fold); llvm::erase(UnrolledLoopBlocks, Dest); } + } else if (isa(Latch->getTerminator())) { + IterCounts.push_back(0); + CondLatches.push_back(Latch); + CondLatchNexts.push_back(Headers[(I + 1) % Latches.size()]); } } @@ -1167,6 +1203,10 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // each unrolled iteration's latch within it, we store the new trip count as // separate metadata. if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) { + assert((CondLatches.size() == 1 && + (ProbUpdateRequired || OriginalLoopProb.isOne())) && + "Expected ULO.Runtime to give unrolled loop 1 conditional latch, " + "the backedge, requiring a probability update unless infinite"); // Where p is always the probability of executing at least 1 more // iteration, the probability for at least n more iterations is p^n. setLoopProbability(L, OriginalLoopProb.pow(ULO.Count)); diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 68751cf95bc7a..4d7dee168f431 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -220,8 +220,8 @@ probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { // loop body might have unique blocks that execute a finite number of times // if, for example, the original loop body contains conditionals like i < // UnrollCount. 
- if (OriginalLoopProb == BranchProbability::getOne()) - return BranchProbability::getOne(); + if (OriginalLoopProb.isOne()) + return OriginalLoopProb; // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll new file mode 100644 index 0000000000000..fd7df00515e25 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll @@ -0,0 +1,530 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after complete loop unrolling. The final unrolled iteration +; unconditionally exits (backedge removed), and other unrolled iterations' +; latches might unconditionally continue. Either contradicts the original +; branch weights. +; +; (unroll-partial-unconditional-latch.ll tests partial unrolling cases, +; including cases where the latch of any iteration, including the final, might +; unconditionally continue.) +; +; For each case, we check: +; - Iteration frequencies +; - When each is multiplied by the number of original loop bodies that execute +; within it, they should sum to almost exactly the original loop body +; frequency. +; - The only exception is an impossibly high or low original frequency (e.g., +; due to bad profile data), for which there exist no new branch weights that +; can yield that frequency sum. In those cases, we expect the maximum or +; minimum possible frequency. +; - CFGs +; - We verify which branch weights go with which branches and that we did not +; overlook any other branch weights (no extra !prof or branch_weights). +; - We also check the number of original loop bodies (represented by a call to +; @f) that appear within each unrolled iteration. 
+; - Branch weight metadata +; - Checking frequencies already checks whether the branch weights have the +; expected effect, but we also want to check that we get uniform +; probabilities/weights (same !prof) across the unrolled iteration latches +; when expected. +; - llvm.loop.estimated_trip_count: +; - There should be none because loops are completely unrolled. + +; ------------------------------------------------------------------------------ +; Define LIT substitutions. +; +; Before using the following lit substitutions, sed should be called to replace +; these parameters in %s to produce %t.ll: +; - @I_0@ is the starting value for the original loop's induction variable. +; - @MIN@ and @MAX@ are the compile-time known minimum and maximum for the +; number of original loop iterations, regardless of @I_0@. +; - @W@ is the branch weight for the original loop's backedge. That value plus +; 1 is the original loop body frequency because the exit branch weight is 1. +; +; For verifying that the test code produces the original loop body frequency we +; expect. +; DEFINE: %{bf-fc} = opt %t.ll -S -passes='print<block-freq>' 2>&1 | \ +; DEFINE: FileCheck %s -check-prefixes +; +; For checking the unrolled loop. +; DEFINE: %{ur-bf} = opt %t.ll -S -passes='loop-unroll,print<block-freq>' 2>&1 +; DEFINE: %{fc} = FileCheck %s \ +; DEFINE: -implicit-check-not='llvm.loop.estimated_trip_count' \ +; DEFINE: -implicit-check-not='!prof' \ +; DEFINE: -implicit-check-not='branch_weights' \ +; DEFINE: -implicit-check-not='call void @f' -check-prefixes + +; ------------------------------------------------------------------------------ +; Check 1 max iteration: +; - Unroll count of >=1 should always produce complete unrolling. +; - That produces 0 unrolled iteration latches, so there are no branch weights +; to compute. +; +; Original loop body frequency is 2 (loop weight 1), which is impossibly high.
+; +; RUN: sed -e s/@MAX@/1/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG1210 +; RUN: %{ur-bf} -unroll-count=1 | %{fc} UR1210 +; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR1210 +; +; The new do.body is less than the old do.body, which is impossibly high. +; ORIG1210: - do.body: float = 2.0, +; UR1210: - do.body: float = 1.0, +; +; UR1210: call void @f +; +; Original loop body frequency is 1 (loop weight 0). +; +; RUN: sed -e s/@MAX@/1/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG1110 +; RUN: %{ur-bf} -unroll-count=1 | %{fc} UR1110 +; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR1110 +; +; The new do.body equals the old do.body. +; ORIG1110: - do.body: float = 1.0, +; UR1110: - do.body: float = 1.0, +; +; UR1110: call void @f + +; ------------------------------------------------------------------------------ +; Check 2 max iterations: +; - Unroll count of >=2 should always produce complete unrolling. +; - That produces <=1 unrolled iteration latch, so the implementation can +; compute uniform weights by solving, at worst, a linear equation. +; +; Original loop body frequency is 3 (loop weight 2), which is impossibly high. +; +; First use a variable iteration count so that the sole non-final unrolled +; iteration's latch remains conditional. +; +; RUN: sed -e s/@MAX@/2/ -e s/@W@/2/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG2310 +; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2310 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2310 +; +; The sum of the new do.body* cannot reach the old do.body, which is +; impossibly high. +; ORIG2310: - do.body: float = 3.0, +; UR2310: - do.body: float = 1.0, +; FIXME: Should be 1.0: +; UR2310: - do.body.1: float = 0.66667 +; +; The sole probability is maximized to try to reach the original frequency.
+; UR2310: call void @f +; UR2310: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR2310: call void @f +; UR2310: br label %do.end +; FIXME: Should be (0, non-zero): +; UR2310: !0 = !{!"branch_weights", i32 1, i32 2} +; +; Now use a constant iteration count so that the sole non-final unrolled +; iteration's latch unconditionally continues. +; +; RUN: sed -e s/@MAX@/2/ -e s/@W@/2/ -e s/@MIN@/2/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG2320 +; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2320 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2320 +; +; The new do.body contains 2 of the original loop's iterations, so multiply +; it by 2, which is less than the old do.body, which is impossibly high. +; ORIG2320: - do.body: float = 3.0, +; UR2320: - do.body: float = 1.0, +; +; UR2320: call void @f +; UR2320-NOT: br +; UR2320: call void @f +; UR2320: ret void +; +; Original loop body frequency is 2 (loop weight 1). +; +; First use a variable iteration count so that the sole non-final unrolled +; iteration's latch remains conditional. +; +; RUN: sed -e s/@MAX@/2/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG2210 +; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2210 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2210 +; +; The sum of the new do.body* is the old do.body. +; ORIG2210: - do.body: float = 2.0, +; UR2210: - do.body: float = 1.0, +; FIXME: Should be 1.0: +; UR2210: - do.body.1: float = 0.5, +; +; UR2210: call void @f +; UR2210: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR2210: call void @f +; UR2210: br label %do.end +; FIXME: Should be (0, non-zero): +; UR2210: !0 = !{!"branch_weights", i32 1, i32 1} +; +; Now use a constant iteration count so that the sole non-final unrolled +; iteration's latch unconditionally continues. 
+; +; RUN: sed -e s/@MAX@/2/ -e s/@W@/1/ -e s/@MIN@/2/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG2220 +; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2220 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2220 +; +; The new do.body contains 2 of the original loop's iterations, so multiply +; it by 2 to get the old do.body. +; ORIG2220: - do.body: float = 2.0, +; UR2220: - do.body: float = 1.0, +; +; UR2220: call void @f +; UR2220-NOT: br +; UR2220: call void @f +; UR2220: ret void +; +; Original loop body frequency is 1 (loop weight 0). +; +; First use a variable iteration count so that the sole non-final unrolled +; iteration's latch remains conditional. +; +; RUN: sed -e s/@MAX@/2/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG2110 +; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2110 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2110 +; +; The sum of the new do.body* is approximately the old do.body. +; ORIG2110: - do.body: float = 1.0, +; UR2110: - do.body: float = 1.0, +; UR2110: - do.body.1: float = 0.0{{(0000[0-9]*)?}}, +; +; UR2110: call void @f +; UR2110: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR2110: call void @f +; UR2110: br label %do.end +; UR2110: !0 = !{!"branch_weights", i32 1, i32 0} +; +; Now use a constant iteration count so that the sole non-final unrolled +; iteration's latch unconditionally continues. +; +; RUN: sed -e s/@MAX@/2/ -e s/@W@/0/ -e s/@MIN@/2/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG2120 +; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2120 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2120 +; +; The new do.body contains 2 of the original loop's iterations, so multiply +; it by 2, which is greater than the old do.body, which is impossibly low. 
+; ORIG2120: - do.body: float = 1.0, +; UR2120: - do.body: float = 1.0, +; +; UR2120: call void @f +; UR2120-NOT: br +; UR2120: call void @f +; UR2120: ret void + +; ------------------------------------------------------------------------------ +; Check 3 max iterations: +; - Unroll count of >=3 should always produce complete unrolling. +; - That produces <=2 unrolled iteration latches, so the implementation can +; compute uniform weights by solving, at worst, a quadratic equation. +; +; Original loop body frequency is 4 (loop weight 3), which is impossibly high. +; +; First use a variable iteration count so that all non-final unrolled +; iterations' latches remain conditional. +; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/3/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG3410 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3410 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3410 +; +; The sum of the new do.body* cannot reach the old do.body, which is +; impossibly high. +; ORIG3410: - do.body: float = 4.0, +; UR3410: - do.body: float = 1.0, +; FIXME: Should be 1.0: +; UR3410: - do.body.1: float = 0.75, +; FIXME: Should be 1.0: +; UR3410: - do.body.2: float = 0.5625, +; +; The probabilities are maximized to try to reach the original frequency. +; UR3410: call void @f +; UR3410: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR3410: call void @f +; UR3410: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0 +; UR3410: call void @f +; UR3410: br label %do.end +; FIXME: Should be (0, non-zero): +; UR3410: !0 = !{!"branch_weights", i32 1, i32 3} +; +; Now use a constant iteration count so that all non-final unrolled +; iterations' latches unconditionally continue.
+; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/3/ -e s/@MIN@/3/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG3430 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3430 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3430 +; +; The new do.body contains 3 of the original loop's iterations, so multiply +; it by 3, which is less than the old do.body, which is impossibly high. +; ORIG3430: - do.body: float = 4.0, +; UR3430: - do.body: float = 1.0, +; +; UR3430: call void @f +; UR3430-NOT: br +; UR3430: call void @f +; UR3430-NOT: br +; UR3430: call void @f +; UR3430: ret void +; +; Use a constant iteration count but now the loop upper bound computation can +; overflow. When it does, the loop induction variable is greater than it +; immediately, so the initial unrolled iteration's latch remains conditional. +; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/3/ -e s/@MIN@/3/ -e s/@I_0@/%x/ %s > %t.ll +; RUN: %{bf-fc} ORIG343x +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR343x +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR343x +; +; The new do.body.1 contains 2 of the original loop's iterations, so +; multiply it by 2, and add the new do.body, but that sum is less than the +; old do.body, which is impossibly high. +; ORIG343x: - do.body: float = 4.0, +; UR343x: - do.body: float = 1.0, +; FIXME: Should be 1.0: +; UR343x: - do.body.1: float = 0.75, +; +; The sole probability is maximized to try to reach the original frequency. +; UR343x: call void @f +; UR343x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR343x: call void @f +; UR343x-NOT: br +; UR343x: call void @f +; UR343x: ret void +; FIXME: Should be (0, non-zero): +; UR343x: !0 = !{!"branch_weights", i32 1, i32 3} +; +; Original loop body frequency is 3 (loop weight 2). +; +; First use a variable iteration count so that all non-final unrolled +; iterations' latches remain conditional. 
+; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/2/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG3310 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3310 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3310 +; +; The sum of the new do.body* is the old do.body. +; ORIG3310: - do.body: float = 3.0, +; UR3310: - do.body: float = 1.0, +; FIXME: Should be 1.0: +; UR3310: - do.body.1: float = 0.66667, +; FIXME: Should be 1.0: +; UR3310: - do.body.2: float = 0.44444, +; +; UR3310: call void @f +; UR3310: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR3310: call void @f +; UR3310: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0 +; UR3310: call void @f +; UR3310: br label %do.end +; FIXME: Should be (very small, very large): +; UR3310: !0 = !{!"branch_weights", i32 1, i32 2} +; +; Now use a constant iteration count so that all non-final unrolled +; iterations' latches unconditionally continue. +; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/2/ -e s/@MIN@/3/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG3330 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3330 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3330 +; +; The new do.body contains 3 of the original loop's iterations, so multiply +; it by 3 to get the old do.body. +; ORIG3330: - do.body: float = 3.0, +; UR3330: - do.body: float = 1.0, +; +; UR3330: call void @f +; UR3330-NOT: br +; UR3330: call void @f +; UR3330-NOT: br +; UR3330: call void @f +; UR3330: ret void +; +; Use a constant iteration count but now the loop upper bound computation can +; overflow. When it does, the loop induction variable is greater than it +; immediately, so the initial unrolled iteration's latch remains conditional. 
+; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/2/ -e s/@MIN@/3/ -e s/@I_0@/%x/ %s > %t.ll +; RUN: %{bf-fc} ORIG333x +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR333x +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR333x +; +; The new do.body.1 contains 2 of the original loop's iterations, so +; multiply it by 2, and add the new do.body to get the old do.body. +; ORIG333x: - do.body: float = 3.0, +; UR333x: - do.body: float = 1.0, +; FIXME: Should be 1.0: +; UR333x: - do.body.1: float = 0.66667, +; +; UR333x: call void @f +; UR333x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR333x: call void @f +; UR333x-NOT: br +; UR333x: call void @f +; UR333x: br label %do.end +; FIXME: Should be (very small, very large): +; UR333x: !0 = !{!"branch_weights", i32 1, i32 2} +; +; Original loop body frequency is 2 (loop weight 1). This is our first case +; where new frequencies and probabilities are not all approximately 1 or 0. +; +; First use a variable iteration count so that all non-final unrolled +; iterations' latches remain conditional. +; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG3210 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3210 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3210 +; +; The sum of the new do.body* is the old do.body. +; ORIG3210: - do.body: float = 2.0, +; UR3210: - do.body: float = 1.0, +; FIXME: Should sum to 1.0: +; UR3210: - do.body.1: float = 0.5, +; UR3210: - do.body.2: float = 0.25, +; +; UR3210: call void @f +; UR3210: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR3210: call void @f +; UR3210: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0 +; UR3210: call void @f +; UR3210: br label %do.end +; UR3210: !0 = !{!"branch_weights", i32 1, i32 1} +; +; Now use a constant iteration count so that all non-final unrolled +; iterations' latches unconditionally continue. 
+; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/1/ -e s/@MIN@/3/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG3230 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3230 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3230 +; +; The new do.body contains 3 of the original loop's iterations, so multiply +; it by 3, which is greater than the old do.body, which is impossibly low. +; ORIG3230: - do.body: float = 2.0, +; UR3230: - do.body: float = 1.0, +; +; UR3230: call void @f +; UR3230-NOT: br +; UR3230: call void @f +; UR3230-NOT: br +; UR3230: call void @f +; UR3230: ret void +; +; Use a constant iteration count but now the loop upper bound computation can +; overflow. When it does, the loop induction variable is greater than it +; immediately, so the initial unrolled iteration's latch remains conditional. +; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/1/ -e s/@MIN@/3/ -e s/@I_0@/%x/ %s > %t.ll +; RUN: %{bf-fc} ORIG323x +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR323x +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR323x +; +; The new do.body.1 contains 2 of the original loop's iterations, so +; multiply it by 2, and add the new do.body to get the old do.body. +; ORIG323x: - do.body: float = 2.0, +; UR323x: - do.body: float = 1.0, +; UR323x: - do.body.1: float = 0.5, +; +; UR323x: call void @f +; UR323x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR323x: call void @f +; UR323x-NOT: br +; UR323x: call void @f +; UR323x: br label %do.end +; UR323x: !0 = !{!"branch_weights", i32 1, i32 1} +; +; Original loop body frequency is 1 (loop weight 0). +; +; First use a variable iteration count so that all non-final unrolled +; iterations' latches remain conditional. +; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG3110 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3110 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3110 +; +; The sum of the new do.body* is approximately the old do.body. 
+; ORIG3110: - do.body: float = 1.0, +; UR3110: - do.body: float = 1.0, +; UR3110: - do.body.1: float = 0.0{{(0000[0-9]*)?}}, +; UR3110: - do.body.2: float = 0.0{{(0000[0-9]*)?}}, +; +; UR3110: call void @f +; UR3110: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR3110: call void @f +; UR3110: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0 +; UR3110: call void @f +; UR3110: br label %do.end +; UR3110: !0 = !{!"branch_weights", i32 1, i32 0} +; +; Now use a constant iteration count so that all non-final unrolled +; iterations' latches unconditionally continue. +; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/0/ -e s/@MIN@/3/ -e s/@I_0@/0/ %s > %t.ll +; RUN: %{bf-fc} ORIG3130 +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3130 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3130 +; +; The new do.body contains 3 of the original loop's iterations, so multiply +; it by 3, which is greater than the old do.body, which is impossibly low. +; ORIG3130: - do.body: float = 1.0, +; UR3130: - do.body: float = 1.0, +; +; UR3130: call void @f +; UR3130-NOT: br +; UR3130: call void @f +; UR3130-NOT: br +; UR3130: call void @f +; UR3130: ret void +; +; Use a constant iteration count but now the loop upper bound computation can +; overflow. When it does, the loop induction variable is greater than it +; immediately, so the initial unrolled iteration's latch remains conditional. +; +; RUN: sed -e s/@MAX@/3/ -e s/@W@/0/ -e s/@MIN@/3/ -e s/@I_0@/%x/ %s > %t.ll +; RUN: %{bf-fc} ORIG313x +; RUN: %{ur-bf} -unroll-count=3 | %{fc} UR313x +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR313x +; +; The new do.body.1 contains 2 of the original loop's iterations, so +; multiply it by 2, and add the new do.body to get approximately the old +; do.body. 
+; ORIG313x: - do.body: float = 1.0, +; UR313x: - do.body: float = 1.0, +; UR313x: - do.body.1: float = 0.0{{(0000[0-9]*)?}}, +; +; UR313x: call void @f +; UR313x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0 +; UR313x: call void @f +; UR313x-NOT: br +; UR313x: call void @f +; UR313x: br label %do.end +; UR313x: !0 = !{!"branch_weights", i32 1, i32 0} + +declare void @f(i32) + +define void @test(i32 %x, i32 %n) { +entry: + %n.min = call i32 @llvm.umax.i32(i32 %n, i32 @MIN@) + %n.minmax = call i32 @llvm.umin.i32(i32 %n.min, i32 @MAX@) + %i_n = add i32 @I_0@, %n.minmax + br label %do.body + +do.body: + %i = phi i32 [ @I_0@, %entry ], [ %inc, %do.body ] + %inc = add i32 %i, 1 + call void @f(i32 %i) + %c = icmp uge i32 %inc, %i_n + br i1 %c, label %do.end, label %do.body, !prof !0 + +do.end: + ret void +} + +; Loop body frequency is @W@ + 1. +!0 = !{!"branch_weights", i32 1, i32 @W@} diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll index f5d05e666cabb..8b5a88bd6e8cd 100644 --- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll @@ -2,7 +2,14 @@ ; frequencies after loop unrolling with an epilogue. ; ; We check various interesting unroll count values relative to the original -; loop's body frequency of 11 (e.g., minimum and boundary values). +; loop's body frequency of 11, and we check when the epilogue loop itself is and +; is not unrolled. +; +; Without -unroll-remainder, the epilogue is unrolled only at -unroll-count=2 +; because there it has only 1 iteration and so is always completely unrolled. +; With -unroll-remainder, for some reason related to computing the remainder in +; two's complement, the epilogue is completely unrolled only when -unroll-count +; is a power of 2. 
; ; For each case, we check: ; - Iteration frequencies @@ -14,13 +21,21 @@ ; overlook any other branch weights (no extra !prof or branch_weights). ; - We also check the number of original loop bodies (represented by a call to ; @f) that appear within each unrolled iteration. +; - Branch weight metadata +; - Checking frequencies already checks whether the branch weights have the +; expected effect, but we also want to check the following. +; - Whether the epilogue loop is unrolled should not affect the unrolled +; loop's estimated trip count or the branch weights on the unrolled loop +; guard, unrolled loop latch, or epilogue loop guard. +; - We get uniform probabilities/weights (same !prof) across the epilogue +; iteration latches when expected. ; - llvm.loop.estimated_trip_count -; - For the unrolled and epilogue loops, must be the number of iterations +; - For the unrolled and epilogue loops, it must be the number of iterations ; required for the original loop body to reach its original estimated trip ; count, which is its original frequency, 11, because there is no prior ; llvm.loop.estimated_trip_count. -; - Must not be blindly duplicated between the unrolled and epilogue loops. -; - Must not be blindly computed from any new latch branch weights. +; - It must not be blindly duplicated between the unrolled and epilogue loops. +; - It must not be blindly computed from any new latch branch weights. ; ------------------------------------------------------------------------------ ; Verify that the test code produces the original loop body frequency we expect. @@ -45,6 +60,7 @@ ; Check -unroll-count=2. ; ; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2 +; RUN: %{ur-bf} -unroll-count=2 -unroll-remainder | %{fc} UR2 ; ; Multiply do.body by 2 and add do.body.epil to get the original loop body ; frequency, 11. @@ -72,22 +88,36 @@ ; ------------------------------------------------------------------------------ ; Check -unroll-count=4. 
; -; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4 +; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4,UR4-ELP +; RUN: %{ur-bf} -unroll-count=4 -unroll-remainder | %{fc} UR4,UR4-EUR ; -; Multiply do.body by 4 and add do.body.epil* to get the original loop body -; frequency, 11. -; UR4: - do.body: float = 2.3702, -; UR4: - do.body.epil: float = 1.5193, +; Multiply do.body by 4 and add do.body.epil* for either ELP or EUR to get the +; original loop body frequency, 11. +; UR4: - do.body: float = 2.3702, +; UR4-ELP: - do.body.epil: float = 1.5193, +; FIXME: Should sum to 1.5193: +; UR4-EUR: - do.body.epil: float = 0.78453, +; UR4-EUR: - do.body.epil.1: float = 0.37941, +; UR4-EUR: - do.body.epil.2: float = 0.18349, ; ; Unrolled loop guard, body, and latch. ; UR4: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0 ; UR4-COUNT-4: call void @f ; UR4: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2 ; -; Epilogue guard and loop. +; Epilogue guard. ; UR4: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5 -; UR4: call void @f -; UR4: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 +; +; Non-unrolled epilogue loop. +; UR4-ELP: call void @f +; UR4-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 +; +; Completely unrolled epilogue loop. +; UR4-EUR: call void @f +; UR4-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6 +; UR4-EUR: call void @f +; UR4-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !6 +; UR4-EUR: call void @f ; ; Unrolled loop metadata. ; UR4: !0 = !{!"branch_weights", i32 534047398, i32 1613436250} @@ -97,30 +127,69 @@ ; UR4: !4 = !{!"llvm.loop.unroll.disable"} ; UR4: !5 = !{!"branch_weights", i32 1531603292, i32 615880356} ; -; Epilogue loop metadata. 
-; UR4: !6 = !{!"branch_weights", i32 1038564635, i32 1108919013} -; UR4: !7 = distinct !{!7, !8, !4} -; UR4: !8 = !{!"llvm.loop.estimated_trip_count", i32 3} +; Non-unrolled epilogue loop metadata. +; UR4-ELP: !6 = !{!"branch_weights", i32 1038564635, i32 1108919013} +; UR4-ELP: !7 = distinct !{!7, !8, !4} +; UR4-ELP: !8 = !{!"llvm.loop.estimated_trip_count", i32 3} +; +; Completely unrolled epilogue loop metadata. Because it loses its backedge: +; - The remaining conditional latches' branch weights must be adjusted relative +; to the non-unrolled case. There are only two, so the implementation can +; compute uniform branch weights using the quadratic formula. +; - It has no llvm.loop.estimated_trip_count. +; UR4-EUR: !6 = !{!"branch_weights", i32 1038564635, i32 1108919013} ; ------------------------------------------------------------------------------ ; Check -unroll-count=10. ; -; RUN: %{ur-bf} -unroll-count=10 | %{fc} UR10 -; -; Multiply do.body by 8 and add do.body.epil* to get the original loop body -; frequency, 11. -; UR10: - do.body: float = 0.6902, -; UR10: - do.body.epil: float = 4.098, +; RUN: %{ur-bf} -unroll-count=10 | %{fc} UR10,UR10-ELP +; RUN: %{ur-bf} -unroll-count=10 -unroll-remainder | %{fc} UR10,UR10-EUR +; +; Multiply do.body by 10 and add do.body.epil* for either ELP or EUR to get the +; original loop body frequency, 11. +; UR10: - do.body: float = 0.6902, +; UR10-ELP: - do.body.epil: float = 4.098, +; UR10-EUR: - do.body.epil: float = 1.0375, +; UR10-EUR: - do.body.epil.1: float = 0.80019, +; UR10-EUR: - do.body.epil.2: float = 0.61718, +; UR10-EUR: - do.body.epil.3: float = 0.47602, +; UR10-EUR: - do.body.epil.4: float = 0.36715, +; UR10-EUR: - do.body.epil.5: float = 0.28318, +; UR10-EUR: - do.body.epil.6: float = 0.21841, +; UR10-EUR: - do.body.epil.7: float = 0.16846, +; UR10-EUR: - do.body.epil.8: float = 0.12993, ; ; Unrolled loop guard, body, and latch. 
; UR10: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0 ; UR10-COUNT-10: call void @f ; UR10: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2 ; -; Epilogue guard and loop. +; Epilogue guard. ; UR10: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5 -; UR10: call void @f -; UR10: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 +; +; Non-unrolled epilogue loop. +; UR10-ELP: call void @f +; UR10-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 +; +; Partially unrolled epilogue loop. +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6 +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !6 +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.3, label %do.end.epilog-lcssa, !prof !6 +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.4, label %do.end.epilog-lcssa, !prof !6 +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.5, label %do.end.epilog-lcssa, !prof !6 +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.6, label %do.end.epilog-lcssa, !prof !6 +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.7, label %do.end.epilog-lcssa, !prof !6 +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.8, label %do.end.epilog-lcssa, !prof !6 +; UR10-EUR: call void @f +; UR10-EUR: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 ; ; Unrolled loop metadata. ; UR10: !0 = !{!"branch_weights", i32 1236740947, i32 910742701} @@ -130,30 +199,69 @@ ; UR10: !4 = !{!"llvm.loop.unroll.disable"} ; UR10: !5 = !{!"branch_weights", i32 1829762672, i32 317720976} ; -; Epilogue loop metadata. 
Its llvm.loop.estimated_trip_count happens to be the -; same as the unrolled loop's, so there's no new metadata node. -; UR10: !6 = !{!"branch_weights", i32 1656332913, i32 491150735} -; UR10: !7 = distinct !{!7, ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; The unrolled epilogue loop does not lose any conditional branches, so: +; - The non-unrolled epilogue branch weights are shared across them. +; - This is our first case where the unrolled epilogue loop has an +; llvm.loop.estimated_trip_count. However, it happens to be the same as the +; unrolled loop's, so there's no new metadata node. +; UR10: !6 = !{!"branch_weights", i32 1656332913, i32 491150735} +; UR10-ELP: !7 = distinct !{!7, !3, !4} +; UR10-EUR: !7 = distinct !{!7, !3} ; ------------------------------------------------------------------------------ ; Check -unroll-count=11. ; -; RUN: %{ur-bf} -unroll-count=11 | %{fc} UR11 -; -; Multiply do.body by 11 and add do.body.epil* to get the original loop body -; frequency, 11. -; UR11: - do.body: float = 0.59359, -; UR11: - do.body.epil: float = 4.4705, +; RUN: %{ur-bf} -unroll-count=11 | %{fc} UR11,UR11-ELP +; RUN: %{ur-bf} -unroll-count=11 -unroll-remainder | %{fc} UR11,UR11-EUR +; +; Multiply do.body by 11 and add do.body.epil* for either ELP or EUR to get the +; original loop body frequency, 11. +; UR11: - do.body: float = 0.59359, +; UR11-ELP: - do.body.epil: float = 4.4705, +; UR11-EUR: - do.body.epil: float = 1.0428, +; UR11-EUR: - do.body.epil.1: float = 0.82209, +; UR11-EUR: - do.body.epil.2: float = 0.64812, +; UR11-EUR: - do.body.epil.3: float = 0.51097, +; UR11-EUR: - do.body.epil.4: float = 0.40284, +; UR11-EUR: - do.body.epil.5: float = 0.31759, +; UR11-EUR: - do.body.epil.6: float = 0.25038, +; UR11-EUR: - do.body.epil.7: float = 0.1974, +; UR11-EUR: - do.body.epil.8: float = 0.15562, +; UR11-EUR: - do.body.epil.9: float = 0.12269, ; ; Unrolled loop guard, body, and latch. 
; UR11: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0 ; UR11-COUNT-11: call void @f ; UR11: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2 - -; Epilogue guard and loop. +; +; Epilogue guard. ; UR11: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5 -; UR11: call void @f -; UR11: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 +; +; Non-unrolled epilogue loop. +; UR11-ELP: call void @f +; UR11-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 +; +; Partially unrolled epilogue loop. +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.3, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.4, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.5, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.6, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.7, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.8, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.9, label %do.end.epilog-lcssa, !prof !6 +; UR11-EUR: call void @f +; UR11-EUR: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 ; ; Unrolled loop metadata. 
; UR11: !0 = !{!"branch_weights", i32 1319535738, i32 827947910} @@ -163,30 +271,74 @@ ; UR11: !4 = !{!"llvm.loop.unroll.disable"} ; UR11: !5 = !{!"branch_weights", i32 1846907894, i32 300575754} ; -; Epilogue loop metadata. -; UR11: !6 = !{!"branch_weights", i32 1693034047, i32 454449601} -; UR11: !7 = distinct !{!7, !8, !4} -; UR11: !8 = !{!"llvm.loop.estimated_trip_count", i32 0} +; The unrolled epilogue loop does not lose any conditional branches, so: +; - The non-unrolled epilogue branch weights are shared across them. +; - The unrolled epilogue loop has an llvm.loop.estimated_trip_count. This is +; our first case where it is different than the unrolled loop's, so it has its +; own metadata node. But it happens to be the same as the non-unrolled +; epilogue loop's. +; UR11: !6 = !{!"branch_weights", i32 1693034047, i32 454449601} +; UR11-ELP: !7 = distinct !{!7, !8, !4} +; UR11-EUR: !7 = distinct !{!7, !8} +; UR11: !8 = !{!"llvm.loop.estimated_trip_count", i32 0} ; ------------------------------------------------------------------------------ ; Check -unroll-count=12. ; -; RUN: %{ur-bf} -unroll-count=12 | %{fc} UR12 -; -; Multiply do.body by 12 and add do.body.epil* to get the original loop body -; frequency, 11. -; UR12: - do.body: float = 0.5144, -; UR12: - do.body.epil: float = 4.8272, +; RUN: %{ur-bf} -unroll-count=12 | %{fc} UR12,UR12-ELP +; RUN: %{ur-bf} -unroll-count=12 -unroll-remainder | %{fc} UR12,UR12-EUR +; +; Multiply do.body by 12 and add do.body.epil* for either ELP or EUR to get the +; original loop body frequency, 11. 
+; UR12: - do.body: float = 0.5144, +; UR12-ELP: - do.body.epil: float = 4.8272, +; UR12-EUR: - do.body.epil: float = 1.0463, +; UR12-EUR: - do.body.epil.1: float = 0.83968, +; UR12-EUR: - do.body.epil.2: float = 0.67387, +; UR12-EUR: - do.body.epil.3: float = 0.5408, +; UR12-EUR: - do.body.epil.4: float = 0.43401, +; UR12-EUR: - do.body.epil.5: float = 0.3483, +; UR12-EUR: - do.body.epil.6: float = 0.27952, +; UR12-EUR: - do.body.epil.7: float = 0.22433, +; UR12-EUR: - do.body.epil.8: float = 0.18003, +; UR12-EUR: - do.body.epil.9: float = 0.14448, +; UR12-EUR: - do.body.epil.10: float = 0.11595, ; ; Unrolled loop guard, body, and latch. ; UR12: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0 ; UR12-COUNT-12: call void @f ; UR12: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2 ; -; Epilogue guard and loop. +; Epilogue guard. ; UR12: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5 -; UR12: call void @f -; UR12: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 +; +; Non-unrolled epilogue loop. +; UR12-ELP: call void @f +; UR12-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 +; +; Partially unrolled epilogue loop. 
+; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.3, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.4, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.5, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.6, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.7, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.8, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.9, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.10, label %do.end.epilog-lcssa, !prof !6 +; UR12-EUR: call void @f +; UR12-EUR: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7 ; ; Unrolled loop metadata. ; UR12: !0 = !{!"branch_weights", i32 1394803730, i32 752679918} @@ -196,10 +348,16 @@ ; UR12: !4 = !{!"llvm.loop.unroll.disable"} ; UR12: !5 = !{!"branch_weights", i32 1860963812, i32 286519836} ; -; Epilogue loop metadata. -; UR12: !6 = !{!"branch_weights", i32 1723419551, i32 424064097} -; UR12: !7 = distinct !{!7, !8, !4} -; UR12: !8 = !{!"llvm.loop.estimated_trip_count", i32 11} +; The unrolled epilogue loop does not lose any conditional branches, so: +; - The non-unrolled epilogue branch weights are shared across them. +; - The unrolled epilogue loop has an llvm.loop.estimated_trip_count. 
This is +; our first case where it is different than both the unrolled loop's and the +; non-unrolled epilogue loop's, so they all have distinct metadata nodes. +; UR12: !6 = !{!"branch_weights", i32 1723419551, i32 424064097} +; UR12-ELP: !7 = distinct !{!7, !8, !4} +; UR12-ELP: !8 = !{!"llvm.loop.estimated_trip_count", i32 11} +; UR12-EUR: !7 = distinct !{!7, !8} +; UR12-EUR: !8 = !{!"llvm.loop.estimated_trip_count", i32 1} declare void @f(i32) diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll new file mode 100644 index 0000000000000..dafb2a3ca4ed9 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll @@ -0,0 +1,280 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after partial loop unrolling without -unroll-runtime such that +; some iterations' latches become unconditional, which often contradicts the +; original branch weights. +; +; (unroll-complete.ll tests complete loop unrolling, in which the final unrolled +; iteration unconditionally exits (backedge removed). Here, we include cases +; where the final iteration's latch unconditionally continues instead.) +; +; For each case, we check: +; - Iteration frequencies +; - When each is multiplied by the number of original loop bodies that execute +; within it, they should sum to almost exactly the original loop body +; frequency. +; - The only exception is an impossibly high or low original frequency (e.g., +; due to bad profile data), for which there exist no new branch weights that +; can yield that frequency sum. In those cases, we expect the maximum or +; minimum possible frequency. +; - CFGs +; - We verify which branch weights go with which branches and that we did not +; overlook any other branch weights (no extra !prof or branch_weights). 
+; - We also check the number of original loop bodies (represented by a call to +; @f) that appear within each unrolled iteration. +; - Branch weight metadata +; - Checking frequencies already checks whether the branch weights have the +; expected effect, but we also want to check that we get uniform +; probabilities/weights (same !prof) across the unrolled iteration latches +; when expected. +; - llvm.loop.estimated_trip_count +; - It must be the number of iterations of the unrolled loop required for the +; original loop body to reach its original frequency. +; - It must not be blindly computed from any new latch branch weights. + +; ------------------------------------------------------------------------------ +; Define LIT substitutions. +; +; For verifying that the test code produces the original loop body frequency we +; expect. +; DEFINE: %{bf-fc} = opt %t.ll -S -passes='print' 2>&1 | \ +; DEFINE: FileCheck %s -check-prefixes +; +; For checking the unrolled loop: +; DEFINE: %{ur-bf} = opt %t.ll -S -passes='loop-unroll,print' 2>&1 +; DEFINE: %{fc} = FileCheck %s \ +; DEFINE: -implicit-check-not='llvm.loop.estimated_trip_count' \ +; DEFINE: -implicit-check-not='!prof' \ +; DEFINE: -implicit-check-not='branch_weights' \ +; DEFINE: -implicit-check-not='call void @f' -check-prefixes + +; ------------------------------------------------------------------------------ +; Check cases when the original loop's number of iterations is a run-time +; determined multiple of 10 and the original loop body frequency is 10. +; +; RUN: sed -e s/@N@/%mul10/ -e s/@W@/9/ %s > %t.ll +; +; At compile time, possibilities for that value always include unroll count x 10 +; x N for any integer N >= 1, so the unrolled loop's backedge always remains +; conditional, so we check cases where it becomes unconditional later in this +; test file with the CONST4 config. +; +; Check the original loop body frequency. 
+; +; RUN: %{bf-fc} MULT-ORIG +; MULT-ORIG: - do.body: float = 10.0, +; +; When the unroll count is odd, every iteration's latch remains conditional, so +; their original probabilities are not contradicted. That is, the original loop +; latch's branch weights remain on all unrolled iterations' latches. +; +; RUN: %{ur-bf} -unroll-count=3 | %{fc} MULT3 +; +; Sums to approximately the original loop body frequency, 10. +; MULT3: - do.body: float = 3.69, +; MULT3: - do.body.1: float = 3.321, +; MULT3: - do.body.2: float = 2.9889, +; +; MULT3: call void @f +; MULT3: br i1 %{{.*}}, label %do.body.1, label %do.end, !prof !0 +; MULT3: call void @f +; MULT3: br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0 +; MULT3: call void @f +; MULT3: br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1 +; +; MULT3: !0 = !{!"branch_weights", i32 9, i32 1} +; MULT3: !1 = distinct !{!1, !2, !3} +; MULT3: !2 = !{!"llvm.loop.estimated_trip_count", i32 4} +; MULT3: !3 = !{!"llvm.loop.unroll.disable"} +; +; When the unroll count is even, odd-numbered unrolled iterations become +; unconditional, so branch weights must be adjusted. +; +; -unroll-count=2, so there is 1 remaining conditional latch, so the +; implementation can compute uniform weights by solving a linear equation. +; +; RUN: %{ur-bf} -unroll-count=2 | %{fc} MULT2 +; +; Multiply by 2 to get the original loop body frequency, 10. +; FIXME: Should sum to 5.0: +; MULT2: - do.body: float = 10.0, +; +; MULT2: call void @f +; MULT2-NOT: br +; MULT2: call void @f +; MULT2: br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1{{$}} +; +; The branch weights imply the estimated trip count is +; (1717986918+429496730)/429496730 = approximately (8+2)/2 = 5. +; FIXME: Or at least they should. 
+; MULT2: !0 = !{!"branch_weights", i32 9, i32 1} +; MULT2: !1 = distinct !{!1, !2, !3} +; MULT2: !2 = !{!"llvm.loop.estimated_trip_count", i32 5} +; MULT2: !3 = !{!"llvm.loop.unroll.disable"} +; +; -unroll-count=4, so there are 2 remaining conditional latches, so the +; implementation can compute uniform weights using the quadratic formula. +; +; RUN: %{ur-bf} -unroll-count=4 | %{fc} MULT4 +; +; Multiply by 2 and sum to get the original loop body frequency, 10. +; FIXME: Should sum to 5.0: +; MULT4: - do.body: float = 5.2632, +; MULT4: - do.body.2: float = 4.7368, +; +; MULT4: call void @f +; MULT4-NOT: br +; MULT4: call void @f +; MULT4: br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0 +; MULT4: call void @f +; MULT4-NOT: br +; MULT4: call void @f +; MULT4: br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1 +; +; MULT4 is like applying -unroll-count=2 to MULT2 without converting any +; more conditional latches to unconditional, so MULT2's branch weights work. +; MULT4: !0 = !{!"branch_weights", i32 9, i32 1} +; MULT4: !1 = distinct !{!1, !2, !3} +; MULT4: !2 = !{!"llvm.loop.estimated_trip_count", i32 3} +; MULT4: !3 = !{!"llvm.loop.unroll.disable"} + +; ------------------------------------------------------------------------------ +; Check case when the original loop's number of iterations is a run-time +; determined multiple of 10, the unroll count is even so that odd-numbered +; unrolled iterations become unconditional, and the original loop body frequency +; is 1, which is impossibly low. This case is important to ensure the +; implementation does not malfunction by trying to use negative and possibly +; infinite probabilities to reach the original loop body frequency. +; +; RUN: sed -e s/@N@/%mul10/ -e s/@W@/0/ %s > %t.ll +; +; Check the original loop body frequency. +; +; RUN: %{bf-fc} LOW-ORIG +; LOW-ORIG: - do.body: float = 1.0, +; +; -unroll-count=2, so there is 1 remaining conditional latch. 
The +; implementation tries to compute uniform weights by solving a linear equation +; but ultimately sets the latch's probability to zero. +; +; RUN: %{ur-bf} -unroll-count=2 | %{fc} LOW2 +; +; Multiply by 2, but the result is greater than the original loop body +; frequency, 1, which is impossibly low. +; LOW2: - do.body: float = 1.0, +; +; LOW2: call void @f +; LOW2-NOT: br +; LOW2: call void @f +; LOW2: br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1{{$}} +; +; LOW2: !0 = !{!"branch_weights", i32 0, i32 1} +; LOW2: !1 = distinct !{!1, !2, !3} +; LOW2: !2 = !{!"llvm.loop.estimated_trip_count", i32 1} +; LOW2: !3 = !{!"llvm.loop.unroll.disable"} +; +; -unroll-count=4, so there are 2 remaining conditional latches. The +; implementation tries to compute uniform weights using the quadratic formula +; but ultimately sets both latches' probabilities to zero. +; +; RUN: %{ur-bf} -unroll-count=4 | %{fc} LOW4 +; +; Multiply by 2 and sum, but the result is greater than the original loop body +; frequency, 1, which is impossibly low. +; LOW4: - do.body: float = 1.0, +; LOW4: - do.body.2: float = 0.0{{(0000[0-9]*)?}}, +; +; LOW4: call void @f +; LOW4-NOT: br +; LOW4: call void @f +; LOW4: br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0 +; LOW4: call void @f +; LOW4-NOT: br +; LOW4: call void @f +; LOW4: br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1 +; +; LOW4: !0 = !{!"branch_weights", i32 0, i32 1} +; LOW4: !1 = distinct !{!1, !2, !3} +; LOW4: !2 = !{!"llvm.loop.estimated_trip_count", i32 1} +; LOW4: !3 = !{!"llvm.loop.unroll.disable"} + +; ------------------------------------------------------------------------------ +; Check cases when the original loop's number of iterations is a constant 10 and +; the original loop body frequency is 10. 
+; +; RUN: sed -e s/@N@/10/g -e s/@W@/9/ %s > %t.ll +; +; Because we test only partial unrolling, there is always exactly one unrolled +; iteration that can possibly exit, so only its latch can remain conditional. +; Because there is only one, its branch weights can be computed with a simple +; formula. +; +; Check the original loop body frequency. +; +; RUN: %{bf-fc} CONST-ORIG +; CONST-ORIG: - do.body: float = 10.0, +; +; Check when the unrolled loop's backedge remains conditional. +; +; RUN: %{ur-bf} -unroll-count=2 | %{fc} CONST2 +; +; Multiply by 2 to get the original loop body frequency, 10. +; FIXME: Should be 5.0: +; CONST2: - do.body: float = 10.0, +; +; CONST2: call void @f +; CONST2-NOT: br +; CONST2: call void @f +; CONST2: br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1 +; +; Like MULT2. +; CONST2: !0 = !{!"branch_weights", i32 9, i32 1} +; CONST2: !1 = distinct !{!1, !2, !3} +; CONST2: !2 = !{!"llvm.loop.estimated_trip_count", i32 5} +; CONST2: !3 = !{!"llvm.loop.unroll.disable"} +; +; Check when the unrolled loop's backedge unconditionally continues. +; +; RUN: %{ur-bf} -unroll-count=4 | %{fc} CONST4 +; +; Multiply by 2 and sum to get the original loop body frequency, 10. +; FIXME: Should sum to 5.0: +; CONST4: - do.body: float = 10.0, +; CONST4: - do.body.2: float = 9.0, +; +; CONST4: call void @f +; CONST4-NOT: br +; CONST4: call void @f +; CONST4: br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0 +; CONST4: call void @f +; CONST4-NOT: br +; CONST4: call void @f +; CONST4: br label %do.body, !llvm.loop !1 +; +; There is no llvm.loop.estimated_trip_count because the unrolled loop's latch +; in do.body.2 unconditionally continues. The branch weights on do.body's +; branch imply do.body continues twice and then exits once, thus executing the +; original loop body 10 times.
+; CONST4: !0 = !{!"branch_weights", i32 9, i32 1} +; CONST4: !1 = distinct !{!1, !2} +; CONST4: !2 = !{!"llvm.loop.unroll.disable"} + +declare void @f(i32) + +define void @test(i32 %n) { +entry: + %mul10 = mul i32 %n, 10 + br label %do.body + +do.body: + %i = phi i32 [ 0, %entry ], [ %next, %do.body ] + call void @f(i32 %i) + %next = add i32 %i, 1 + %c = icmp ne i32 %next, @N@ + br i1 %c, label %do.body, label %do.end, !prof !0 + +do.end: + ret void +} + +; Loop body frequency is @W@ + 1. +!0 = !{!"branch_weights", i32 @W@, i32 1} diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll index af5342c5e35cd..ea6f4a4180fc9 100644 --- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll @@ -1,5 +1,6 @@ ; Test branch weight metadata, estimated trip count metadata, and block -; frequencies after partial loop unrolling without -unroll-runtime. +; frequencies after partial loop unrolling without -unroll-runtime and without +; converting any iteration's latch to an unconditional branch. ; ------------------------------------------------------------------------------ ; RUN: opt < %s -S -passes='print' 2>&1 | \ diff --git a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll index 14f6da42df6b1..89915d29f5921 100644 --- a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll +++ b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll @@ -1,73 +1,97 @@ ; Check that a loop probability of one (indicating an always infinite loop) does ; not crash or otherwise break LoopUnroll behavior when it tries to compute new ; probabilities from it. -; -; That case indicates an always infinite loop. 
A remainder loop cannot be -; calculated at run time when the original loop is infinite as infinity % -; UnrollCount is undefined, so consistent remainder loop probabilities are -; difficult or impossible to reason about. The implementation chooses -; probabilities indicating that all remainder loop iterations will always -; execute. - -; DEFINE: %{unroll} = opt < %s -unroll-count=3 -passes=loop-unroll -S -; DEFINE: %{rt} = %{unroll} -unroll-runtime - -; RUN: %{unroll} | FileCheck %s -check-prefix UNROLL -; RUN: %{rt} -unroll-runtime-epilog=true | FileCheck %s -check-prefix EPILOG -; RUN: %{rt} -unroll-runtime-epilog=false | FileCheck %s -check-prefix PROLOG - -define void @test(i32 %n) { -entry: - br label %loop -loop: - %i = phi i32 [ 0, %entry ], [ %inc, %loop ] - %inc = add i32 %i, 1 - %c = icmp slt i32 %inc, %n - br i1 %c, label %loop, label %end, !prof !0 +; DEFINE: %{unroll} = opt < %t.ll -unroll-count=3 -passes=loop-unroll -S +; DEFINE: %{fc} = FileCheck %s \ +; DEFINE: -implicit-check-not='llvm.loop.estimated_trip_count' \ +; DEFINE: -implicit-check-not='!prof' \ +; DEFINE: -implicit-check-not='branch_weights' \ +; DEFINE: -implicit-check-not='call void @f' -check-prefixes -end: - ret void -} +; ------------------------------------------------------------------------------ +; A partially unrolled loop remains infinite. +; +; RUN: sed -e s/@N@/%n/ %s > %t.ll +; RUN: %{unroll} | %{fc} PART-ALL-COND +; +; PART-ALL-COND: call void @f +; PART-ALL-COND: br i1 %{{.*}}, label %loop.1, label %end, !prof !0 +; PART-ALL-COND: call void @f +; PART-ALL-COND: br i1 %{{.*}}, label %loop.2, label %end, !prof !0 +; PART-ALL-COND: call void @f +; PART-ALL-COND: br i1 %{{.*}}, label %loop, label %end, !prof !0, !llvm.loop !1 +; PART-ALL-COND: !0 = !{!"branch_weights", i32 1, i32 0} +; ------------------------------------------------------------------------------ +; A partially unrolled loop remains infinite even if some iterations' latches +; become unconditional. 
+; +; RUN: sed -e s/@N@/5/ %s > %t.ll +; RUN: %{unroll} | %{fc} PART-SOME-COND +; +; PART-SOME-COND: call void @f +; PART-SOME-COND-NOT: br +; PART-SOME-COND: call void @f +; PART-SOME-COND: br i1 %{{.*}}, label %loop.2, label %end, !prof !0 +; PART-SOME-COND: call void @f +; PART-SOME-COND: br label %loop, !llvm.loop !1 +; PART-SOME-COND: !0 = !{!"branch_weights", i32 1, i32 0} -!0 = !{!"branch_weights", i32 1, i32 0} +; ------------------------------------------------------------------------------ +; A completely unrolled loop cannot be infinite, so consistent unrolled loop +; probabilities are impossible. The implementation chooses probabilities +; indicating that all unrolled loop iterations will always execute. +; +; RUN: sed -e s/@N@/%max3/ %s > %t.ll +; RUN: %{unroll} | %{fc} COMPLETE-SOME-COND +; +; COMPLETE-SOME-COND: call void @f +; COMPLETE-SOME-COND: br i1 %{{.*}}, label %loop.1, label %end, !prof !0 +; COMPLETE-SOME-COND: call void @f +; COMPLETE-SOME-COND: br i1 %{{.*}}, label %loop.2, label %end, !prof !0 +; COMPLETE-SOME-COND: call void @f +; COMPLETE-SOME-COND: br label %end +; COMPLETE-SOME-COND: !0 = !{!"branch_weights", i32 1, i32 0} -; UNROLL: define void @test(i32 %n) { -; UNROLL: entry: -; UNROLL: br label %loop -; UNROLL: loop: -; UNROLL: br i1 %c, label %loop.1, label %end, !prof !0 -; UNROLL: loop.1: -; UNROLL: br i1 %c.1, label %loop.2, label %end, !prof !0 -; UNROLL: loop.2: -; UNROLL: br i1 %c.2, label %loop, label %end, !prof !0, !llvm.loop !1 -; UNROLL-NOT: loop.3 -; UNROLL: end: -; UNROLL: ret void -; UNROLL: } -; -; Infinite unrolled loop. -; UNROLL: !0 = !{!"branch_weights", i32 1, i32 0} +; ------------------------------------------------------------------------------ +; A completely unrolled loop with no remaining conditional latches gives the +; implementation no probabilities to set. Check that it still behaves. 
+; +; RUN: sed -e s/@N@/3/ %s > %t.ll +; RUN: %{unroll} | %{fc} COMPLETE-NO-COND +; +; COMPLETE-NO-COND: call void @f +; COMPLETE-NO-COND-NOT: br +; COMPLETE-NO-COND: call void @f +; COMPLETE-NO-COND-NOT: br +; COMPLETE-NO-COND: call void @f -; EPILOG: define void @test(i32 %n) { -; EPILOG: entry: -; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0 -; EPILOG: entry.new: -; EPILOG: br label %loop -; EPILOG: loop: -; EPILOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1 -; EPILOG: end.unr-lcssa: -; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1 -; EPILOG: loop.epil.preheader: -; EPILOG: br label %loop.epil -; EPILOG: loop.epil: -; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4 -; EPILOG: end.epilog-lcssa: -; EPILOG: br label %end -; EPILOG: end: -; EPILOG: ret void -; EPILOG: } +; ------------------------------------------------------------------------------ +; A remainder loop cannot be calculated at run time when the original loop is +; infinite as infinity % UnrollCount is undefined, so consistent remainder loop +; probabilities are difficult or impossible to reason about. The implementation +; chooses probabilities indicating that all remainder loop iterations will +; always execute. +; +; RUN: sed -e s/@N@/%n/ %s > %t.ll +; DEFINE: %{rt} = %{unroll} -unroll-runtime +; RUN: %{rt} -unroll-runtime-epilog=true | %{fc} EPILOG +; RUN: %{rt} -unroll-runtime-epilog=false | %{fc} PROLOG +; +; Unrolled loop guard, body, and latch. +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0 +; EPILOG: call void @f +; EPILOG-NOT: br +; EPILOG: call void @f +; EPILOG-NOT: br +; EPILOG: call void @f +; EPILOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1 +; +; Epilogue guard, body, and latch. 
+; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1 +; EPILOG: call void @f +; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4 ; ; Unrolled loop guard: Unrolled loop is always entered. ; EPILOG: !0 = !{!"branch_weights", i32 0, i32 -2147483648} @@ -78,27 +102,20 @@ end: ; ; Epilogue loop latch: Epilogue loop executes both of its 2 iterations. ; EPILOG: !4 = !{!"branch_weights", i32 1073741824, i32 1073741824} - -; PROLOG: define void @test(i32 %n) { -; PROLOG: entry: -; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0 -; PROLOG: loop.prol.preheader: -; PROLOG: br label %loop.prol -; PROLOG: loop.prol: -; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1 -; PROLOG: loop.prol.loopexit.unr-lcssa: -; PROLOG: br label %loop.prol.loopexit -; PROLOG: loop.prol.loopexit: -; PROLOG: br i1 %{{.*}}, label %end, label %entry.new, !prof !0 -; PROLOG: entry.new: -; PROLOG: br label %loop -; PROLOG: loop: -; PROLOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4 -; PROLOG: end.unr-lcssa: -; PROLOG: br label %end -; PROLOG: end: -; PROLOG: ret void -; PROLOG: } +; +; Prologue guard, body, and latch. +; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0 +; PROLOG: call void @f +; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1 +; +; Unrolled loop guard, body, and latch. +; PROLOG: br i1 %{{.*}}, label %end, label %entry.new, !prof !0 +; PROLOG: call void @f +; PROLOG-NOT: br +; PROLOG: call void @f +; PROLOG-NOT: br +; PROLOG: call void @f +; PROLOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4 ; ; FIXME: Branch weights still need to be fixed in the case of prologues (issue ; #135812), so !0 and !1 do not yet match their comments below. When we do @@ -114,3 +131,23 @@ end: ; ; Unrolled loop latch: Unrolled loop is infinite. 
; PROLOG: !4 = !{!"branch_weights", i32 1, i32 0} + +declare void @f(i32) + +define void @test(i32 %n) { +entry: + %max3 = call i32 @llvm.umin.i32(i32 %n, i32 3) + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + call void @f(i32 %i) + %inc = add i32 %i, 1 + %c = icmp slt i32 %inc, @N@ + br i1 %c, label %loop, label %end, !prof !0 + +end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 0}