Skip to content

Commit

Permalink
Compute estimated trip counts for multiple exit loops
Browse files Browse the repository at this point in the history
This change allows us to estimate trip count from profile metadata for all multiple exit loops. We still do the estimate only from the latch, but that's fine as it causes us to over estimate the trip count at worst.

Reviewing the uses of the API, all but one are cases where we restrict a loop transformation (unroll, and vectorize respectively) when we know the trip count is short enough. So, as a result, the change makes these passes strictly less aggressive. The test change illustrates a case where we'd previously have runtime unrolled a loop which ran fewer iterations than the unroll factor. This is definitely unprofitable.

The one case where an upper bound on estimate trip count could drive a more aggressive transform is peeling, and I duplicated the logic being removed from the generic estimation there to keep it the same. The resulting heuristic makes no sense and should probably be immediately removed, but we can do that in a separate change.

This was noticed when analyzing regressions on D113939.

I plan to come back and incorporate estimated trip counts from other exits, but that's a minor improvement which can follow separately.

Differential Revision: https://reviews.llvm.org/D115362
  • Loading branch information
preames committed Dec 9, 2021
1 parent 06ca0a2 commit 2d31b02
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 138 deletions.
27 changes: 27 additions & 0 deletions llvm/lib/Transforms/Utils/LoopPeel.cpp
Expand Up @@ -333,6 +333,31 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
return DesiredPeelCount;
}

/// This "heuristic" exactly matches implicit behavior which used to exist
/// inside getLoopEstimatedTripCount. It was added here to keep an
/// improvement inside that API from causing peeling to become more agressive.
/// This should probably be removed.
static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
return true;

BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
return true;

assert((LatchBR->getSuccessor(0) == L->getHeader() ||
LatchBR->getSuccessor(1) == L->getHeader()) &&
"At least one edge out of the latch must go to the header");

SmallVector<BasicBlock *, 4> ExitBlocks;
L->getUniqueNonLatchExitBlocks(ExitBlocks);
return any_of(ExitBlocks, [](const BasicBlock *EB) {
return !EB->getTerminatingDeoptimizeCall();
});
}


// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
Expand Down Expand Up @@ -436,6 +461,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
// We only do this in the presence of profile information, since otherwise
// our estimates of the trip count are not reliable enough.
if (L->getHeader()->getParent()->hasProfileData()) {
if (violatesLegacyMultiExitLoopCheck(L))
return;
Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
if (!PeelCount)
return;
Expand Down
22 changes: 9 additions & 13 deletions llvm/lib/Transforms/Utils/LoopUtils.cpp
Expand Up @@ -773,8 +773,8 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
}


/// Checks if \p L has single exit through latch block except possibly
/// "deoptimizing" exits. Returns branch instruction terminating the loop
/// Checks if \p L has an exiting latch branch. There may also be other
/// exiting blocks. Returns branch instruction terminating the loop
/// latch if above check is successful, nullptr otherwise.
static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
BasicBlock *Latch = L->getLoopLatch();
Expand All @@ -789,21 +789,16 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
LatchBR->getSuccessor(1) == L->getHeader()) &&
"At least one edge out of the latch must go to the header");

SmallVector<BasicBlock *, 4> ExitBlocks;
L->getUniqueNonLatchExitBlocks(ExitBlocks);
if (any_of(ExitBlocks, [](const BasicBlock *EB) {
return !EB->getTerminatingDeoptimizeCall();
}))
return nullptr;

return LatchBR;
}

Optional<unsigned>
llvm::getLoopEstimatedTripCount(Loop *L,
unsigned *EstimatedLoopInvocationWeight) {
// Support loops with an exiting latch and other existing exists only
// deoptimize.
// Currently we take the estimate exit count only from the loop latch,
// ignoring other exiting blocks. This can overestimate the trip count
// if we exit through another exit, but can never underestimate it.
// TODO: incorporate information from other exits
BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
if (!LatchBranch)
return None;
Expand Down Expand Up @@ -834,8 +829,9 @@ llvm::getLoopEstimatedTripCount(Loop *L,

bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
unsigned EstimatedloopInvocationWeight) {
// Support loops with an exiting latch and other existing exists only
// deoptimize.
// At the moment, we currently support changing the estimate trip count of
// the latch branch only. We could extend this API to manipulate estimated
// trip counts for any exit.
BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
if (!LatchBranch)
return false;
Expand Down
135 changes: 10 additions & 125 deletions llvm/test/Transforms/LoopUnroll/runtime-multiexit-heuristic.ll
Expand Up @@ -608,139 +608,24 @@ define i32 @test3(i32* nocapture %a, i64 %n) !prof !{!"function_entry_count", i6
;
; ENABLED-LABEL: @test3(
; ENABLED-NEXT: entry:
; ENABLED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1
; ENABLED-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7
; ENABLED-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
; ENABLED-NEXT: br i1 [[TMP1]], label [[LATCHEXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
; ENABLED: entry.new:
; ENABLED-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
; ENABLED-NEXT: br label [[HEADER:%.*]]
; ENABLED: header:
; ENABLED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[LATCH_7:%.*]] ]
; ENABLED-NEXT: [[SUM_02:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[ADD_7:%.*]], [[LATCH_7]] ]
; ENABLED-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], [[LATCH_7]] ]
; ENABLED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; ENABLED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[ADD:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK:%.*]]
; ENABLED: for.exiting_block:
; ENABLED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP]], label [[OTHEREXIT_LOOPEXIT:%.*]], label [[LATCH:%.*]]
; ENABLED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N:%.*]], 42
; ENABLED-NEXT: br i1 [[CMP]], label [[OTHEREXIT:%.*]], label [[LATCH]]
; ENABLED: latch:
; ENABLED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
; ENABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; ENABLED-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[SUM_02]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
; ENABLED-NEXT: [[NITER_NEXT:%.*]] = add nuw nsw i64 [[NITER]], 1
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK_1:%.*]]
; ENABLED: for.exiting_block.1:
; ENABLED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP_1]], label [[OTHEREXIT_LOOPEXIT]], label [[LATCH_1:%.*]]
; ENABLED: latch.1:
; ENABLED-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
; ENABLED-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; ENABLED-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP3]], [[ADD]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT]], 1
; ENABLED-NEXT: [[NITER_NEXT_1:%.*]] = add nuw nsw i64 [[NITER_NEXT]], 1
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK_2:%.*]]
; ENABLED: for.exiting_block.2:
; ENABLED-NEXT: [[CMP_2:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP_2]], label [[OTHEREXIT_LOOPEXIT]], label [[LATCH_2:%.*]]
; ENABLED: latch.2:
; ENABLED-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; ENABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; ENABLED-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP4]], [[ADD_1]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
; ENABLED-NEXT: [[NITER_NEXT_2:%.*]] = add nuw nsw i64 [[NITER_NEXT_1]], 1
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK_3:%.*]]
; ENABLED: for.exiting_block.3:
; ENABLED-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP_3]], label [[OTHEREXIT_LOOPEXIT]], label [[LATCH_3:%.*]]
; ENABLED: latch.3:
; ENABLED-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; ENABLED-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; ENABLED-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP5]], [[ADD_2]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
; ENABLED-NEXT: [[NITER_NEXT_3:%.*]] = add nuw nsw i64 [[NITER_NEXT_2]], 1
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK_4:%.*]]
; ENABLED: for.exiting_block.4:
; ENABLED-NEXT: [[CMP_4:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP_4]], label [[OTHEREXIT_LOOPEXIT]], label [[LATCH_4:%.*]]
; ENABLED: latch.4:
; ENABLED-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]]
; ENABLED-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
; ENABLED-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP6]], [[ADD_3]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_3]], 1
; ENABLED-NEXT: [[NITER_NEXT_4:%.*]] = add nuw nsw i64 [[NITER_NEXT_3]], 1
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK_5:%.*]]
; ENABLED: for.exiting_block.5:
; ENABLED-NEXT: [[CMP_5:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP_5]], label [[OTHEREXIT_LOOPEXIT]], label [[LATCH_5:%.*]]
; ENABLED: latch.5:
; ENABLED-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_4]]
; ENABLED-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
; ENABLED-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP7]], [[ADD_4]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_4]], 1
; ENABLED-NEXT: [[NITER_NEXT_5:%.*]] = add nuw nsw i64 [[NITER_NEXT_4]], 1
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK_6:%.*]]
; ENABLED: for.exiting_block.6:
; ENABLED-NEXT: [[CMP_6:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP_6]], label [[OTHEREXIT_LOOPEXIT]], label [[LATCH_6:%.*]]
; ENABLED: latch.6:
; ENABLED-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_5]]
; ENABLED-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
; ENABLED-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP8]], [[ADD_5]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_5]], 1
; ENABLED-NEXT: [[NITER_NEXT_6:%.*]] = add nuw nsw i64 [[NITER_NEXT_5]], 1
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK_7:%.*]]
; ENABLED: for.exiting_block.7:
; ENABLED-NEXT: [[CMP_7:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP_7]], label [[OTHEREXIT_LOOPEXIT]], label [[LATCH_7]]
; ENABLED: latch.7:
; ENABLED-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_6]]
; ENABLED-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
; ENABLED-NEXT: [[ADD_7]] = add nsw i32 [[TMP9]], [[ADD_6]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT_7]] = add i64 [[INDVARS_IV_NEXT_6]], 1
; ENABLED-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER_NEXT_6]], 1
; ENABLED-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; ENABLED-NEXT: br i1 [[NITER_NCMP_7]], label [[LATCHEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[HEADER]], !prof [[PROF4:![0-9]+]]
; ENABLED: latchexit.unr-lcssa.loopexit:
; ENABLED-NEXT: [[SUM_0_LCSSA_PH_PH:%.*]] = phi i32 [ [[ADD_7]], [[LATCH_7]] ]
; ENABLED-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7]], [[LATCH_7]] ]
; ENABLED-NEXT: [[SUM_02_UNR_PH:%.*]] = phi i32 [ [[ADD_7]], [[LATCH_7]] ]
; ENABLED-NEXT: br label [[LATCHEXIT_UNR_LCSSA]]
; ENABLED: latchexit.unr-lcssa:
; ENABLED-NEXT: [[SUM_0_LCSSA_PH:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[SUM_0_LCSSA_PH_PH]], [[LATCHEXIT_UNR_LCSSA_LOOPEXIT]] ]
; ENABLED-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_UNR_PH]], [[LATCHEXIT_UNR_LCSSA_LOOPEXIT]] ]
; ENABLED-NEXT: [[SUM_02_UNR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_02_UNR_PH]], [[LATCHEXIT_UNR_LCSSA_LOOPEXIT]] ]
; ENABLED-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; ENABLED-NEXT: br i1 [[LCMP_MOD]], label [[HEADER_EPIL_PREHEADER:%.*]], label [[LATCHEXIT:%.*]]
; ENABLED: header.epil.preheader:
; ENABLED-NEXT: br label [[HEADER_EPIL:%.*]]
; ENABLED: header.epil:
; ENABLED-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_EPIL:%.*]], [[LATCH_EPIL:%.*]] ], [ [[INDVARS_IV_UNR]], [[HEADER_EPIL_PREHEADER]] ]
; ENABLED-NEXT: [[SUM_02_EPIL:%.*]] = phi i32 [ [[ADD_EPIL:%.*]], [[LATCH_EPIL]] ], [ [[SUM_02_UNR]], [[HEADER_EPIL_PREHEADER]] ]
; ENABLED-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, [[HEADER_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], [[LATCH_EPIL]] ]
; ENABLED-NEXT: br label [[FOR_EXITING_BLOCK_EPIL:%.*]]
; ENABLED: for.exiting_block.epil:
; ENABLED-NEXT: [[CMP_EPIL:%.*]] = icmp eq i64 [[N]], 42
; ENABLED-NEXT: br i1 [[CMP_EPIL]], label [[OTHEREXIT_LOOPEXIT2:%.*]], label [[LATCH_EPIL]]
; ENABLED: latch.epil:
; ENABLED-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_EPIL]]
; ENABLED-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_EPIL]], align 4
; ENABLED-NEXT: [[ADD_EPIL]] = add nsw i32 [[TMP10]], [[SUM_02_EPIL]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add i64 [[INDVARS_IV_EPIL]], 1
; ENABLED-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[N]]
; ENABLED-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; ENABLED-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; ENABLED-NEXT: br i1 [[EPIL_ITER_CMP]], label [[HEADER_EPIL]], label [[LATCHEXIT_EPILOG_LCSSA:%.*]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
; ENABLED: latchexit.epilog-lcssa:
; ENABLED-NEXT: [[SUM_0_LCSSA_PH1:%.*]] = phi i32 [ [[ADD_EPIL]], [[LATCH_EPIL]] ]
; ENABLED-NEXT: br label [[LATCHEXIT]]
; ENABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; ENABLED-NEXT: [[ADD]] = add nsw i32 [[TMP0]], [[SUM_02]]
; ENABLED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; ENABLED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
; ENABLED-NEXT: br i1 [[EXITCOND]], label [[LATCHEXIT:%.*]], label [[HEADER]], !prof [[PROF4:![0-9]+]]
; ENABLED: latchexit:
; ENABLED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_0_LCSSA_PH]], [[LATCHEXIT_UNR_LCSSA]] ], [ [[SUM_0_LCSSA_PH1]], [[LATCHEXIT_EPILOG_LCSSA]] ]
; ENABLED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[ADD]], [[LATCH]] ]
; ENABLED-NEXT: ret i32 [[SUM_0_LCSSA]]
; ENABLED: otherexit.loopexit:
; ENABLED-NEXT: br label [[OTHEREXIT:%.*]]
; ENABLED: otherexit.loopexit2:
; ENABLED-NEXT: br label [[OTHEREXIT]]
; ENABLED: otherexit:
; ENABLED-NEXT: ret i32 57
;
Expand Down

0 comments on commit 2d31b02

Please sign in to comment.