Skip to content

Commit

Permalink
[LoopUnroll] Implement profile-based loop peeling
Browse files Browse the repository at this point in the history
This implements PGO-driven loop peeling.

The basic idea is that when the average dynamic trip-count of a loop is known,
based on PGO, to be low, we can expect a performance win by peeling off the
first several iterations of that loop.
Unlike unrolling based on a known trip count, or a trip count multiple, this
doesn't save us the conditional check and branch on each iteration. However,
it does allow us to simplify the straight-line code we get (constant-folding,
etc.). This is important because, for such loops, execution will usually only
reach this peeled code and not the actual loop.

This is currently disabled by default.

Differential Revision: https://reviews.llvm.org/D25963

llvm-svn: 288274
  • Loading branch information
mkuperst committed Nov 30, 2016
1 parent aa8b28e commit b151a64
Show file tree
Hide file tree
Showing 9 changed files with 635 additions and 33 deletions.
7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ class TargetTransformInfo {
/// transformation will select an unrolling factor based on the current cost
/// threshold and other factors.
unsigned Count;
/// A forced peeling factor (the number of bodies of the original loop
/// that should be peeled off before the loop body). When set to 0, the
/// unrolling transformation will select a peeling factor based on profile
/// information and other factors.
unsigned PeelCount;
/// Default unroll count for loops with run-time trip count.
unsigned DefaultUnrollRuntimeCount;
// Set the maximum unrolling factor. The unrolling factor may be selected
Expand Down Expand Up @@ -298,6 +303,8 @@ class TargetTransformInfo {
bool Force;
/// Allow using trip count upper bound to unroll loops.
bool UpperBound;
/// Allow peeling off loop iterations for loops with low dynamic trip count.
bool AllowPeeling;
};

/// \brief Get target-customized preferences for the generic loop unrolling
Expand Down
13 changes: 11 additions & 2 deletions llvm/include/llvm/Transforms/Utils/UnrollLoop.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#ifndef LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H
#define LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H

// Needed because we can't forward-declare the nested struct
// TargetTransformInfo::UnrollingPreferences
#include "llvm/Analysis/TargetTransformInfo.h"

namespace llvm {

Expand All @@ -33,8 +36,8 @@ class ScalarEvolution;
bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
bool AllowRuntime, bool AllowExpensiveTripCount,
bool PreserveCondBr, bool PreserveOnlyFirst,
unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC,
unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, bool PreserveLCSSA);

bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
Expand All @@ -43,6 +46,12 @@ bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
ScalarEvolution *SE, DominatorTree *DT,
bool PreserveLCSSA);

/// Decide whether, and by how many iterations, loop \p L should be peeled.
///
/// \p LoopSize is the size estimate of the loop supplied by the caller.
/// \p UP is taken by mutable reference; computeUnrollCount checks
/// UP.PeelCount right after this call and treats a non-zero value as "peel
/// this loop".
/// NOTE(review): presumably the result is written to UP.PeelCount and is
/// gated on UP.AllowPeeling -- confirm against the definition in
/// LoopUnrollPeel.cpp.
void computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP);

/// Peel \p PeelCount iterations off loop \p L.
///
/// UnrollLoop calls this when its PeelCount argument is non-zero, forwarding
/// its own \p LI, \p SE, \p DT and \p PreserveLCSSA.
/// NOTE(review): the meaning of the bool result is not visible here
/// (presumably true when the loop was actually peeled) and the UnrollLoop
/// call site ignores it -- confirm against the definition.
bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, bool PreserveLCSSA);

MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
}

Expand Down
49 changes: 35 additions & 14 deletions llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstVisitor.h"
Expand Down Expand Up @@ -108,6 +107,11 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
"threshold, the loop is considered as flat and will be less "
"aggressively unrolled."));

/// Command-line override for loop peeling. When the flag appears on the
/// command line, gatherUnrollingPreferences copies its value into
/// UP.AllowPeeling, replacing the built-in default (false) and any
/// target-specific setting.
static cl::opt<bool>
UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden,
cl::desc("Allows loops to be peeled when the dynamic "
"trip count is known to be low."));

/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
Expand All @@ -129,6 +133,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.PartialThreshold = UP.Threshold;
UP.PartialOptSizeThreshold = 0;
UP.Count = 0;
UP.PeelCount = 0;
UP.DefaultUnrollRuntimeCount = 8;
UP.MaxCount = UINT_MAX;
UP.FullUnrollMaxCount = UINT_MAX;
Expand All @@ -139,6 +144,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.AllowExpensiveTripCount = false;
UP.Force = false;
UP.UpperBound = false;
UP.AllowPeeling = false;

// Override with any target specific settings
TTI.getUnrollingPreferences(L, UP);
Expand Down Expand Up @@ -171,6 +177,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Runtime = UnrollRuntime;
if (UnrollMaxUpperBound == 0)
UP.UpperBound = false;
if (UnrollAllowPeeling.getNumOccurrences() > 0)
UP.AllowPeeling = UnrollAllowPeeling;

// Apply user values provided by argument
if (UserThreshold.hasValue()) {
Expand Down Expand Up @@ -754,16 +762,6 @@ static bool computeUnrollCount(
bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
PragmaEnableUnroll || UserUnrollCount;

// Check if the runtime trip count is too small when profile is available.
if (L->getHeader()->getParent()->getEntryCount() && TripCount == 0) {
if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
if (*ProfileTripCount < FlatLoopTripCountThreshold)
return false;
else
UP.AllowExpensiveTripCount = true;
}
}

if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with
// unrolling limits. Set thresholds to at least the PragmaThreshold value
Expand Down Expand Up @@ -878,12 +876,31 @@ static bool computeUnrollCount(
<< "Unable to fully unroll loop as directed by unroll(full) pragma "
"because loop has a runtime trip count.");

// 5th priority is runtime unrolling.
// 5th priority is loop peeling
computePeelCount(L, LoopSize, UP);
if (UP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
return ExplicitUnroll;
}

// 6th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
if (HasRuntimeUnrollDisablePragma(L)) {
UP.Count = 0;
return false;
}

// Check if the runtime trip count is too small when profile is available.
if (L->getHeader()->getParent()->getEntryCount()) {
if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
if (*ProfileTripCount < FlatLoopTripCountThreshold)
return false;
else
UP.AllowExpensiveTripCount = true;
}
}

// Reduce count based on the type of unrolling and the threshold values.
UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
if (!UP.Runtime) {
Expand Down Expand Up @@ -1042,13 +1059,17 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// Unroll the loop.
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
TripMultiple, LI, SE, &DT, &AC, &ORE, PreserveLCSSA))
TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE,
PreserveLCSSA))
return false;

// If loop has an unroll count pragma or unrolled by explicitly set count
// mark loop as unrolled to prevent unrolling beyond that requested.
if (IsCountSetExplicitly)
// If the loop was peeled, we already "used up" the profile information
// we had, so we don't want to unroll or peel again.
if (IsCountSetExplicitly || UP.PeelCount)
SetLoopAlreadyUnrolled(L);

return true;
}

Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ add_llvm_library(LLVMTransformUtils
Local.cpp
LoopSimplify.cpp
LoopUnroll.cpp
LoopUnrollPeel.cpp
LoopUnrollRuntime.cpp
LoopUtils.cpp
LoopVersioning.cpp
Expand Down
36 changes: 26 additions & 10 deletions llvm/lib/Transforms/Utils/LoopUnroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,16 +202,21 @@ static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
/// AllowExpensiveTripCount is false.
///
/// If we want to perform PGO-based loop peeling, PeelCount is set to the
/// number of iterations we want to peel off.
///
/// The LoopInfo Analysis that is passed will be kept consistent.
///
/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
/// DominatorTree if they are non-null.
bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
bool AllowRuntime, bool AllowExpensiveTripCount,
bool PreserveCondBr, bool PreserveOnlyFirst,
unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) {
unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
bool PreserveLCSSA) {

BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
Expand Down Expand Up @@ -257,9 +262,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
if (TripCount != 0 && Count > TripCount)
Count = TripCount;

// Don't enter the unroll code if there is nothing to do. This way we don't
// need to support "partial unrolling by 1".
if (TripCount == 0 && Count < 2)
// Don't enter the unroll code if there is nothing to do.
if (TripCount == 0 && Count < 2 && PeelCount == 0)
return false;

assert(Count > 0);
Expand Down Expand Up @@ -288,6 +292,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
// flag is specified.
bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);

assert((!RuntimeTripCount || !PeelCount) &&
"Did not expect runtime trip-count unrolling "
"and peeling for the same loop");

if (PeelCount)
peelLoop(L, PeelCount, LI, SE, DT, PreserveLCSSA);

// Loops containing convergent instructions must have a count that divides
// their TripMultiple.
DEBUG(
Expand All @@ -301,9 +312,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
"Unroll count must divide trip multiple if loop contains a "
"convergent operation.");
});
// Don't output the runtime loop remainder if Count is a multiple of
// TripMultiple. Such a remainder is never needed, and is unsafe if the loop
// contains a convergent instruction.

if (RuntimeTripCount && TripMultiple % Count != 0 &&
!UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
UnrollRuntimeEpilog, LI, SE, DT,
Expand Down Expand Up @@ -339,6 +348,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
L->getHeader())
<< "completely unrolled loop with "
<< NV("UnrollCount", TripCount) << " iterations");
} else if (PeelCount) {
DEBUG(dbgs() << "PEELING loop %" << Header->getName()
<< " with iteration count " << PeelCount << "!\n");
ORE->emit(OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
L->getHeader())
<< " peeled loop by " << NV("PeelCount", PeelCount)
<< " iterations");
} else {
OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
L->getHeader());
Expand Down Expand Up @@ -628,7 +644,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
DEBUG(DT->verifyDomTree());

// Simplify any new induction variables in the partially unrolled loop.
if (SE && !CompletelyUnroll) {
if (SE && !CompletelyUnroll && Count > 1) {
SmallVector<WeakVH, 16> DeadInsts;
simplifyLoopIVs(L, SE, DT, LI, DeadInsts);

Expand Down
Loading

0 comments on commit b151a64

Please sign in to comment.