Skip to content

Commit

Permalink
Improve PGO support for the new inliner
Browse files Browse the repository at this point in the history
This adds the following to the new PM based inliner in PGO mode:

* Use block frequency analysis to derive callsite's profile count and use
that to adjust thresholds of hot and cold callsites.

* Incrementally update the BFI of the caller after a callee gets inlined
into it. This incremental update happens only within a single invocation of
the run method - BFI is not preserved across calls to run.

* Update the function entry count of the callee after inlining it into a
caller.

* I've tuned the thresholds for the hot and cold callsites by running a
hacked-up version of the old inliner, which explicitly computes BFI, on a
set of internal benchmarks and SPEC. Once the new PM based pipeline
stabilizes (IIRC Chandler mentioned there are known issues) I'll benchmark
this again and adjust the thresholds if required.
Inliner PGO support.

Differential revision: https://reviews.llvm.org/D28331

llvm-svn: 292666
  • Loading branch information
Easwaran Raman committed Jan 20, 2017
1 parent 760ad4d commit 12585b0
Show file tree
Hide file tree
Showing 13 changed files with 408 additions and 31 deletions.
6 changes: 6 additions & 0 deletions llvm/include/llvm/Analysis/InlineCost.h
Expand Up @@ -21,6 +21,7 @@

namespace llvm {
class AssumptionCacheTracker;
class BlockFrequencyInfo;
class CallSite;
class DataLayout;
class Function;
Expand Down Expand Up @@ -137,6 +138,9 @@ struct InlineParams {

/// Threshold to use when the callsite is considered hot.
Optional<int> HotCallSiteThreshold;

/// Threshold to use when the callsite is considered cold.
Optional<int> ColdCallSiteThreshold;
};

/// Generate the parameters to tune the inline cost analysis based only on the
Expand Down Expand Up @@ -171,6 +175,7 @@ InlineCost
getInlineCost(CallSite CS, const InlineParams &Params,
TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI);

/// \brief Get an InlineCost with the callee explicitly specified.
Expand All @@ -182,6 +187,7 @@ InlineCost
getInlineCost(CallSite CS, Function *Callee, const InlineParams &Params,
TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI);

/// \brief Minimal filter to detect invalid constructs for inlining.
Expand Down
9 changes: 7 additions & 2 deletions llvm/include/llvm/Transforms/Utils/Cloning.h
Expand Up @@ -33,6 +33,7 @@ namespace llvm {

class AllocaInst;
class BasicBlock;
class BlockFrequencyInfo;
class CallInst;
class CallGraph;
class DominatorTree;
Expand Down Expand Up @@ -173,13 +174,17 @@ class InlineFunctionInfo {
public:
explicit InlineFunctionInfo(CallGraph *cg = nullptr,
std::function<AssumptionCache &(Function &)>
*GetAssumptionCache = nullptr)
: CG(cg), GetAssumptionCache(GetAssumptionCache) {}
*GetAssumptionCache = nullptr,
BlockFrequencyInfo *CallerBFI = nullptr,
BlockFrequencyInfo *CalleeBFI = nullptr)
: CG(cg), GetAssumptionCache(GetAssumptionCache), CallerBFI(CallerBFI),
CalleeBFI(CalleeBFI) {}

/// CG - If non-null, InlineFunction will update the callgraph to reflect the
/// changes it makes.
CallGraph *CG;
std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
BlockFrequencyInfo *CallerBFI, *CalleeBFI;

/// StaticAllocas - InlineFunction fills this in with all static allocas that
/// get copied into the caller.
Expand Down
57 changes: 40 additions & 17 deletions llvm/lib/Analysis/InlineCost.cpp
Expand Up @@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
Expand Down Expand Up @@ -48,6 +49,11 @@ static cl::opt<int> HintThreshold(
"inlinehint-threshold", cl::Hidden, cl::init(325),
cl::desc("Threshold for inlining functions with inline hint"));

// Hidden command-line option -inline-cold-callsite-threshold (default 45):
// the inlining threshold used for callsites that are considered cold.
static cl::opt<int>
ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
cl::init(45),
cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
Expand All @@ -72,6 +78,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// Getter for the cache of @llvm.assume intrinsics.
std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

/// Getter for BlockFrequencyInfo
Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

/// Profile summary information.
ProfileSummaryInfo *PSI;

Expand Down Expand Up @@ -203,19 +212,21 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
public:
CallAnalyzer(const TargetTransformInfo &TTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
const InlineParams &Params)
: TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee),
CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
ExposesReturnsTwice(false), HasDynamicAlloca(false),
ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
NumVectorInstructions(0), FiftyPercentVectorBonus(0),
TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
SROACostSavings(0), SROACostSavingsLost(0) {}
: TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
PSI(PSI), F(Callee), CandidateCS(CSArg), Params(Params),
Threshold(Params.DefaultThreshold), Cost(0), IsCallerRecursive(false),
IsRecursiveCall(false), ExposesReturnsTwice(false),
HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
NumInstructionsSimplified(0), SROACostSavings(0),
SROACostSavingsLost(0) {}

bool analyzeCall(CallSite CS);

Expand Down Expand Up @@ -658,16 +669,21 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
if (Callee.hasFnAttribute(Attribute::InlineHint))
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
if (PSI) {
uint64_t TotalWeight;
if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
PSI->isHotCount(TotalWeight)) {
BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
if (PSI->isHotCallSite(CS, CallerBFI)) {
DEBUG(dbgs() << "Hot callsite.\n");
Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
} else if (PSI->isFunctionEntryHot(&Callee)) {
DEBUG(dbgs() << "Hot callee.\n");
// If callsite hotness can not be determined, we may still know
// that the callee is hot and treat it as a weaker hint for threshold
// increase.
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
} else if (PSI->isColdCallSite(CS, CallerBFI)) {
DEBUG(dbgs() << "Cold callsite.\n");
Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
} else if (PSI->isFunctionEntryCold(&Callee)) {
DEBUG(dbgs() << "Cold callee.\n");
Threshold = MinIfValid(Threshold, Params.ColdThreshold);
}
}
Expand Down Expand Up @@ -975,7 +991,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
// out. Pretend to inline the function, with a custom threshold.
auto IndirectCallParams = Params;
IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams);
CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
IndirectCallParams);
if (CA.analyzeCall(CS)) {
// We were able to inline the indirect call! Subtract the cost from the
// threshold to get the bonus we want to apply, but don't go below zero.
Expand Down Expand Up @@ -1465,15 +1482,17 @@ static bool functionsHaveCompatibleAttributes(Function *Caller,
InlineCost llvm::getInlineCost(
CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI) {
return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
GetAssumptionCache, PSI);
GetAssumptionCache, GetBFI, PSI);
}

InlineCost llvm::getInlineCost(
CallSite CS, Function *Callee, const InlineParams &Params,
TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI) {

// Cannot inline indirect calls.
Expand Down Expand Up @@ -1508,7 +1527,8 @@ InlineCost llvm::getInlineCost(
DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
<< "...\n");

CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params);
CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
Params);
bool ShouldInline = CA.analyzeCall(CS);

DEBUG(CA.dump());
Expand Down Expand Up @@ -1581,6 +1601,9 @@ InlineParams llvm::getInlineParams(int Threshold) {
// Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
Params.HotCallSiteThreshold = HotCallSiteThreshold;

// Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

// Set the OptMinSizeThreshold and OptSizeThreshold params only if the
// -inlinehint-threshold commandline option is not explicitly given. If that
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/IPO/InlineSimple.cpp
Expand Up @@ -61,7 +61,8 @@ class SimpleInliner : public LegacyInlinerBase {
[&](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI);
return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache,
/*GetBFI=*/None, PSI);
}

bool runOnSCC(CallGraphSCC &SCC) override;
Expand Down
18 changes: 13 additions & 5 deletions llvm/lib/Transforms/IPO/Inliner.cpp
Expand Up @@ -19,6 +19,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
Expand Down Expand Up @@ -765,15 +766,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
[&](Function &F) -> AssumptionCache & {
return FAM.getResult<AssumptionAnalysis>(F);
};

// Setup the data structure used to plumb customization into the
// `InlineFunction` routine.
InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache);
auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
return FAM.getResult<BlockFrequencyAnalysis>(F);
};

auto GetInlineCost = [&](CallSite CS) {
Function &Callee = *CS.getCalledFunction();
auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI);
return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI},
PSI);
};

// We use a worklist of nodes to process so that we can handle if the SCC
Expand Down Expand Up @@ -843,6 +844,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (!shouldInline(CS, GetInlineCost, ORE))
continue;

// Setup the data structure used to plumb customization into the
// `InlineFunction` routine.
InlineFunctionInfo IFI(
/*cg=*/nullptr, &GetAssumptionCache,
&FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
&FAM.getResult<BlockFrequencyAnalysis>(Callee));

if (!InlineFunction(CS, IFI))
continue;
DidInline = true;
Expand Down
72 changes: 68 additions & 4 deletions llvm/lib/Transforms/Utils/InlineFunction.cpp
Expand Up @@ -20,6 +20,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/EHPersonalities.h"
Expand All @@ -40,8 +41,8 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>

using namespace llvm;
Expand Down Expand Up @@ -1393,6 +1394,56 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
}
}
}
/// Propagate block frequencies to the blocks that inlining cloned into the
/// caller.
///
/// Every cloned block first receives the frequency its original had in the
/// callee. The clone of the callee's entry block is then assigned the
/// frequency of the block containing the callsite, and the whole set of cloned
/// blocks is rescaled along with it (a factor of CallSiteFreq/CalleeEntryFreq),
/// so the clones keep the same frequencies relative to one another that their
/// originals had in the callee.
static void updateCallerBFI(BasicBlock *CallSiteBlock,
                            const ValueToValueMapTy &VMap,
                            BlockFrequencyInfo *CallerBFI,
                            BlockFrequencyInfo *CalleeBFI,
                            const BasicBlock &CalleeEntryBlock) {
  SmallPtrSet<BasicBlock *, 16> InlinedBlocks;

  // Seed each clone with the unscaled frequency of the callee block it was
  // copied from. Only basic-block entries with a live mapped value matter.
  for (auto const &Mapping : VMap) {
    if (isa<BasicBlock>(Mapping.first) && Mapping.second) {
      auto *SourceBB = cast<BasicBlock>(Mapping.first);
      auto *InlinedBB = cast<BasicBlock>(Mapping.second);
      InlinedBlocks.insert(InlinedBB);
      uint64_t CalleeFreq = CalleeBFI->getBlockFreq(SourceBB).getFrequency();
      CallerBFI->setBlockFreq(InlinedBB, CalleeFreq);
    }
  }

  // Pin the cloned entry block to the callsite block's frequency and let BFI
  // rescale the remaining clones to match.
  BasicBlock *InlinedEntry =
      cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
  uint64_t CallSiteFreq =
      CallerBFI->getBlockFreq(CallSiteBlock).getFrequency();
  CallerBFI->setBlockFreqAndScale(InlinedEntry, CallSiteFreq, InlinedBlocks);
}

/// Lower the callee's function entry count to account for a callsite that has
/// just been inlined.
///
/// With an original callee entry count of N and an estimated callsite count
/// of M, the callee's entry count becomes N - M, clamped at 0. M is the
/// profile count that the caller's BFI reports for the callsite block. If
/// either count is unavailable the callee is left unchanged.
static void updateCalleeCount(BlockFrequencyInfo &CallerBFI, BasicBlock *CallBB,
                              Function *Callee) {
  const Optional<uint64_t> EntryCount = Callee->getEntryCount();
  if (!EntryCount)
    return;

  const Optional<uint64_t> SiteCount = CallerBFI.getBlockProfileCount(CallBB);
  if (!SiteCount)
    return;

  const uint64_t OldEntryCount = EntryCount.getValue();
  const uint64_t InlinedCount = SiteCount.getValue();

  // The callsite count is only an estimate and may exceed the callee's
  // recorded entry count; in that case the new entry count saturates at 0
  // instead of wrapping around.
  const uint64_t NewEntryCount =
      InlinedCount > OldEntryCount ? 0 : OldEntryCount - InlinedCount;
  Callee->setEntryCount(NewEntryCount);
}

/// This function inlines the called function into the basic block of the
/// caller. This returns false if it is not possible to inline this call.
Expand All @@ -1410,8 +1461,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,

// If IFI has any state in it, zap it before we fill it in.
IFI.reset();
const Function *CalledFunc = CS.getCalledFunction();

Function *CalledFunc = CS.getCalledFunction();
if (!CalledFunc || // Can't inline external function or indirect
CalledFunc->isDeclaration() || // call, or call to a vararg function!
CalledFunc->getFunctionType()->isVarArg()) return false;
Expand Down Expand Up @@ -1578,10 +1629,17 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
/*ModuleLevelChanges=*/false, Returns, ".i",
&InlinedFunctionInfo, TheCall);

// Remember the first block that is newly cloned over.
FirstNewBlock = LastBlock; ++FirstNewBlock;

if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr) {
// Update the BFI of blocks cloned into the caller.
updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
CalledFunc->front());
// Update the profile count of callee.
updateCalleeCount(*IFI.CallerBFI, OrigBB, CalledFunc);
}

// Inject byval arguments initialization.
for (std::pair<Value*, Value*> &Init : ByValInit)
HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
Expand Down Expand Up @@ -2087,6 +2145,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
CalledFunc->getName() + ".exit");
}

if (IFI.CallerBFI) {
// Copy original BB's block frequency to AfterCallBB
IFI.CallerBFI->setBlockFreq(
AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
}

// Change the branch that used to go to AfterCallBB to branch to the first
// basic block of the inlined function.
//
Expand Down

0 comments on commit 12585b0

Please sign in to comment.