118 changes: 112 additions & 6 deletions llvm/lib/Transforms/IPO/Inliner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
Expand Down Expand Up @@ -47,10 +48,13 @@ STATISTIC(NumMergedAllocas, "Number of allocas merged together");
// if those would be more profitable and blocked inline steps.
STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");

Inliner::Inliner(char &ID) : CallGraphSCCPass(ID), InsertLifetime(true) {}
Inliner::Inliner(char &ID)
: CallGraphSCCPass(ID), InsertLifetime(true),
BFA(new BlockFrequencyAnalysis()) {}

Inliner::Inliner(char &ID, bool InsertLifetime)
: CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
: CallGraphSCCPass(ID), InsertLifetime(InsertLifetime),
BFA(new BlockFrequencyAnalysis()) {}

/// For this class, we declare that we require and preserve the call graph.
/// If the derived class implements this method, it should
Expand Down Expand Up @@ -259,7 +263,7 @@ bool Inliner::shouldInline(CallSite CS) {
Twine(IC.getCostDelta() + IC.getCost()) + ")");
return false;
}

// Try to detect the case where the current inlining candidate caller (call
// it B) is a static or linkonce-ODR function and is an inlining candidate
// elsewhere, and the current candidate callee (call it C) is large enough
Expand Down Expand Up @@ -356,8 +360,90 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
return false;
}

/// \brief Update the frequency of a block that is cloned into the caller.
///
/// This is invoked when \p OrigBB from the callee is cloned into \p NewBB in
/// the caller. The frequency recorded for \p NewBB is the number of times
/// \p OrigBB executes per invocation of the callee, multiplied by the
/// frequency of the block containing the call site:
///   Freq(NewBB) = Freq(OrigBB) * CallSiteFreq / CalleeEntryFreq.
///
/// \param CS     The call site being inlined.
/// \param OrigBB The callee block that was cloned.
/// \param NewBB  The clone of \p OrigBB.
///
/// Does nothing when no profile data is available (HasProfileData is false).
void Inliner::updateBlockFreq(CallSite &CS, const BasicBlock *OrigBB,
                              const BasicBlock *NewBB) {
  if (!HasProfileData)
    return;

  Instruction *Call = CS.getInstruction();
  BasicBlock *CallBB = Call->getParent();
  BlockFrequencyInfo *CalleeBFI =
      BFA->getBlockFrequencyInfo(CS.getCalledFunction());
  BlockFrequencyInfo *CallerBFI =
      BFA->getBlockFrequencyInfo(CallBB->getParent());

  const uint64_t CallSiteFreq = CallerBFI->getBlockFreq(CallBB).getFrequency();
  const uint64_t CalleeEntryFreq = CalleeBFI->getEntryFreq();
  // Frequency of OrigBB in the callee.
  const uint64_t OrigBBFreq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();

  uint64_t NewFreq = 0;
  // With a zero entry frequency the ratio below would be infinite or NaN, and
  // converting such a value to an integer is undefined behaviour. In that case
  // the cloned block is recorded with a frequency of zero.
  if (CalleeEntryFreq != 0) {
    const double Scaled =
        static_cast<double>(OrigBBFreq) / CalleeEntryFreq * CallSiteFreq;
    // static_cast<double>(UINT64_MAX) rounds up to 2^64, so any value below
    // it is representable as uint64_t; everything else saturates.
    NewFreq = Scaled >= static_cast<double>(UINT64_MAX)
                  ? UINT64_MAX
                  : static_cast<uint64_t>(Scaled);
  }
  CallerBFI->setBlockFreq(NewBB, NewFreq);
}

/// \brief Update entry count of \p Callee after it got inlined at a callsite
/// in block \p CallBB.
///
/// If the callee has an original count of N and the estimated count of the
/// callsite is M, the new callee count is N - M, or 0 when the estimate M
/// exceeds N. Nothing is changed when there is no profile data, when the
/// callee has no entry count, or when no count is available for \p CallBB.
void Inliner::updateEntryCount(BasicBlock *CallBB, Function *Callee) {
  if (!HasProfileData)
    return;

  Optional<uint64_t> OldEntryCount = Callee->getEntryCount();
  if (!OldEntryCount)
    return;

  // M is estimated from the caller's entry count, its entry block frequency
  // and the block frequency of the callsite.
  Optional<uint64_t> EstimatedSiteCount =
      llvm::getBlockCount(CallBB, BFA.get());
  if (!EstimatedSiteCount)
    return;

  const uint64_t N = OldEntryCount.getValue();
  const uint64_t M = EstimatedSiteCount.getValue();

  // Common case: the estimate fits within the callee's recorded count.
  if (M <= N) {
    Callee->setEntryCount(N - M);
    return;
  }

  // Since M is only an estimate, it can exceed the original callee count; the
  // count then has to be set to 0.
  Callee->setEntryCount(0);
  DEBUG(llvm::dbgs() << "Estimated count of block " << CallBB->getName()
                     << " is " << M
                     << " which exceeds the entry count "
                     << N << " of the callee "
                     << Callee->getName() << "\n");
}

/// Ask the block frequency analysis to invalidate the information it holds for
/// \p F. Does nothing when there is no profile data or when \p F is null.
void Inliner::invalidateBFI(Function *F) {
  if (HasProfileData && F)
    BFA->invalidateBlockFrequencyInfo(F);
}
/// Invalidate the block frequency information of every function in \p SCC.
/// Does nothing when there is no profile data; nodes without a function are
/// ignored by the per-function overload.
void Inliner::invalidateBFI(CallGraphSCC &SCC) {
  if (HasProfileData) {
    for (CallGraphNode *CGN : SCC)
      invalidateBFI(CGN->getFunction());
  }
}
/// Record for \p Dst the same block frequency that \p Src currently has, using
/// the block frequency information of the function containing \p Src.
/// Does nothing when there is no profile data.
void Inliner::copyBlockFrequency(BasicBlock *Src, BasicBlock *Dst) {
  if (HasProfileData) {
    BlockFrequencyInfo *ParentBFI =
        BFA->getBlockFrequencyInfo(Src->getParent());
    const uint64_t SrcFreq = ParentBFI->getBlockFreq(Src).getFrequency();
    ParentBFI->setBlockFreq(Dst, SrcFreq);
  }
}

/// Return true if module \p M carries profile data.
static bool hasProfileData(Module &M) {
  // The presence of MaxFunctionCount in the module is used as the indicator.
  // FIXME: This now only works for frontend based instrumentation.
  auto MaxFunctionCount = M.getMaximumFunctionCount();
  return MaxFunctionCount.hasValue();
}

bool Inliner::runOnSCC(CallGraphSCC &SCC) {
using namespace std::placeholders;
CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
HasProfileData = hasProfileData(CG.getModule());
ACT = &getAnalysis<AssumptionCacheTracker>();
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();

Expand Down Expand Up @@ -419,7 +505,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {


InlinedArrayAllocasTy InlinedArrayAllocas;
InlineFunctionInfo InlineInfo(&CG, ACT);

// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
Expand Down Expand Up @@ -448,6 +533,10 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
CS.getInstruction()->eraseFromParent();
++NumCallsDeleted;
} else {
Instruction *TheCall = CS.getInstruction();
BasicBlock *CallSiteBlock = TheCall->getParent();
Instruction *CallSuccessor = &*(++BasicBlock::iterator(TheCall));

// We can only inline direct calls to non-declarations.
if (!Callee || Callee->isDeclaration()) continue;

Expand Down Expand Up @@ -476,6 +565,11 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
continue;
}

BlockCloningFunctor BCF = nullptr;
if (HasProfileData)
BCF = std::bind(&Inliner::updateBlockFreq, this, CS, _1, _2);
InlineFunctionInfo InlineInfo(&CG, ACT, BCF);

// Attempt to inline the function.
if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas,
InlineHistoryID, InsertLifetime)) {
Expand All @@ -485,6 +579,13 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
Caller->getName()));
continue;
}
updateEntryCount(CallSiteBlock, Callee);
// The instruction following the call is part of a new basic block
// created during the inlining process. This does not have an entry in
// the BFI. We create an entry by copying the frequency of the original
// block containing the call.
copyBlockFrequency(CallSiteBlock, CallSuccessor->getParent());

++NumInlined;

// Report the inline decision.
Expand Down Expand Up @@ -523,7 +624,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
CalleeNode->removeAllCalledFunctions();

// Removing the node for callee from the call graph and delete it.
delete CG.removeFunctionFromModule(CalleeNode);
Function *F = CG.removeFunctionFromModule(CalleeNode);
invalidateBFI(F);
delete F;
++NumDeleted;
}

Expand All @@ -544,6 +647,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
}
} while (LocalChange);

invalidateBFI(SCC);
return Changed;
}

Expand Down Expand Up @@ -651,7 +755,9 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
FunctionsToRemove.end()),
FunctionsToRemove.end());
for (CallGraphNode *CGN : FunctionsToRemove) {
delete CG.removeFunctionFromModule(CGN);
Function *F = CG.removeFunctionFromModule(CGN);
invalidateBFI(F);
delete F;
++NumDeleted;
}
return true;
Expand Down
42 changes: 22 additions & 20 deletions llvm/lib/Transforms/Utils/CloneFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,17 +277,19 @@ namespace {

/// The specified block is found to be reachable, clone it and
/// anything that it can reach.
void CloneBlock(const BasicBlock *BB,
void CloneBlock(const BasicBlock *BB,
BasicBlock::const_iterator StartingInst,
std::vector<const BasicBlock*> &ToClone);
std::vector<const BasicBlock *> &ToClone,
BlockCloningFunctor Ftor = nullptr);
};
}

/// The specified block is found to be reachable, clone it and
/// anything that it can reach.
void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
BasicBlock::const_iterator StartingInst,
std::vector<const BasicBlock*> &ToClone){
std::vector<const BasicBlock *> &ToClone,
BlockCloningFunctor Ftor) {
WeakVH &BBEntry = VMap[BB];

// Have we already cloned this block?
Expand Down Expand Up @@ -424,18 +426,19 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
BB != &BB->getParent()->front();
}
// Call Ftor to tell BB has been cloned to NewBB
if (Ftor)
Ftor(BB, NewBB);
}

/// This works like CloneAndPruneFunctionInto, except that it does not clone the
/// entire function. Instead it starts at an instruction provided by the caller
/// and copies (and prunes) only the code reachable from that instruction.
void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
const Instruction *StartingInst,
ValueToValueMapTy &VMap,
bool ModuleLevelChanges,
SmallVectorImpl<ReturnInst *> &Returns,
const char *NameSuffix,
ClonedCodeInfo *CodeInfo) {
void llvm::CloneAndPruneIntoFromInst(
Function *NewFunc, const Function *OldFunc, const Instruction *StartingInst,
ValueToValueMapTy &VMap, bool ModuleLevelChanges,
SmallVectorImpl<ReturnInst *> &Returns, const char *NameSuffix,
ClonedCodeInfo *CodeInfo, BlockCloningFunctor Ftor) {
assert(NameSuffix && "NameSuffix cannot be null!");

ValueMapTypeRemapper *TypeMapper = nullptr;
Expand All @@ -461,11 +464,11 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,

// Clone the entry block, and anything recursively reachable from it.
std::vector<const BasicBlock*> CloneWorklist;
PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist, Ftor);
while (!CloneWorklist.empty()) {
const BasicBlock *BB = CloneWorklist.back();
CloneWorklist.pop_back();
PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
PFC.CloneBlock(BB, BB->begin(), CloneWorklist, Ftor);
}

// Loop over all of the basic blocks in the old function. If the block was
Expand Down Expand Up @@ -667,15 +670,14 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
/// constant arguments cause a significant amount of code in the callee to be
/// dead. Since this doesn't produce an exact copy of the input, it can't be
/// used for things like CloneFunction or CloneModule.
void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
ValueToValueMapTy &VMap,
bool ModuleLevelChanges,
SmallVectorImpl<ReturnInst*> &Returns,
const char *NameSuffix,
ClonedCodeInfo *CodeInfo,
Instruction *TheCall) {
void llvm::CloneAndPruneFunctionInto(
Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap,
bool ModuleLevelChanges, SmallVectorImpl<ReturnInst *> &Returns,
const char *NameSuffix, ClonedCodeInfo *CodeInfo, Instruction *TheCall,
BlockCloningFunctor Ftor) {
CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,
ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
ModuleLevelChanges, Returns, NameSuffix, CodeInfo,
Ftor);
}

/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Utils/InlineFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1319,7 +1319,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,

// If IFI has any state in it, zap it before we fill it in.
IFI.reset();

const Function *CalledFunc = CS.getCalledFunction();
if (!CalledFunc || // Can't inline external function or indirect
CalledFunc->isDeclaration() || // call, or call to a vararg function!
Expand Down Expand Up @@ -1486,7 +1486,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// happy with whatever the cloner can do.
CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
/*ModuleLevelChanges=*/false, Returns, ".i",
&InlinedFunctionInfo, TheCall);
&InlinedFunctionInfo, TheCall, IFI.Ftor);

// Remember the first block that is newly cloned over.
FirstNewBlock = LastBlock; ++FirstNewBlock;
Expand Down
27 changes: 27 additions & 0 deletions llvm/test/Transforms/Inline/function-count-update-2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
; RUN: opt < %s -inline -S | FileCheck %s

; This tests that the function count of a callee gets correctly updated after it
; has been inlined into two callsites.

; CHECK: @callee() !prof [[COUNT:![0-9]+]]
; Trivial callee with an entry count of 1000 (!1). It is called once from
; @caller1 and once from @caller2.
define i32 @callee() !prof !1 {
ret i32 0
}

; First caller, entry count 600 (!2). Its only block calls @callee.
define i32 @caller1() !prof !2 {
%i = call i32 @callee()
ret i32 %i
}

; Second caller, entry count 400 (!3). Its only block calls @callee.
define i32 @caller2() !prof !3 {
%i = call i32 @callee()
ret i32 %i
}

!llvm.module.flags = !{!0}
; CHECK: [[COUNT]] = !{!"function_entry_count", i64 0}
!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
!1 = !{!"function_entry_count", i64 1000}
!2 = !{!"function_entry_count", i64 600}
!3 = !{!"function_entry_count", i64 400}

69 changes: 69 additions & 0 deletions llvm/test/Transforms/Inline/function-count-update-3.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
; RUN: opt < %s -inline -S -inline-threshold=50 | FileCheck %s

; This tests that the function count of a function gets properly scaled after
; inlining a call chain leading to the function.
; Function a calls c with count 200 (C1)
; Function b calls c with count 300
; Function c calls e with count 250 (C2)
; Entry count of e is 500 (C3)
; c->e inlining does not happen since the cost exceeds threshold.
; c is then inlined into a.
; e now gets inlined into a (through c) since the branch condition in e is now
; known and hence the cost gets reduced.
; Estimated count of a->e callsite = C2 * (C1 / C3)
; Estimated count of a->e callsite = 250 * (200 / 500) = 100
; Remaining count of e = C3 - 100 = 500 - 100 = 400

@data = external global i32

; Entry count 200 (!1). Calls @c with the constant 1 as the second argument,
; which @c forwards to @e.
define i32 @a(i32 %a1) !prof !1 {
%a2 = call i32 @c(i32 %a1, i32 1)
ret i32 %a2
}

; Entry count 300 (!2). Calls @c passing %b1 for both arguments, so the value
; forwarded to @e is not a constant here.
define i32 @b(i32 %b1) !prof !2 {
%b2 = call i32 @c(i32 %b1, i32 %b1)
ret i32 %b2
}

; Entry count 500 (!3). When %c1 <= 1 (signed) it calls @e with %c100 and
; returns the result; otherwise it returns 0.
define i32 @c(i32 %c1, i32 %c100) !prof !3 {
%cond = icmp sle i32 %c1, 1
br i1 %cond, label %cond_true, label %cond_false

cond_false:
ret i32 0

cond_true:
%c11 = call i32 @e(i32 %c100)
ret i32 %c11
}

; CHECK: @e(i32 %c1) !prof [[COUNT:![0-9]+]]
; Entry count 500 (!4). Returns 0 when %c1 <= 1 (signed); otherwise it loads
; @data and runs a chain of add/mul instructions on it. That chain is the bulk
; of this function's body.
define i32 @e(i32 %c1) !prof !4 {
%cond = icmp sle i32 %c1, 1
br i1 %cond, label %cond_true, label %cond_false

cond_false:
%c2 = load i32, i32* @data, align 4
%c3 = add i32 %c1, %c2
%c4 = mul i32 %c3, %c2
%c5 = add i32 %c4, %c2
%c6 = mul i32 %c5, %c2
%c7 = add i32 %c6, %c2
%c8 = mul i32 %c7, %c2
%c9 = add i32 %c8, %c2
%c10 = mul i32 %c9, %c2
ret i32 %c10

cond_true:
ret i32 0
}

!llvm.module.flags = !{!0}
; CHECK: [[COUNT]] = !{!"function_entry_count", i64 400}
!0 = !{i32 1, !"MaxFunctionCount", i32 5000}
!1 = !{!"function_entry_count", i64 200}
!2 = !{!"function_entry_count", i64 300}
!3 = !{!"function_entry_count", i64 500}
!4 = !{!"function_entry_count", i64 500}

51 changes: 51 additions & 0 deletions llvm/test/Transforms/Inline/function-count-update.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
; RUN: opt < %s -inline -S | FileCheck %s
; RUN: opt < %s -always-inline -S | FileCheck %s

; This tests that the function counts of two callees get correctly updated after
; they have been inlined into two back-to-back callsites in a single basic block
; in the caller. The callees have the alwaysinline attribute and so they get
; inlined both with the regular inliner pass and the always inline pass. In
; both cases, the new count of each callee is the original count minus the
; callsite count, which is 200 (since the caller's entry count is 400 and the
; block containing the calls has a relative block frequency of 0.5).

; CHECK: @callee1(i32 %n) #0 !prof [[COUNT1:![0-9]+]]
; alwaysinline callee (#0) with entry count 1000 (!1). Contains a branch:
; returns %n + 1 when %n <= 10 (signed), otherwise %n + 2.
define i32 @callee1(i32 %n) #0 !prof !1 {
%cond = icmp sle i32 %n, 10
br i1 %cond, label %cond_true, label %cond_false

cond_true:
%r1 = add i32 %n, 1
ret i32 %r1
cond_false:
%r2 = add i32 %n, 2
ret i32 %r2
}

; CHECK: @callee2(i32 %n) #0 !prof [[COUNT2:![0-9]+]]
; alwaysinline callee (#0) with entry count 2000 (!2). Single block that
; returns %n + 1.
define i32 @callee2(i32 %n) #0 !prof !2 {
%r1 = add i32 %n, 1
ret i32 %r1
}

; Caller with entry count 400 (!3). When %n <= 100 (signed) it calls @callee1
; and then @callee2 back-to-back in the same block (cond_true); otherwise it
; returns 0.
define i32 @caller(i32 %n) !prof !3 {
%cond = icmp sle i32 %n, 100
br i1 %cond, label %cond_true, label %cond_false

cond_true:
%i = call i32 @callee1(i32 %n)
%j = call i32 @callee2(i32 %i)
ret i32 %j
cond_false:
ret i32 0
}

!llvm.module.flags = !{!0}
; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 800}
; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 1800}
!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
!1 = !{!"function_entry_count", i64 1000}
!2 = !{!"function_entry_count", i64 2000}
!3 = !{!"function_entry_count", i64 400}
attributes #0 = { alwaysinline }