From 7e86b13c63f200a5649234647433fc563e1159f5 Mon Sep 17 00:00:00 2001 From: wlei Date: Mon, 27 Jun 2022 23:00:05 -0700 Subject: [PATCH] [CSSPGO][llvm-profgen] Reimplement SampleContextTracker using context trie This is the followup patch to https://reviews.llvm.org/D125246 for the `SampleContextTracker` part. Before the promotion and merging of the context is based on the SampleContext(the array of frame), this causes a lot of cost to the memory. This patch detaches the tracker from using the array ref instead to use the context trie itself. This can save a lot of memory usage and benefit both the compiler's CS inliner and llvm-profgen's pre-inliner. One structure needs to be specially treated is the `FuncToCtxtProfiles`, this is used to get all the functionSamples for one function to do the merging and promoting. Before it search each functions' context and traverse the trie to get the node of the context. Now we don't have the context inside the profile, instead we directly use an auxiliary map `ProfileToNodeMap` for profile , it initialize to create the FunctionSamples to TrieNode relations and keep updating it during promoting and merging the node. Moreover, I was expecting the results before and after remain the same, but I found that the order of FuncToCtxtProfiles matter and affect the results. This can happen on recursive context case, but the difference should be small. Now we don't have the context, so I just used a vector for the order, the result is still deterministic. Measured on one huge size(12GB) profile from one of our internal service. The profile similarity difference is 99.999%, and the running time is improved by 3X(debug mode) and the memory is reduced from 170GB to 90GB. Reviewed By: hoy, wenlei Differential Revision: https://reviews.llvm.org/D127031 --- llvm/include/llvm/ProfileData/SampleProf.h | 10 -- .../Transforms/IPO/SampleContextTracker.h | 94 ++++++++++---- .../Transforms/IPO/SampleContextTracker.cpp | 122 +++++++++++------- llvm/lib/Transforms/IPO/SampleProfile.cpp | 3 +- llvm/tools/llvm-profgen/CSPreInliner.cpp | 83 ++++++------ llvm/tools/llvm-profgen/CSPreInliner.h | 7 +- llvm/tools/llvm-profgen/ProfileGenerator.cpp | 28 +--- llvm/tools/llvm-profgen/ProfiledBinary.cpp | 16 +-- llvm/tools/llvm-profgen/ProfiledBinary.h | 6 +- 9 files changed, 204 insertions(+), 165 deletions(-) diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 8d8813ecf8c13..f11392c053183 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -553,16 +553,6 @@ class SampleContext { } } - // Promote context by removing top frames with the length of - // `ContextFramesToRemove`. Note that with array representation of context, - // the promotion is effectively a slice operation with first - // `ContextFramesToRemove` elements removed from left. - void promoteOnPath(uint32_t ContextFramesToRemove) { - assert(ContextFramesToRemove <= FullContext.size() && - "Cannot remove more than the whole context"); - FullContext = FullContext.drop_front(ContextFramesToRemove); - } - // Decode context string for a frame to get function name and location. // `ContextStr` is in the form of `FuncName:StartLine.Discriminator`. static void decodeContextString(StringRef ContextStr, StringRef &FName, diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h index 3bc417c9d2d1b..a97d5ee3d7101 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h +++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h @@ -19,6 +19,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ProfileData/SampleProf.h" #include +#include #include namespace llvm { @@ -44,11 +45,6 @@ class ContextTrieNode { ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite, StringRef ChildName, bool AllowCreate = true); - - ContextTrieNode &moveToChildContext(const LineLocation &CallSite, - ContextTrieNode &&NodeToMove, - uint32_t ContextFramesToRemove, - bool DeleteNode = true); void removeChildContext(const LineLocation &CallSite, StringRef ChildName); std::map &getAllChildContext(); StringRef getFuncName() const; @@ -59,6 +55,7 @@ class ContextTrieNode { LineLocation getCallSiteLoc() const; ContextTrieNode *getParentContext() const; void setParentContext(ContextTrieNode *Parent); + void setCallSiteLoc(const LineLocation &Loc); void dumpNode(); void dumpTree(); @@ -91,23 +88,13 @@ class ContextTrieNode { // calling context and the context is identified by path from root to the node. class SampleContextTracker { public: - struct ProfileComparer { - bool operator()(FunctionSamples *A, FunctionSamples *B) const { - // Sort function profiles by the number of total samples and their - // contexts. - if (A->getTotalSamples() == B->getTotalSamples()) - return A->getContext() < B->getContext(); - return A->getTotalSamples() > B->getTotalSamples(); - } - }; - - // Keep profiles of a function sorted so that they will be processed/promoted - // deterministically. - using ContextSamplesTy = std::set; + using ContextSamplesTy = std::vector; SampleContextTracker() = default; SampleContextTracker(SampleProfileMap &Profiles, const DenseMap *GUIDToFuncNameMap); + // Populate the FuncToCtxtProfiles map after the trie is built. + void populateFuncToCtxtMap(); // Query context profile for a specific callee with given name at a given // call-site. The full context is identified by location of call instruction. FunctionSamples *getCalleeContextSamplesFor(const CallBase &Inst, @@ -145,6 +132,61 @@ class SampleContextTracker { // Create a merged conext-less profile map. void createContextLessProfileMap(SampleProfileMap &ContextLessProfiles); + ContextTrieNode * + getContextNodeForProfile(const FunctionSamples *FSamples) const { + auto I = ProfileToNodeMap.find(FSamples); + if (I == ProfileToNodeMap.end()) + return nullptr; + return I->second; + } + StringMap &getFuncToCtxtProfiles() { + return FuncToCtxtProfiles; + } + + class Iterator : public std::iterator { + std::queue NodeQueue; + + public: + explicit Iterator() = default; + explicit Iterator(ContextTrieNode *Node) { NodeQueue.push(Node); } + Iterator &operator++() { + assert(!NodeQueue.empty() && "Iterator already at the end"); + ContextTrieNode *Node = NodeQueue.front(); + NodeQueue.pop(); + for (auto &It : Node->getAllChildContext()) + NodeQueue.push(&It.second); + return *this; + } + + Iterator operator++(int) { + assert(!NodeQueue.empty() && "Iterator already at the end"); + Iterator Ret = *this; + ++(*this); + return Ret; + } + bool operator==(const Iterator &Other) const { + if (NodeQueue.empty() && Other.NodeQueue.empty()) + return true; + if (NodeQueue.empty() || Other.NodeQueue.empty()) + return false; + return NodeQueue.front() == Other.NodeQueue.front(); + } + bool operator!=(const Iterator &Other) const { return !(*this == Other); } + ContextTrieNode *operator*() const { + assert(!NodeQueue.empty() && "Invalid access to end iterator"); + return NodeQueue.front(); + } + }; + + Iterator begin() { return Iterator(&RootContext); } + Iterator end() { return Iterator(); } + +#ifndef NDEBUG + // Get a context string from root to current node. + std::string getContextString(const FunctionSamples &FSamples) const; + std::string getContextString(ContextTrieNode *Node) const; +#endif // Dump the internal context profile trie. void dump(); @@ -155,15 +197,23 @@ class SampleContextTracker { ContextTrieNode *getTopLevelContextNode(StringRef FName); ContextTrieNode &addTopLevelContextNode(StringRef FName); ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &NodeToPromo); - void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode, - uint32_t ContextFramesToRemove); + void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode); ContextTrieNode & promoteMergeContextSamplesTree(ContextTrieNode &FromNode, - ContextTrieNode &ToNodeParent, - uint32_t ContextFramesToRemove); + ContextTrieNode &ToNodeParent); + ContextTrieNode &moveContextSamples(ContextTrieNode &ToNodeParent, + const LineLocation &CallSite, + ContextTrieNode &&NodeToMove); + void setContextNode(const FunctionSamples *FSample, ContextTrieNode *Node) { + ProfileToNodeMap[FSample] = Node; + } // Map from function name to context profiles (excluding base profile) StringMap FuncToCtxtProfiles; + // Map from current FunctionSample to the belonged context trie. + std::unordered_map + ProfileToNodeMap; + // Map from function guid to real function names. Only used in md5 mode. const DenseMap *GUIDToFuncNameMap; diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index 2471f3271308a..52cdaa9f5ca0a 100644 --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -63,23 +63,24 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { return ChildNodeRet; } -ContextTrieNode &ContextTrieNode::moveToChildContext( - const LineLocation &CallSite, ContextTrieNode &&NodeToMove, - uint32_t ContextFramesToRemove, bool DeleteNode) { +ContextTrieNode & +SampleContextTracker::moveContextSamples(ContextTrieNode &ToNodeParent, + const LineLocation &CallSite, + ContextTrieNode &&NodeToMove) { uint64_t Hash = FunctionSamples::getCallSiteHash(NodeToMove.getFuncName(), CallSite); + std::map &AllChildContext = + ToNodeParent.getAllChildContext(); assert(!AllChildContext.count(Hash) && "Node to remove must exist"); - LineLocation OldCallSite = NodeToMove.CallSiteLoc; - ContextTrieNode &OldParentContext = *NodeToMove.getParentContext(); AllChildContext[Hash] = NodeToMove; ContextTrieNode &NewNode = AllChildContext[Hash]; - NewNode.CallSiteLoc = CallSite; + NewNode.setCallSiteLoc(CallSite); // Walk through nodes in the moved the subtree, and update // FunctionSamples' context as for the context promotion. // We also need to set new parant link for all children. std::queue NodeToUpdate; - NewNode.setParentContext(this); + NewNode.setParentContext(&ToNodeParent); NodeToUpdate.push(&NewNode); while (!NodeToUpdate.empty()) { @@ -88,10 +89,8 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( FunctionSamples *FSamples = Node->getFunctionSamples(); if (FSamples) { - FSamples->getContext().promoteOnPath(ContextFramesToRemove); + setContextNode(FSamples, Node); FSamples->getContext().setState(SyntheticContext); - LLVM_DEBUG(dbgs() << " Context promoted to: " - << FSamples->getContext().toString() << "\n"); } for (auto &It : Node->getAllChildContext()) { @@ -101,10 +100,6 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( } } - // Original context no longer needed, destroy if requested. - if (DeleteNode) - OldParentContext.removeChildContext(OldCallSite, NewNode.getFuncName()); - return NewNode; } @@ -148,6 +143,10 @@ void ContextTrieNode::setParentContext(ContextTrieNode *Parent) { ParentContext = Parent; } +void ContextTrieNode::setCallSiteLoc(const LineLocation &Loc) { + CallSiteLoc = Loc; +} + void ContextTrieNode::dumpNode() { dbgs() << "Node: " << FuncName << "\n" << " Callsite: " << CallSiteLoc << "\n" @@ -203,13 +202,23 @@ SampleContextTracker::SampleContextTracker( SampleContext Context = FuncSample.first; LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context.toString() << "\n"); - if (!Context.isBaseContext()) - FuncToCtxtProfiles[Context.getName()].insert(FSamples); ContextTrieNode *NewNode = getOrCreateContextPath(Context, true); assert(!NewNode->getFunctionSamples() && "New node can't have sample profile"); NewNode->setFunctionSamples(FSamples); } + populateFuncToCtxtMap(); +} + +void SampleContextTracker::populateFuncToCtxtMap() { + for (auto *Node : *this) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + FSamples->getContext().setState(RawContext); + setContextNode(FSamples, Node); + FuncToCtxtProfiles[Node->getFuncName()].push_back(FSamples); + } + } } FunctionSamples * @@ -232,7 +241,7 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, if (CalleeContext) { FunctionSamples *FSamples = CalleeContext->getFunctionSamples(); LLVM_DEBUG(if (FSamples) { - dbgs() << " Callee context found: " << FSamples->getContext().toString() + dbgs() << " Callee context found: " << getContextString(CalleeContext) << "\n"; }); return FSamples; @@ -334,7 +343,7 @@ FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name, if (Context.hasState(InlinedContext) || Context.hasState(MergedContext)) continue; - ContextTrieNode *FromNode = getContextFor(Context); + ContextTrieNode *FromNode = getContextNodeForProfile(CSamples); if (FromNode == Node) continue; @@ -355,7 +364,7 @@ void SampleContextTracker::markContextSamplesInlined( const FunctionSamples *InlinedSamples) { assert(InlinedSamples && "Expect non-null inlined samples"); LLVM_DEBUG(dbgs() << "Marking context profile as inlined: " - << InlinedSamples->getContext().toString() << "\n"); + << getContextString(*InlinedSamples) << "\n"); InlinedSamples->getContext().setState(InlinedContext); } @@ -407,15 +416,39 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( FunctionSamples *FromSamples = NodeToPromo.getFunctionSamples(); assert(FromSamples && "Shouldn't promote a context without profile"); LLVM_DEBUG(dbgs() << " Found context tree root to promote: " - << FromSamples->getContext().toString() << "\n"); + << getContextString(&NodeToPromo) << "\n"); assert(!FromSamples->getContext().hasState(InlinedContext) && "Shouldn't promote inlined context profile"); - uint32_t ContextFramesToRemove = - FromSamples->getContext().getContextFrames().size() - 1; - return promoteMergeContextSamplesTree(NodeToPromo, RootContext, - ContextFramesToRemove); + return promoteMergeContextSamplesTree(NodeToPromo, RootContext); +} + +#ifndef NDEBUG +std::string +SampleContextTracker::getContextString(const FunctionSamples &FSamples) const { + return getContextString(getContextNodeForProfile(&FSamples)); +} + +std::string +SampleContextTracker::getContextString(ContextTrieNode *Node) const { + SampleContextFrameVector Res; + if (Node == &RootContext) + return std::string(); + Res.emplace_back(Node->getFuncName(), LineLocation(0, 0)); + + ContextTrieNode *PreNode = Node; + Node = Node->getParentContext(); + while (Node && Node != &RootContext) { + Res.emplace_back(Node->getFuncName(), PreNode->getCallSiteLoc()); + PreNode = Node; + Node = Node->getParentContext(); + } + + std::reverse(Res.begin(), Res.end()); + + return SampleContext::getContextString(Res); } +#endif void SampleContextTracker::dump() { RootContext.dumpTree(); } @@ -527,8 +560,7 @@ ContextTrieNode &SampleContextTracker::addTopLevelContextNode(StringRef FName) { } void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode, - ContextTrieNode &ToNode, - uint32_t ContextFramesToRemove) { + ContextTrieNode &ToNode) { FunctionSamples *FromSamples = FromNode.getFunctionSamples(); FunctionSamples *ToSamples = ToNode.getFunctionSamples(); if (FromSamples && ToSamples) { @@ -541,16 +573,13 @@ void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode, } else if (FromSamples) { // Transfer FromSamples from FromNode to ToNode ToNode.setFunctionSamples(FromSamples); + setContextNode(FromSamples, &ToNode); FromSamples->getContext().setState(SyntheticContext); - FromSamples->getContext().promoteOnPath(ContextFramesToRemove); - FromNode.setFunctionSamples(nullptr); } } ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( - ContextTrieNode &FromNode, ContextTrieNode &ToNodeParent, - uint32_t ContextFramesToRemove) { - assert(ContextFramesToRemove && "Context to remove can't be empty"); + ContextTrieNode &FromNode, ContextTrieNode &ToNodeParent) { // Ignore call site location if destination is top level under root LineLocation NewCallSiteLoc = LineLocation(0, 0); @@ -567,22 +596,25 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( if (!ToNode) { // Do not delete node to move from its parent here because // caller is iterating over children of that parent node. - ToNode = &ToNodeParent.moveToChildContext( - NewCallSiteLoc, std::move(FromNode), ContextFramesToRemove, false); + ToNode = + &moveContextSamples(ToNodeParent, NewCallSiteLoc, std::move(FromNode)); + LLVM_DEBUG({ + dbgs() << " Context promoted and merged to: " << getContextString(ToNode) + << "\n"; + }); } else { // Destination node exists, merge samples for the context tree - mergeContextNode(FromNode, *ToNode, ContextFramesToRemove); + mergeContextNode(FromNode, *ToNode); LLVM_DEBUG({ if (ToNode->getFunctionSamples()) dbgs() << " Context promoted and merged to: " - << ToNode->getFunctionSamples()->getContext().toString() << "\n"; + << getContextString(ToNode) << "\n"; }); // Recursively promote and merge children for (auto &It : FromNode.getAllChildContext()) { ContextTrieNode &FromChildNode = It.second; - promoteMergeContextSamplesTree(FromChildNode, *ToNode, - ContextFramesToRemove); + promoteMergeContextSamplesTree(FromChildNode, *ToNode); } // Remove children once they're all merged @@ -598,21 +630,11 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree( void SampleContextTracker::createContextLessProfileMap( SampleProfileMap &ContextLessProfiles) { - std::queue NodeQueue; - NodeQueue.push(&RootContext); - - while (!NodeQueue.empty()) { - ContextTrieNode *Node = NodeQueue.front(); + for (auto *Node : *this) { FunctionSamples *FProfile = Node->getFunctionSamples(); - NodeQueue.pop(); - - if (FProfile) { - // Profile's context can be empty, use ContextNode's func name. + // Profile's context can be empty, use ContextNode's func name. + if (FProfile) ContextLessProfiles[Node->getFuncName()].merge(*FProfile); - } - - for (auto &It : Node->getAllChildContext()) - NodeQueue.push(&It.second); } } } // namespace llvm diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 8e4534723517c..40de69bbf2cf4 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1073,8 +1073,7 @@ void SampleProfileLoader::findExternalInlineCandidate( return; } - ContextTrieNode *Caller = - ContextTracker->getContextFor(Samples->getContext()); + ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples); std::queue CalleeList; CalleeList.push(Caller); while (!CalleeList.empty()) { diff --git a/llvm/tools/llvm-profgen/CSPreInliner.cpp b/llvm/tools/llvm-profgen/CSPreInliner.cpp index 5c7c3d5269d35..55d76c26db606 100644 --- a/llvm/tools/llvm-profgen/CSPreInliner.cpp +++ b/llvm/tools/llvm-profgen/CSPreInliner.cpp @@ -55,14 +55,13 @@ static cl::opt SamplePreInlineReplay( cl::desc( "Replay previous inlining and adjust context profile accordingly")); -CSPreInliner::CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary, - ProfileSummary *Summary) +CSPreInliner::CSPreInliner(SampleContextTracker &Tracker, + ProfiledBinary &Binary, ProfileSummary *Summary) : UseContextCost(UseContextCostForPreInliner), // TODO: Pass in a guid-to-name map in order for // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes // as their profile context. - ContextTracker(Profiles, nullptr), ProfileMap(Profiles), Binary(Binary), - Summary(Summary) { + ContextTracker(Tracker), Binary(Binary), Summary(Summary) { // Set default preinliner hot/cold call site threshold tuned with CSSPGO. // for good performance with reasonable profile size. if (!SampleHotCallSiteThreshold.getNumOccurrences()) @@ -107,7 +106,7 @@ bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, // current one in the trie is relavent. So we walk the trie instead of call // targets from function profile. ContextTrieNode *CallerNode = - ContextTracker.getContextFor(CallerSamples->getContext()); + ContextTracker.getContextNodeForProfile(CallerSamples); bool HasNewCandidate = false; for (auto &Child : CallerNode->getAllChildContext()) { @@ -131,7 +130,7 @@ bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, // TODO: call site and callee entry count should be mostly consistent, add // check for that. HasNewCandidate = true; - uint32_t CalleeSize = getFuncSize(*CalleeSamples); + uint32_t CalleeSize = getFuncSize(CalleeNode); CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), CalleeSize); } @@ -139,12 +138,11 @@ bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, return HasNewCandidate; } -uint32_t CSPreInliner::getFuncSize(const FunctionSamples &FSamples) { - if (UseContextCost) { - return Binary.getFuncSizeForContext(FSamples.getContext()); - } +uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) { + if (UseContextCost) + return Binary.getFuncSizeForContext(ContextNode); - return FSamples.getBodySamples().size(); + return ContextNode->getFunctionSamples()->getBodySamples().size(); } bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { @@ -189,7 +187,8 @@ void CSPreInliner::processFunction(const StringRef Name) { if (!FSamples) return; - unsigned FuncSize = getFuncSize(*FSamples); + unsigned FuncSize = + getFuncSize(ContextTracker.getContextNodeForProfile(FSamples)); unsigned FuncFinalSize = FuncSize; unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); @@ -218,11 +217,12 @@ void CSPreInliner::processFunction(const StringRef Name) { } else { ++PreInlNumCSNotInlined; } - LLVM_DEBUG(dbgs() << (ShouldInline ? " Inlined" : " Outlined") - << " context profile for: " - << Candidate.CalleeSamples->getContext().toString() - << " (callee size: " << Candidate.SizeCost - << ", call count:" << Candidate.CallsiteCount << ")\n"); + LLVM_DEBUG( + dbgs() << (ShouldInline ? " Inlined" : " Outlined") + << " context profile for: " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (callee size: " << Candidate.SizeCost + << ", call count:" << Candidate.CallsiteCount << ")\n"); } if (!CQueue.empty()) { @@ -246,7 +246,8 @@ void CSPreInliner::processFunction(const StringRef Name) { CQueue.pop(); bool WasInlined = Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); - dbgs() << " " << Candidate.CalleeSamples->getContext().toString() + dbgs() << " " + << ContextTracker.getContextString(*Candidate.CalleeSamples) << " (candidate size:" << Candidate.SizeCost << ", call count: " << Candidate.CallsiteCount << ", previously " << (WasInlined ? "inlined)\n" : "not inlined)\n"); @@ -256,19 +257,24 @@ void CSPreInliner::processFunction(const StringRef Name) { void CSPreInliner::run() { #ifndef NDEBUG - auto printProfileNames = [](SampleProfileMap &Profiles, bool IsInput) { - dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" - << Profiles.size() << " total):\n"; - for (auto &It : Profiles) { - const FunctionSamples &Samples = It.second; - dbgs() << " [" << Samples.getContext().toString() << "] " - << Samples.getTotalSamples() << ":" << Samples.getHeadSamples() - << "\n"; + auto printProfileNames = [](SampleContextTracker &ContextTracker, + bool IsInput) { + uint32_t Size = 0; + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + Size++; + dbgs() << " [" << ContextTracker.getContextString(Node) << "] " + << FSamples->getTotalSamples() << ":" + << FSamples->getHeadSamples() << "\n"; + } } + dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" + << Size << " total):\n"; }; #endif - LLVM_DEBUG(printProfileNames(ProfileMap, true)); + LLVM_DEBUG(printProfileNames(ContextTracker, true)); // Execute global pre-inliner to estimate a global top-down inline // decision and merge profiles accordingly. This helps with profile @@ -283,24 +289,15 @@ void CSPreInliner::run() { // Not inlined context profiles are merged into its base, so we can // trim out such profiles from the output. - std::vector ProfilesToBeRemoved; - for (auto &It : ProfileMap) { - SampleContext &Context = It.second.getContext(); - if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) { - assert(Context.hasState(MergedContext) && - "Not inlined context profile should be merged already"); - ProfilesToBeRemoved.push_back(It.first); + for (auto *Node : ContextTracker) { + FunctionSamples *FProfile = Node->getFunctionSamples(); + if (FProfile && + (Node->getParentContext() != &ContextTracker.getRootContext() && + !FProfile->getContext().hasState(InlinedContext))) { + Node->setFunctionSamples(nullptr); } } - - for (auto &ContextName : ProfilesToBeRemoved) { - ProfileMap.erase(ContextName); - } - - // Make sure ProfileMap's key is consistent with FunctionSamples' name. - SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles(); - FunctionSamples::ProfileIsPreInlined = true; - LLVM_DEBUG(printProfileNames(ProfileMap, false)); + LLVM_DEBUG(printProfileNames(ContextTracker, false)); } diff --git a/llvm/tools/llvm-profgen/CSPreInliner.h b/llvm/tools/llvm-profgen/CSPreInliner.h index 42771914ea3fb..09dd2dec1149c 100644 --- a/llvm/tools/llvm-profgen/CSPreInliner.h +++ b/llvm/tools/llvm-profgen/CSPreInliner.h @@ -67,7 +67,7 @@ using ProfiledCandidateQueue = // size by only keep context that is estimated to be inlined. class CSPreInliner { public: - CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary, + CSPreInliner(SampleContextTracker &Tracker, ProfiledBinary &Binary, ProfileSummary *Summary); void run(); @@ -77,10 +77,9 @@ class CSPreInliner { std::vector buildTopDownOrder(); void processFunction(StringRef Name); bool shouldInline(ProfiledInlineCandidate &Candidate); - uint32_t getFuncSize(const FunctionSamples &FSamples); + uint32_t getFuncSize(const ContextTrieNode *ContextNode); bool UseContextCost; - SampleContextTracker ContextTracker; - SampleProfileMap &ProfileMap; + SampleContextTracker &ContextTracker; ProfiledBinary &Binary; ProfileSummary *Summary; }; diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index 90df77171cb60..1d6a5246398ae 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -456,18 +456,10 @@ bool ProfileGenerator::collectFunctionsFromLLVMProfile( bool CSProfileGenerator::collectFunctionsFromLLVMProfile( std::unordered_set &ProfiledFunctions) { - std::queue NodeQueue; - NodeQueue.push(&getRootContext()); - while (!NodeQueue.empty()) { - ContextTrieNode *Node = NodeQueue.front(); - NodeQueue.pop(); - + for (auto *Node : ContextTracker) { if (!Node->getFuncName().empty()) if (auto *Func = Binary->getBinaryFunction(Node->getFuncName())) ProfiledFunctions.insert(Func); - - for (auto &It : Node->getAllChildContext()) - NodeQueue.push(&It.second); } return true; } @@ -806,22 +798,13 @@ void CSProfileGenerator::computeSizeForProfiledFunctions() { } void CSProfileGenerator::updateFunctionSamples() { - std::queue NodeQueue; - NodeQueue.push(&getRootContext()); - - while (!NodeQueue.empty()) { - ContextTrieNode *Node = NodeQueue.front(); - NodeQueue.pop(); - + for (auto *Node : ContextTracker) { FunctionSamples *FSamples = Node->getFunctionSamples(); if (FSamples) { if (UpdateTotalSamples) FSamples->updateTotalSamples(); FSamples->updateCallsiteSamples(); } - - for (auto &It : Node->getAllChildContext()) - NodeQueue.push(&It.second); } } @@ -999,17 +982,18 @@ void CSProfileGenerator::postProcessProfiles() { // context profile merging/trimming. computeSummaryAndThreshold(); - convertToProfileMap(); - // Run global pre-inliner to adjust/merge context profile based on estimated // inline decisions. if (EnableCSPreInliner) { - CSPreInliner(ProfileMap, *Binary, Summary.get()).run(); + ContextTracker.populateFuncToCtxtMap(); + CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); // Turn off the profile merger by default unless it is explicitly enabled. if (!CSProfMergeColdContext.getNumOccurrences()) CSProfMergeColdContext = false; } + convertToProfileMap(); + // Trim and merge cold context profile using cold threshold above. if (TrimColdProfile || CSProfMergeColdContext) { SampleContextTrimmer(ProfileMap) diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp index 3f51455b1802d..5c054d67b682f 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -83,25 +83,23 @@ void BinarySizeContextTracker::addInstructionForContext( } uint32_t -BinarySizeContextTracker::getFuncSizeForContext(const SampleContext &Context) { +BinarySizeContextTracker::getFuncSizeForContext(const ContextTrieNode *Node) { ContextTrieNode *CurrNode = &RootContext; ContextTrieNode *PrevNode = nullptr; - SampleContextFrames Frames = Context.getContextFrames(); - int32_t I = Frames.size() - 1; + Optional Size; // Start from top-level context-less function, traverse down the reverse // context trie to find the best/longest match for given context, then // retrieve the size. - - while (CurrNode && I >= 0) { - // Process from leaf function to callers (added to context). - const auto &ChildFrame = Frames[I--]; + LineLocation CallSiteLoc(0, 0); + while (CurrNode && Node->getParentContext() != nullptr) { PrevNode = CurrNode; - CurrNode = - CurrNode->getChildContext(ChildFrame.Location, ChildFrame.FuncName); + CurrNode = CurrNode->getChildContext(CallSiteLoc, Node->getFuncName()); if (CurrNode && CurrNode->getFunctionSize()) Size = CurrNode->getFunctionSize().getValue(); + CallSiteLoc = Node->getCallSiteLoc(); + Node = Node->getParentContext(); } // If we traversed all nodes along the path of the context and haven't diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h index ee585fc0abd63..c09931656d9fb 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -161,7 +161,7 @@ class BinarySizeContextTracker { // Get function size with a specific context. When there's no exact match // for the given context, try to retrieve the size of that function from // closest matching context. - uint32_t getFuncSizeForContext(const SampleContext &Context); + uint32_t getFuncSizeForContext(const ContextTrieNode *Context); // For inlinees that are full optimized away, we can establish zero size using // their remaining probes. @@ -485,8 +485,8 @@ class ProfiledBinary { return &I->second; } - uint32_t getFuncSizeForContext(SampleContext &Context) { - return FuncSizeTracker.getFuncSizeForContext(Context); + uint32_t getFuncSizeForContext(const ContextTrieNode *ContextNode) { + return FuncSizeTracker.getFuncSizeForContext(ContextNode); } // Load the symbols from debug table and populate into symbol list.