From 34146696cfcc765a212c5fb75fd3d0840761858a Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 11 Nov 2025 15:02:23 -0800
Subject: [PATCH 1/2] [MemProf] Fixup edges for largest N cold contexts

We build the callsite graph by first adding nodes and edges for all
allocation contexts, then match the interior callsite nodes onto actual
calls (IR or summary), which due to inlining may result in the
generation of new nodes representing the inlined context sequence. We
attempt to update the edges correctly during this process, but in the
case of recursion it is impossible to always get this right.
Specifically, when creating new inlined sequence nodes for stack ids on
recursive cycles, we can't always update the edges correctly, because
we have lost the original ordering of the context.

This PR introduces a mechanism, guarded by the -memprof-top-n-important=
flag, to keep track of extra information for the largest N cold
contexts. Another flag, -memprof-fixup-important (enabled by default),
performs more expensive fixup of the edges for those largest N cold
contexts, by saving and walking the original ordered list of stack ids
from the context.
---
 .../IPO/MemProfContextDisambiguation.cpp      | 266 ++++++++++++++++--
 llvm/test/ThinLTO/X86/memprof-fixup.ll        | 130 +++++++++
 .../MemProfContextDisambiguation/fixup.ll     | 106 +++++++
 3 files changed, 486 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/ThinLTO/X86/memprof-fixup.ll
 create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index d35ae4730a9f3..8c693801be2b6 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -107,6 +107,10 @@ STATISTIC(MismatchedCloneAssignments,
 STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
 STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
 STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
+STATISTIC(NumImportantContextIds, "Number of important context ids");
+STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
+STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
+STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");

 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -223,9 +227,18 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion(
 extern cl::opt<bool> MemProfReportHintedSizes;
 extern cl::opt<unsigned> MinClonedColdBytePercent;

+cl::opt<unsigned> MemProfTopNImportant(
+    "memprof-top-n-important", cl::init(0), cl::Hidden,
+    cl::desc("Number of largest cold contexts to consider important"));
+
+cl::opt<bool> MemProfFixupImportant(
+    "memprof-fixup-important", cl::init(true), cl::Hidden,
+    cl::desc("Enables edge fixup for important contexts"));
+
 } // namespace llvm

 namespace {
+
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
 ///
 /// The graph represents the call contexts in all memprof metadata on allocation
@@ -581,17 +594,26 @@ class CallsiteContextGraph {
   /// Adds nodes for the given MIB stack ids.
   template <class NodeT, class IteratorT>
-  void addStackNodesForMIB(ContextNode *AllocNode,
-                           CallStack<NodeT, IteratorT> &StackContext,
-                           CallStack<NodeT, IteratorT> &CallsiteContext,
-                           AllocationType AllocType,
-                           ArrayRef<ContextTotalSize> ContextSizeInfo);
+  void addStackNodesForMIB(
+      ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
+      CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
+      ArrayRef<ContextTotalSize> ContextSizeInfo,
+      std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);

   /// Matches all callsite metadata (or summary) to the nodes created for
   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
   /// inlining performed on those callsite instructions.
   void updateStackNodes();

+  /// Optionally fixup edges for the N largest cold contexts to better enable
+  /// cloning. This is particularly helpful if the context includes recursion
+  /// as well as inlining, resulting in a single stack node for multiple stack
+  /// ids in the context. With recursion it is particularly difficult to get
+  /// the edge updates correct, as in the general case we have lost the
+  /// original stack id ordering for the context. Do more expensive fixup for
+  /// the largest contexts, controlled by MemProfTopNImportant and
+  /// MemProfFixupImportant.
+  void fixupImportantContexts();
+
   /// Update graph to conservatively handle any callsite stack nodes that target
   /// multiple different callee target functions.
   void handleCallsitesWithMultipleTargets();
@@ -658,7 +680,8 @@ class CallsiteContextGraph {
   void assignStackNodesPostOrder(
       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
-      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
+      DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
+      const DenseSet<uint32_t> &ImportantContextIds);

   /// Duplicates the given set of context ids, updating the provided
   /// map from each original id with the newly generated context ids,
@@ -859,6 +882,50 @@ class CallsiteContextGraph {
   /// nodes.
   DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

+  /// Saves information for the contexts identified as important (the largest
+  /// cold contexts up to MemProfTopNImportant).
+  struct ImportantContextInfo {
+    // The original list of stack ids corresponding to this context.
+    std::vector<uint64_t> StackIds;
+    // Max length of stack ids corresponding to a single stack ContextNode for
+    // this context (i.e. the max length of a key in StackIdsToNode below).
+    unsigned MaxLength = 0;
+    // Mapping of slices of the stack ids to the corresponding ContextNode
+    // (there can be multiple stack ids due to inlining). Populated when
+    // updating stack nodes while matching them to the IR or summary.
+    std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
+  };
+
+  // Map of important full context ids to information about each.
+  DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
+
+  // For each important context id found in Node (if any), records the list of
+  // stack ids that corresponded to the given callsite Node. There can be more
+  // than one in the case of inlining.
+  void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
+                       // We pass in the Node's context ids to avoid the
+                       // overhead of computing them, as the caller already has
+                       // them in some cases.
+                       const DenseSet<uint32_t> &NodeContextIds,
+                       const DenseSet<uint32_t> &ImportantContextIds) {
+    if (!MemProfTopNImportant) {
+      assert(ImportantContextIds.empty());
+      return;
+    }
+    DenseSet<uint32_t> Ids =
+        set_intersection(NodeContextIds, ImportantContextIds);
+    if (Ids.empty())
+      return;
+    auto Size = StackIds.size();
+    for (auto Id : Ids) {
+      auto &Entry = ImportantContextIdInfo[Id];
+      Entry.StackIdsToNode[StackIds] = Node;
+      // Keep track of the max to simplify later analysis.
+      if (Size > Entry.MaxLength)
+        Entry.MaxLength = Size;
+    }
+  }
+
   /// Maps to track the calls to their corresponding nodes in the graph.
   MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
   MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
@@ -1353,7 +1420,8 @@ template <class NodeT, class IteratorT>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
-    ArrayRef<ContextTotalSize> ContextSizeInfo) {
+    ArrayRef<ContextTotalSize> ContextSizeInfo,
+    std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
   // Treating the hot alloc type as NotCold before the disambiguation for "hot"
   // is done.
   if (AllocType == AllocationType::Hot)
@@ -1361,8 +1429,34 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(

   ContextIdToAllocationType[++LastContextId] = AllocType;

+  bool IsImportant = false;
   if (!ContextSizeInfo.empty()) {
     auto &Entry = ContextIdToContextSizeInfos[LastContextId];
+    // If this is a cold allocation, and we are collecting non-zero largest
+    // contexts, see if this is a candidate.
+    if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
+      uint64_t TotalCold = 0;
+      for (auto &CSI : ContextSizeInfo) {
+        TotalCold += CSI.TotalSize;
+      }
+      // Record this context if either we haven't found the first top-n largest
+      // yet, or if it is larger than the smallest already recorded.
+      if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
+          // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
+          // sorted in ascending order of its key, which is the size.
+          TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
+        if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
+          // Remove the old one and its associated entries.
+          auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
+          TotalSizeToContextIdTopNCold.erase(
+              TotalSizeToContextIdTopNCold.begin());
+          assert(ImportantContextIdInfo.count(IdToRemove));
+          ImportantContextIdInfo.erase(IdToRemove);
+        }
+        TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
+        IsImportant = true;
+      }
+    }
     Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
   }
@@ -1381,6 +1475,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
        ContextIter != StackContext.end(); ++ContextIter) {
     auto StackId = getStackId(*ContextIter);
+    if (IsImportant)
+      ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
     ContextNode *StackNode = getNodeForStackId(StackId);
     if (!StackNode) {
       StackNode = createNewNode(/*IsAllocation=*/false);
@@ -1600,11 +1696,12 @@ static void checkNode(const ContextNode *Node,

 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
-    assignStackNodesPostOrder(
-        ContextNode *Node, DenseSet<const ContextNode *> &Visited,
-        DenseMap<uint64_t, std::vector<CallContextInfo>>
-            &StackIdToMatchingCalls,
-        DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
+    assignStackNodesPostOrder(ContextNode *Node,
+                              DenseSet<const ContextNode *> &Visited,
+                              DenseMap<uint64_t, std::vector<CallContextInfo>>
+                                  &StackIdToMatchingCalls,
+                              DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
+                              const DenseSet<uint32_t> &ImportantContextIds) {
   auto Inserted = Visited.insert(Node);
   if (!Inserted.second)
     return;
@@ -1620,7 +1717,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       continue;
     }
     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
-                              CallToMatchingCall);
+                              CallToMatchingCall, ImportantContextIds);
   }

   // If this node's stack id is in the map, update the graph to contain new
@@ -1648,6 +1745,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       Node->setCall(Call);
       NonAllocationCallToContextNodeMap[Call] = Node;
       NodeToCallingFunc[Node] = Func;
+      recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
       return;
     }
   }
@@ -1786,6 +1884,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
                            : CurNode->computeAllocType();
     PrevNode = CurNode;
   }
+
+  recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
+
   if (VerifyNodes) {
     checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
     for (auto Id : Ids) {
@@ -1798,6 +1899,125 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   }
 }

+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy,
+                          CallTy>::fixupImportantContexts() {
+  if (ImportantContextIdInfo.empty())
+    return;
+
+  // Update statistics as we are done building this map at this point.
+  NumImportantContextIds = ImportantContextIdInfo.size();
+
+  if (!MemProfFixupImportant)
+    return;
+
+  if (ExportToDot)
+    exportToDot("beforestackfixup");
+
+  // For each context we identified as important, walk through the saved
+  // context stack ids in order from the leaf upwards, and make sure all edges
+  // are correct. These can be difficult to get right when updating the graph
+  // while mapping nodes onto the summary or IR, especially when there is
+  // recursion. In particular, when we have created new nodes to reflect
+  // inlining, it is sometimes impossible to know exactly how to update the
+  // edges in the face of recursion, as we have lost the original ordering of
+  // the stack ids in the contexts.
+  // TODO: Consider only doing this if we detect the context has recursive
+  // cycles.
+  //
+  // I.e. assume we have a context with stack ids like: {A B A C A D E}
+  // and let's say A was inlined into B, C, and D. The original graph will have
+  // multiple recursive cycles through A. When we match the original context
+  // nodes onto the IR or summary, we will merge {A B} into one context node,
+  // {A C} onto another, and {A D} onto another. Looking at the stack sequence
+  // above, we should end up with a non-cyclic set of edges like:
+  // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
+  // original ordering, we won't get the edges correct initially (it's
+  // impossible without the original ordering). Here we do the fixup (adding
+  // and removing edges where necessary) for this context. In the
+  // ImportantContextInfo struct in this case we should have MaxLength = 2,
+  // and map entries for {A B}, {A C}, {A D}, and {E}.
+  for (auto &[CurId, Info] : ImportantContextIdInfo) {
+    if (Info.StackIdsToNode.empty())
+      continue;
+    bool Changed = false;
+    ContextNode *PrevNode = nullptr;
+    ContextNode *CurNode = nullptr;
+    DenseSet<const ContextEdge *> VisitedEdges;
+    ArrayRef<uint64_t> AllStackIds(Info.StackIds);
+    // Try to identify which callsite ContextNode maps to which slice of the
+    // context's ordered stack ids.
+    for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
+      // We do this greedily, trying up to MaxLength stack ids in a row, to
+      // see if we recorded a context node for that sequence.
+      auto Len = Info.MaxLength;
+      auto LenToEnd = AllStackIds.size() - I;
+      if (Len > LenToEnd)
+        Len = LenToEnd;
+      CurNode = nullptr;
+      // Try to find a recorded context node starting with the longest length
+      // recorded, and on down until we check for just a single stack node.
+      for (; Len > 0; Len--) {
+        // Get the slice of the original stack id sequence to check.
+        auto CheckStackIds = AllStackIds.slice(I, Len);
+        auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
+        if (EntryIt == Info.StackIdsToNode.end())
+          continue;
+        CurNode = EntryIt->second;
+        // Skip forward so we don't try to look for the ones we just matched.
+        // We increment by Len - 1, because the outer for loop will increment I.
+        I += Len - 1;
+        break;
+      }
+      // Give up if we couldn't find a node. Since we need to clone from the
+      // leaf allocation upwards, no sense in doing any more fixup further up
+      // the context if we couldn't match part of the original stack context
+      // onto a callsite node.
+      if (!CurNode)
+        break;
+      // No edges to fix up until we have a pair of nodes that should be
+      // adjacent in the graph.
+      if (!PrevNode)
+        continue;
+      // See if we already have a call edge from CurNode to PrevNode.
+      auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
+      if (CurEdge) {
+        // We already have an edge. Make sure it contains this context id.
+        if (!CurEdge->getContextIds().contains(CurId)) {
+          CurEdge->getContextIds().insert(CurId);
+          NumFixupEdgeIdsInserted++;
+          Changed = true;
+        }
+      } else {
+        // No edge exists - add one.
+        NumFixupEdgesAdded++;
+        DenseSet<uint32_t> ContextIds({CurId});
+        auto NewEdge = std::make_shared<ContextEdge>(
+            PrevNode, CurNode, computeAllocType(ContextIds), ContextIds);
+        PrevNode->CallerEdges.push_back(NewEdge);
+        CurNode->CalleeEdges.push_back(NewEdge);
+        // Save the new edge for the below handling.
+        CurEdge = NewEdge.get();
+        Changed = true;
+      }
+      VisitedEdges.insert(CurEdge);
+      // Now remove this context id from any other caller edges calling
+      // PrevNode.
+      for (auto &Edge : PrevNode->CallerEdges) {
+        // Skip the edge updated/created above, and edges we have already
+        // visited (due to recursion), or edges that we don't need to update
+        // because they don't contain the context id we're working on.
+        if (Edge.get() == CurEdge || VisitedEdges.contains(Edge.get()) ||
+            !Edge->getContextIds().contains(CurId))
+          continue;
+        Edge->getContextIds().erase(CurId);
+      }
+    }
+    if (Changed)
+      NumFixedContexts++;
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // Map of stack id to all calls with that as the last (outermost caller)
@@ -2043,9 +2263,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // nodes representing any inlining at interior callsites. Note we move the
   // associated context ids over to the new nodes.
   DenseSet<const ContextNode *> Visited;
+  DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
+                                         ImportantContextIdInfo.keys());
   for (auto &Entry : AllocationCallToContextNodeMap)
     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
-                              CallToMatchingCall);
+                              CallToMatchingCall, ImportantContextIds);
+
+  fixupImportantContexts();
+
   if (VerifyCCG)
     check();
 }
@@ -2155,6 +2380,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
     Module &M,
     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
     : Mod(M), OREGetter(OREGetter) {
+  // Map for keeping track of the largest cold contexts up to the number given
+  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
+  // must be sorted.
+  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
   for (auto &F : M) {
     std::vector<CallInfo> CallsWithMetadata;
     for (auto &BB : F) {
@@ -2191,7 +2420,8 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
           CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
           addStackNodesForMIB<MDNode, MDNode::op_iterator>(
               AllocNode, StackContext, CallsiteContext,
-              getMIBAllocType(MIBMD), ContextSizeInfo);
+              getMIBAllocType(MIBMD), ContextSizeInfo,
+              TotalSizeToContextIdTopNCold);
         }
         // If exporting the graph to dot and an allocation id of interest was
         // specified, record all the context ids for this allocation node.
@@ -2241,6 +2471,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing)
     : Index(Index), isPrevailing(isPrevailing) {
+  // Map for keeping track of the largest cold contexts up to the number given
+  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
+  // must be sorted.
+  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
   for (auto &I : Index) {
     auto VI = Index.getValueInfo(I);
     for (auto &S : VI.getSummaryList()) {
@@ -2288,7 +2522,7 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
         }
         addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
             AllocNode, StackContext, EmptyContext, MIB.AllocType,
-            ContextSizeInfo);
+            ContextSizeInfo, TotalSizeToContextIdTopNCold);
         I++;
       }
       // If exporting the graph to dot and an allocation id of interest was
diff --git a/llvm/test/ThinLTO/X86/memprof-fixup.ll b/llvm/test/ThinLTO/X86/memprof-fixup.ll
new file mode 100644
index 0000000000000..e22cc52a8d784
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-fixup.ll
@@ -0,0 +1,130 @@
+;; Test fixup of the largest cold contexts.
+
+;; This case has multiple recursive cycles in the cold context, which can be
+;; made non-recursive with the inlining in the code.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+;; Need context sizes in the summary, so enable reporting.
+; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
+
+;; First try the default, which does not enable any detection of the largest
+;; cold contexts. We will not get any cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,E,plx \
+; RUN:  -r=%t.o,DB,plx \
+; RUN:  -r=%t.o,CB,plx \
+; RUN:  -r=%t.o,A,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --implicit-check-not="created clone" \
+; RUN:  --implicit-check-not="Number of cold static allocations" \
+; RUN:  --implicit-check-not="Number of function clones" \
+; RUN:  --implicit-check-not="Number of important context ids" \
+; RUN:  --implicit-check-not="Number of fixup"
+
+;; Specify detection of the largest cold context, but disable fixup.
+;; We should find 1 important context, but still not get cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=1 \
+; RUN:  -memprof-fixup-important=false \
+; RUN:  -r=%t.o,E,plx \
+; RUN:  -r=%t.o,DB,plx \
+; RUN:  -r=%t.o,CB,plx \
+; RUN:  -r=%t.o,A,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:  --implicit-check-not="Number of cold static allocations" \
+; RUN:  --implicit-check-not="Number of function clones" \
+; RUN:  --implicit-check-not="Number of fixup"
+
+; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids
+
+;; Specify detection of the largest cold context; fixup is enabled by default.
+;; This case should get fixup and cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=1 \
+; RUN:  -r=%t.o,E,plx \
+; RUN:  -r=%t.o,DB,plx \
+; RUN:  -r=%t.o,CB,plx \
+; RUN:  -r=%t.o,A,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1
+
+; TOPN1: created clone E.memprof.1
+; TOPN1: call in clone E marked with memprof allocation attribute notcold
+; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold
+; TOPN1: created clone DB.memprof.1
+; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1
+; TOPN1: created clone CB.memprof.1
+; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1
+; TOPN1: created clone A.memprof.1
+; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1
+; TOPN1: call in clone main assigned to call function clone A.memprof.1
+
+; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges
+; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added
+; TOPN1: 1 memprof-context-disambiguation - Number of important context ids
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @E() {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14
+  ret void
+}
+
+define void @DB() {
+entry:
+  tail call void @E(), !callsite !17
+  ret void
+}
+
+define void @CB() {
+entry:
+  tail call void @DB(), !callsite !22
+  ret void
+}
+
+define void @A() {
+entry:
+  tail call void @CB(), !callsite !20
+  ret void
+}
+
+define i32 @main() {
+entry:
+  tail call void @A(), !callsite !25
+  tail call void @A(), !callsite !27
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!7 = !{!8, !10}
+!8 = !{!9, !"cold", !2}
+!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678}
+!2 = !{i64 12345, i64 200}
+!10 = !{!11, !"notcold", !3}
+!3 = !{i64 23456, i64 200}
+!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789}
+!14 = !{i64 123}
+!17 = !{i64 234, i64 345}
+!22 = !{i64 234, i64 456}
+!20 = !{i64 234, i64 567}
+!25 = !{i64 678}
+!27 = !{i64 789}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
new file mode 100644
index 0000000000000..8a5d0368f16db
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
@@ -0,0 +1,106 @@
+;; Test fixup of the largest cold contexts.
+
+;; This case has multiple recursive cycles in the cold context, which can be
+;; made non-recursive with the inlining in the code.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+;; First try the default, which does not enable any detection of the largest
+;; cold contexts. We will not get any cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --implicit-check-not="created clone" \
+; RUN:  --implicit-check-not="Number of cold static allocations" \
+; RUN:  --implicit-check-not="Number of function clones" \
+; RUN:  --implicit-check-not="Number of important context ids" \
+; RUN:  --implicit-check-not="Number of fixup"
+
+;; Specify detection of the largest cold context, but disable fixup.
+;; We should find 1 important context, but still not get cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=1 \
+; RUN:  -memprof-fixup-important=false \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:  --implicit-check-not="Number of cold static allocations" \
+; RUN:  --implicit-check-not="Number of function clones" \
+; RUN:  --implicit-check-not="Number of fixup"
+
+; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids
+
+;; Specify detection of the largest cold context; fixup is enabled by default.
+;; This case should get fixup and cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=1 \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1
+
+; TOPN1: created clone E.memprof.1
+; TOPN1: created clone DB.memprof.1
+; TOPN1: created clone CB.memprof.1
+; TOPN1: created clone A.memprof.1
+; TOPN1: call in clone main assigned to call function clone A.memprof.1
+; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1
+; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1
+; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1
+; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold
+; TOPN1: call in clone E marked with memprof allocation attribute notcold
+
+; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges
+; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added
+; TOPN1: 1 memprof-context-disambiguation - Number of important context ids
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @E() {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14
+  ret void
+}
+
+define void @DB() {
+entry:
+  tail call void @E(), !callsite !17
+  ret void
+}
+
+define void @CB() {
+entry:
+  tail call void @DB(), !callsite !22
+  ret void
+}
+
+define void @A() {
+entry:
+  tail call void @CB(), !callsite !20
+  ret void
+}
+
+define i32 @main() {
+entry:
+  tail call void @A(), !callsite !25
+  tail call void @A(), !callsite !27
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!7 = !{!8, !10}
+!8 = !{!9, !"cold", !2}
+!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678}
+!2 = !{i64 12345, i64 200}
+!10 = !{!11, !"notcold", !3}
+!3 = !{i64 23456, i64 200}
+!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789}
+!14 = !{i64 123}
+!17 = !{i64 234, i64 345}
+!22 = !{i64 234, i64 456}
+!20 = !{i64 234, i64 567}
+!25 = !{i64 678}
+!27 = !{i64 789}

From e956921fed64b0fd4bf7a101bd559975c7d2b514 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Mon, 17 Nov 2025 14:25:44 -0800
Subject: [PATCH 2/2] Address comments and change option default

---
 .../IPO/MemProfContextDisambiguation.cpp      | 30 ++++++++----------
 llvm/test/ThinLTO/X86/memprof-fixup.ll        | 11 ++++---
 .../MemProfContextDisambiguation/fixup.ll     | 11 ++++---
 3 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 8c693801be2b6..0f4bc649df720 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -228,7 +228,7 @@ extern cl::opt<bool> MemProfReportHintedSizes;
 extern cl::opt<unsigned> MinClonedColdBytePercent;

 cl::opt<unsigned> MemProfTopNImportant(
-    "memprof-top-n-important", cl::init(0), cl::Hidden,
+    "memprof-top-n-important", cl::init(10), cl::Hidden,
     cl::desc("Number of largest cold contexts to consider important"));

 cl::opt<bool> MemProfFixupImportant(
@@ -885,7 +885,7 @@ class CallsiteContextGraph {
   /// Saves information for the contexts identified as important (the largest
   /// cold contexts up to MemProfTopNImportant).
   struct ImportantContextInfo {
-    // The original list of stack ids corresponding to this context.
+    // The original list of leaf-first stack ids corresponding to this context.
     std::vector<uint64_t> StackIds;
     // Max length of stack ids corresponding to a single stack ContextNode for
@@ -1436,9 +1436,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
     // contexts, see if this is a candidate.
     if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
       uint64_t TotalCold = 0;
-      for (auto &CSI : ContextSizeInfo) {
+      for (auto &CSI : ContextSizeInfo)
         TotalCold += CSI.TotalSize;
-      }
       // Record this context if either we haven't found the first top-n largest
       // yet, or if it is larger than the smallest already recorded.
       if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
@@ -1937,7 +1936,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::fixupImportantContexts()
       auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
       if (CurEdge) {
         // We already have an edge. Make sure it contains this context id.
-        if (!CurEdge->getContextIds().contains(CurId)) {
-          CurEdge->getContextIds().insert(CurId);
+        if (CurEdge->getContextIds().insert(CurContextId).second) {
           NumFixupEdgeIdsInserted++;
           Changed = true;
         }
       } else {
         // No edge exists - add one.
         NumFixupEdgesAdded++;
-        DenseSet<uint32_t> ContextIds({CurId});
+        DenseSet<uint32_t> ContextIds({CurContextId});
+        auto AllocType = computeAllocType(ContextIds);
         auto NewEdge = std::make_shared<ContextEdge>(
-            PrevNode, CurNode, computeAllocType(ContextIds), ContextIds);
+            PrevNode, CurNode, AllocType, std::move(ContextIds));
         PrevNode->CallerEdges.push_back(NewEdge);
         CurNode->CalleeEdges.push_back(NewEdge);
         // Save the new edge for the below handling.
@@ -2004,13 +2003,10 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::fixupImportantContexts()
       // Now remove this context id from any other caller edges calling
       // PrevNode.
       for (auto &Edge : PrevNode->CallerEdges) {
-        // Skip the edge updated/created above, and edges we have already
-        // visited (due to recursion), or edges that we don't need to update
-        // because they don't contain the context id we're working on.
-        if (Edge.get() == CurEdge || VisitedEdges.contains(Edge.get()) ||
-            !Edge->getContextIds().contains(CurId))
-          continue;
-        Edge->getContextIds().erase(CurId);
+        // Skip the edge updated/created above and edges we have already
+        // visited (due to recursion).
+        if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
+          Edge->getContextIds().erase(CurContextId);
       }
     }
     if (Changed)
diff --git a/llvm/test/ThinLTO/X86/memprof-fixup.ll b/llvm/test/ThinLTO/X86/memprof-fixup.ll
index e22cc52a8d784..afed80fc562c1 100644
--- a/llvm/test/ThinLTO/X86/memprof-fixup.ll
+++ b/llvm/test/ThinLTO/X86/memprof-fixup.ll
@@ -9,10 +9,11 @@
 ;; Need context sizes in the summary, so enable reporting.
 ; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o

-;; First try the default, which does not enable any detection of the largest
-;; cold contexts. We will not get any cloning.
+;; First try disabling detection of the largest cold contexts.
+;; We will not get any cloning.
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=0 \
 ; RUN:  -r=%t.o,E,plx \
 ; RUN:  -r=%t.o,DB,plx \
 ; RUN:  -r=%t.o,CB,plx \
@@ -27,11 +28,10 @@
 ; RUN:  --implicit-check-not="Number of important context ids" \
 ; RUN:  --implicit-check-not="Number of fixup"

-;; Specify detection of the largest cold context, but disable fixup.
+;; Allow default detection of the largest cold contexts, but disable fixup.
 ;; We should find 1 important context, but still not get cloning.
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
-; RUN:  -memprof-top-n-important=1 \
 ; RUN:  -memprof-fixup-important=false \
 ; RUN:  -r=%t.o,E,plx \
 ; RUN:  -r=%t.o,DB,plx \
@@ -49,11 +49,10 @@
 ; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids

-;; Specify detection of the largest cold context; fixup is enabled by default.
+;; Allow default detection of the largest cold contexts; fixup is enabled by default.
 ;; This case should get fixup and cloning.
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
-; RUN:  -memprof-top-n-important=1 \
 ; RUN:  -r=%t.o,E,plx \
 ; RUN:  -r=%t.o,DB,plx \
 ; RUN:  -r=%t.o,CB,plx \
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
index 8a5d0368f16db..a08f89b5bbe97 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
@@ -6,9 +6,10 @@
 ;; -stats requires asserts
 ; REQUIRES: asserts

-;; First try the default, which does not enable any detection of the largest
-;; cold contexts. We will not get any cloning.
+;; First try disabling detection of the largest cold contexts.
+;; We will not get any cloning.
 ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=0 \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
 ; RUN:  %s -S 2>&1 | FileCheck %s --implicit-check-not="created clone" \
@@ -17,10 +18,9 @@
 ; RUN:  --implicit-check-not="Number of important context ids" \
 ; RUN:  --implicit-check-not="Number of fixup"

-;; Specify detection of the largest cold context, but disable fixup.
+;; Allow default detection of the largest cold contexts, but disable fixup.
 ;; We should find 1 important context, but still not get cloning.
 ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
-; RUN:  -memprof-top-n-important=1 \
 ; RUN:  -memprof-fixup-important=false \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
 ; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \
@@ -32,10 +32,9 @@
 ; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids

-;; Specify detection of the largest cold context; fixup is enabled by default.
+;; Allow default detection of the largest cold contexts; fixup is enabled by default.
 ;; This case should get fixup and cloning.
 ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
-; RUN:  -memprof-top-n-important=1 \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
 ; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1
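
---

Note for readers (not part of the patches): below is a minimal standalone
sketch of the greedy longest-match walk that fixupImportantContexts() performs
over an important context's saved stack ids. The Context values, the recorded
slices, and main() are hypothetical stand-ins for the patch's ContextNode
bookkeeping; only the matching loop mirrors the patch's logic.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      using StackIds = std::vector<uint64_t>;
      // Stack ids A=1 B=2 C=3 D=4 E=5; the context {A B A C A D E} from the
      // patch's worked example, where A was inlined into B, C, and D.
      StackIds Context = {1, 2, 1, 3, 1, 4, 5};
      // Slices recorded while matching nodes onto the IR/summary, giving
      // MaxLength = 2 as in the patch's comment.
      std::map<StackIds, std::string> StackIdsToNode = {
          {{1, 2}, "{AB}"}, {{1, 3}, "{AC}"}, {{1, 4}, "{AD}"}, {{5}, "E"}};
      unsigned MaxLength = 2;

      // Walk leaf-first, preferring the longest recorded slice at each
      // position; consecutive matches are the node pairs whose edges the
      // patch fixes up ({AB} <- {AC} <- {AD} <- E).
      for (unsigned I = 0; I < Context.size(); I++) {
        unsigned Max = std::min<size_t>(MaxLength, Context.size() - I);
        for (unsigned Len = Max; Len > 0; Len--) {
          StackIds Slice(Context.begin() + I, Context.begin() + I + Len);
          auto It = StackIdsToNode.find(Slice);
          if (It == StackIdsToNode.end())
            continue;
          std::cout << It->second << "\n"; // prints {AB} {AC} {AD} E
          I += Len - 1; // skip past the ids we just matched
          break;
        }
      }
    }

Run as-is, this prints the matched nodes leaf-first, which is the order the
pass needs them in so that cloning can proceed from the allocation upwards.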