From 34146696cfcc765a212c5fb75fd3d0840761858a Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 11 Nov 2025 15:02:23 -0800
Subject: [PATCH 1/2] [MemProf] Fixup edges for largest N cold contexts

We build the callsite graph by first adding nodes and edges for all
allocation contexts, then match the interior callsite nodes onto actual
calls (IR or summary), which due to inlining may result in the
generation of new nodes representing the inlined context sequence. We
attempt to update the edges correctly during this process, but in the
case of recursion it is impossible to always get this right.
Specifically, when creating new inlined sequence nodes for stack ids on
recursive cycles, we can't always update the edges correctly, because
we have lost the original ordering of the context.

This PR introduces a mechanism, guarded by the -memprof-top-n-important=
flag, to keep track of extra information for the largest N cold
contexts. Another flag, -memprof-fixup-important (enabled by default),
performs more expensive fixup of the edges for those largest N cold
contexts, by saving and walking the original ordered list of stack ids
from the context.
---
 .../IPO/MemProfContextDisambiguation.cpp      | 266 ++++++++++++++++--
 llvm/test/ThinLTO/X86/memprof-fixup.ll        | 130 +++++++++
 .../MemProfContextDisambiguation/fixup.ll     | 106 +++++++
 3 files changed, 486 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/ThinLTO/X86/memprof-fixup.ll
 create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index d35ae4730a9f3..8c693801be2b6 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -107,6 +107,10 @@ STATISTIC(MismatchedCloneAssignments,
 STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
 STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
 STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
+STATISTIC(NumImportantContextIds, "Number of important context ids");
+STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
+STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
+STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");

 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -223,9 +227,18 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion(
 extern cl::opt<bool> MemProfReportHintedSizes;
 extern cl::opt<unsigned> MinClonedColdBytePercent;

+cl::opt<unsigned> MemProfTopNImportant(
+    "memprof-top-n-important", cl::init(0), cl::Hidden,
+    cl::desc("Number of largest cold contexts to consider important"));
+
+cl::opt<bool> MemProfFixupImportant(
+    "memprof-fixup-important", cl::init(true), cl::Hidden,
+    cl::desc("Enables edge fixup for important contexts"));
+
 } // namespace llvm

 namespace {
+
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
 ///
 /// The graph represents the call contexts in all memprof metadata on allocation
@@ -581,17 +594,26 @@ class CallsiteContextGraph {
   /// Adds nodes for the given MIB stack ids.
   template <class NodeT, class IteratorT>
-  void addStackNodesForMIB(ContextNode *AllocNode,
-                           CallStack<NodeT, IteratorT> &StackContext,
-                           CallStack<NodeT, IteratorT> &CallsiteContext,
-                           AllocationType AllocType,
-                           ArrayRef<ContextTotalSize> ContextSizeInfo);
+  void addStackNodesForMIB(
+      ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
+      CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
+      ArrayRef<ContextTotalSize> ContextSizeInfo,
+      std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);

   /// Matches all callsite metadata (or summary) to the nodes created for
   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
   /// inlining performed on those callsite instructions.
   void updateStackNodes();

+  /// Optionally fixup edges for the N largest cold contexts to better enable
+  /// cloning. This is particularly helpful if the context includes recursion
+  /// as well as inlining, resulting in a single stack node for multiple stack
+  /// ids in the context. With recursion it is particularly difficult to get
+  /// the edge updates correct, as in the general case we have lost the
+  /// original stack id ordering for the context. Do more expensive fixup for
+  /// the largest contexts, controlled by MemProfTopNImportant and
+  /// MemProfFixupImportant.
+  void fixupImportantContexts();
+
   /// Update graph to conservatively handle any callsite stack nodes that target
   /// multiple different callee target functions.
   void handleCallsitesWithMultipleTargets();
@@ -658,7 +680,8 @@ class CallsiteContextGraph {
   void assignStackNodesPostOrder(
       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
-      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
+      DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
+      const DenseSet<uint32_t> &ImportantContextIds);

   /// Duplicates the given set of context ids, updating the provided
   /// map from each original id with the newly generated context ids,
@@ -859,6 +882,50 @@ class CallsiteContextGraph {
   /// nodes.
   DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

+  /// Saves information for the contexts identified as important (the largest
+  /// cold contexts up to MemProfTopNImportant).
+  struct ImportantContextInfo {
+    // The original list of stack ids corresponding to this context.
+    std::vector<uint64_t> StackIds;
+    // Max length of stack ids corresponding to a single stack ContextNode for
+    // this context (i.e. the max length of a key in StackIdsToNode below).
+    unsigned MaxLength = 0;
+    // Mapping of slices of the stack ids to the corresponding ContextNode
+    // (there can be multiple stack ids due to inlining). Populated when
+    // updating stack nodes while matching them to the IR or summary.
+    std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
+  };
+
+  // Map of important full context ids to information about each.
+  DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
+
+  // For each important context id found in Node (if any), records the list of
+  // stack ids that corresponded to the given callsite Node. There can be more
+  // than one in the case of inlining.
+  void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
+                       // We pass in the Node's context ids to avoid the
+                       // overhead of computing them, as the caller already has
+                       // them in some cases.
+                       const DenseSet<uint32_t> &NodeContextIds,
+                       const DenseSet<uint32_t> &ImportantContextIds) {
+    if (!MemProfTopNImportant) {
+      assert(ImportantContextIds.empty());
+      return;
+    }
+    DenseSet<uint32_t> Ids =
+        set_intersection(NodeContextIds, ImportantContextIds);
+    if (Ids.empty())
+      return;
+    auto Size = StackIds.size();
+    for (auto Id : Ids) {
+      auto &Entry = ImportantContextIdInfo[Id];
+      Entry.StackIdsToNode[StackIds] = Node;
+      // Keep track of the max to simplify later analysis.
+      if (Size > Entry.MaxLength)
+        Entry.MaxLength = Size;
+    }
+  }
+
   /// Maps to track the calls to their corresponding nodes in the graph.
   MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
   MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
@@ -1353,7 +1420,8 @@ template <class NodeT, class IteratorT>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
-    ArrayRef<ContextTotalSize> ContextSizeInfo) {
+    ArrayRef<ContextTotalSize> ContextSizeInfo,
+    std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
   // Treating the hot alloc type as NotCold before the disambiguation for "hot"
   // is done.
   if (AllocType == AllocationType::Hot)
@@ -1361,8 +1429,34 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(

   ContextIdToAllocationType[++LastContextId] = AllocType;

+  bool IsImportant = false;
   if (!ContextSizeInfo.empty()) {
     auto &Entry = ContextIdToContextSizeInfos[LastContextId];
+    // If this is a cold allocation, and we are collecting non-zero largest
+    // contexts, see if this is a candidate.
+    if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
+      uint64_t TotalCold = 0;
+      for (auto &CSI : ContextSizeInfo) {
+        TotalCold += CSI.TotalSize;
+      }
+      // Record this context if either we haven't found the first top-n largest
+      // yet, or if it is larger than the smallest already recorded.
+      if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
+          // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
+          // sorted in ascending order of its key, which is the size.
+          TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
+        if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
+          // Remove the old one and its associated entries.
+          auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
+          TotalSizeToContextIdTopNCold.erase(
+              TotalSizeToContextIdTopNCold.begin());
+          assert(ImportantContextIdInfo.count(IdToRemove));
+          ImportantContextIdInfo.erase(IdToRemove);
+        }
+        TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
+        IsImportant = true;
+      }
+    }
     Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
   }
@@ -1381,6 +1475,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
        ContextIter != StackContext.end(); ++ContextIter) {
     auto StackId = getStackId(*ContextIter);
+    if (IsImportant)
+      ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
     ContextNode *StackNode = getNodeForStackId(StackId);
     if (!StackNode) {
       StackNode = createNewNode(/*IsAllocation=*/false);
@@ -1600,11 +1696,12 @@ static void checkNode(const ContextNode *Node,

 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
-    assignStackNodesPostOrder(
-        ContextNode *Node, DenseSet<const ContextNode *> &Visited,
-        DenseMap<uint64_t, std::vector<CallContextInfo>>
-            &StackIdToMatchingCalls,
-        DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
+    assignStackNodesPostOrder(ContextNode *Node,
+                              DenseSet<const ContextNode *> &Visited,
+                              DenseMap<uint64_t, std::vector<CallContextInfo>>
+                                  &StackIdToMatchingCalls,
+                              DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
+                              const DenseSet<uint32_t> &ImportantContextIds) {
   auto Inserted = Visited.insert(Node);
   if (!Inserted.second)
     return;
@@ -1620,7 +1717,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       continue;
     }
     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
-                              CallToMatchingCall);
+                              CallToMatchingCall, ImportantContextIds);
   }

   // If this node's stack id is in the map, update the graph to contain new
@@ -1648,6 +1745,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       Node->setCall(Call);
       NonAllocationCallToContextNodeMap[Call] = Node;
       NodeToCallingFunc[Node] = Func;
+      recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
       return;
     }
   }
@@ -1786,6 +1884,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
                            : CurNode->computeAllocType();
     PrevNode = CurNode;
   }
+
+  recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
+
   if (VerifyNodes) {
     checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
     for (auto Id : Ids) {
@@ -1798,6 +1899,125 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   }
 }

+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy,
+                          CallTy>::fixupImportantContexts() {
+  if (ImportantContextIdInfo.empty())
+    return;
+
+  // Update statistics as we are done building this map at this point.
+  NumImportantContextIds = ImportantContextIdInfo.size();
+
+  if (!MemProfFixupImportant)
+    return;
+
+  if (ExportToDot)
+    exportToDot("beforestackfixup");
+
+  // For each context we identified as important, walk through the saved
+  // context stack ids in order from the leaf upwards, and make sure all edges
+  // are correct. These can be difficult to get right when updating the graph
+  // while mapping nodes onto the summary or IR, especially when there is
+  // recursion. In particular, when we have created new nodes to reflect
+  // inlining, it is sometimes impossible to know exactly how to update the
+  // edges in the face of recursion, as we have lost the original ordering of
+  // the stack ids in the contexts.
+  // TODO: Consider only doing this if we detect the context has recursive
+  // cycles.
+  //
+  // I.e. assume we have a context with stack ids like: {A B A C A D E}
+  // and let's say A was inlined into B, C, and D. The original graph will have
+  // multiple recursive cycles through A. When we match the original context
+  // nodes onto the IR or summary, we will merge {A B} into one context node,
+  // {A C} onto another, and {A D} onto another. Looking at the stack sequence
+  // above, we should end up with a non-cyclic set of edges like:
+  // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
+  // original ordering, we won't get the edges correct initially (it's
+  // impossible without the original ordering). Here we do the fixup (adding
+  // and removing edges where necessary) for this context. In the
+  // ImportantContextInfo struct in this case we should have MaxLength = 2,
+  // and map entries for {A B}, {A C}, {A D}, and {E}.
+  for (auto &[CurId, Info] : ImportantContextIdInfo) {
+    if (Info.StackIdsToNode.empty())
+      continue;
+    bool Changed = false;
+    ContextNode *PrevNode = nullptr;
+    ContextNode *CurNode = nullptr;
+    DenseSet<const ContextEdge *> VisitedEdges;
+    ArrayRef<uint64_t> AllStackIds(Info.StackIds);
+    // Try to identify which callsite ContextNode maps to which slice of the
+    // context's ordered stack ids.
+    for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
+      // We do this greedily, trying up to MaxLength stack ids in a row, to
+      // see if we recorded a context node for that sequence.
+      auto Len = Info.MaxLength;
+      auto LenToEnd = AllStackIds.size() - I;
+      if (Len > LenToEnd)
+        Len = LenToEnd;
+      CurNode = nullptr;
+      // Try to find a recorded context node starting with the longest length
+      // recorded, and on down until we check for just a single stack node.
+      for (; Len > 0; Len--) {
+        // Get the slice of the original stack id sequence to check.
+        auto CheckStackIds = AllStackIds.slice(I, Len);
+        auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
+        if (EntryIt == Info.StackIdsToNode.end())
+          continue;
+        CurNode = EntryIt->second;
+        // Skip forward so we don't try to look for the ones we just matched.
+        // We increment by Len - 1, because the outer for loop will increment I.
+        I += Len - 1;
+        break;
+      }
+      // Give up if we couldn't find a node. Since we need to clone from the
+      // leaf allocation upwards, no sense in doing any more fixup further up
+      // the context if we couldn't match part of the original stack context
+      // onto a callsite node.
+      if (!CurNode)
+        break;
+      // No edges to fix up until we have a pair of nodes that should be
+      // adjacent in the graph.
+      if (!PrevNode)
+        continue;
+      // See if we already have a call edge from CurNode to PrevNode.
+      auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
+      if (CurEdge) {
+        // We already have an edge. Make sure it contains this context id.
+        if (!CurEdge->getContextIds().contains(CurId)) {
+          CurEdge->getContextIds().insert(CurId);
+          NumFixupEdgeIdsInserted++;
+          Changed = true;
+        }
+      } else {
+        // No edge exists - add one.
+        NumFixupEdgesAdded++;
+        DenseSet<uint32_t> ContextIds({CurId});
+        auto NewEdge = std::make_shared<ContextEdge>(
+            PrevNode, CurNode, computeAllocType(ContextIds), ContextIds);
+        PrevNode->CallerEdges.push_back(NewEdge);
+        CurNode->CalleeEdges.push_back(NewEdge);
+        // Save the new edge for the below handling.
+        CurEdge = NewEdge.get();
+        Changed = true;
+      }
+      VisitedEdges.insert(CurEdge);
+      // Now remove this context id from any other caller edges calling
+      // PrevNode.
+      for (auto &Edge : PrevNode->CallerEdges) {
+        // Skip the edge updated/created above, and edges we have already
+        // visited (due to recursion), or edges that we don't need to update
+        // because they don't contain the context id we're working on.
+        if (Edge.get() == CurEdge || VisitedEdges.contains(Edge.get()) ||
+            !Edge->getContextIds().contains(CurId))
+          continue;
+        Edge->getContextIds().erase(CurId);
+      }
+    }
+    if (Changed)
+      NumFixedContexts++;
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // Map of stack id to all calls with that as the last (outermost caller)
@@ -2043,9 +2263,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // nodes representing any inlining at interior callsites. Note we move the
   // associated context ids over to the new nodes.
   DenseSet<const ContextNode *> Visited;
+  DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
+                                         ImportantContextIdInfo.keys());
   for (auto &Entry : AllocationCallToContextNodeMap)
     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
-                              CallToMatchingCall);
+                              CallToMatchingCall, ImportantContextIds);
+
+  fixupImportantContexts();
+
   if (VerifyCCG)
     check();
 }
@@ -2155,6 +2380,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
     Module &M,
     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
     : Mod(M), OREGetter(OREGetter) {
+  // Map for keeping track of the largest cold contexts up to the number given
+  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
+  // must be sorted.
+  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
   for (auto &F : M) {
     std::vector<CallInfo> CallsWithMetadata;
     for (auto &BB : F) {
@@ -2191,7 +2420,8 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
           CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
           addStackNodesForMIB<MDNode, MDNode::op_iterator>(
               AllocNode, StackContext, CallsiteContext,
-              getMIBAllocType(MIBMD), ContextSizeInfo);
+              getMIBAllocType(MIBMD), ContextSizeInfo,
+              TotalSizeToContextIdTopNCold);
         }
         // If exporting the graph to dot and an allocation id of interest was
         // specified, record all the context ids for this allocation node.
@@ -2241,6 +2471,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing)
     : Index(Index), isPrevailing(isPrevailing) {
+  // Map for keeping track of the largest cold contexts up to the number given
+  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
+  // must be sorted.
+  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
   for (auto &I : Index) {
     auto VI = Index.getValueInfo(I);
     for (auto &S : VI.getSummaryList()) {
@@ -2288,7 +2522,7 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
         }
         addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
             AllocNode, StackContext, EmptyContext, MIB.AllocType,
-            ContextSizeInfo);
+            ContextSizeInfo, TotalSizeToContextIdTopNCold);
         I++;
       }
       // If exporting the graph to dot and an allocation id of interest was
diff --git a/llvm/test/ThinLTO/X86/memprof-fixup.ll b/llvm/test/ThinLTO/X86/memprof-fixup.ll
new file mode 100644
index 0000000000000..e22cc52a8d784
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-fixup.ll
@@ -0,0 +1,130 @@
+;; Test fixup of the largest cold contexts.
+
+;; This case has multiple recursive cycles in the cold context, which can be
+;; made non-recursive with the inlining in the code.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+;; Need context sizes in the summary, so enable reporting.
+; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
+
+;; First try the default, which does not enable any detection of the largest
+;; cold contexts. We will not get any cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,E,plx \
+; RUN:  -r=%t.o,DB,plx \
+; RUN:  -r=%t.o,CB,plx \
+; RUN:  -r=%t.o,A,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --implicit-check-not="created clone" \
+; RUN:  --implicit-check-not="Number of cold static allocations" \
+; RUN:  --implicit-check-not="Number of function clones" \
+; RUN:  --implicit-check-not="Number of important context ids" \
+; RUN:  --implicit-check-not="Number of fixup"
+
+;; Specify detection of the largest cold context, but disable fixup.
+;; We should find 1 important context, but still not get cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=1 \
+; RUN:  -memprof-fixup-important=false \
+; RUN:  -r=%t.o,E,plx \
+; RUN:  -r=%t.o,DB,plx \
+; RUN:  -r=%t.o,CB,plx \
+; RUN:  -r=%t.o,A,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:  --implicit-check-not="Number of cold static allocations" \
+; RUN:  --implicit-check-not="Number of function clones" \
+; RUN:  --implicit-check-not="Number of fixup"
+
+; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids
+
+;; Specify detection of the largest cold context; fixup is enabled by default.
+;; This case should get fixup and cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=1 \
+; RUN:  -r=%t.o,E,plx \
+; RUN:  -r=%t.o,DB,plx \
+; RUN:  -r=%t.o,CB,plx \
+; RUN:  -r=%t.o,A,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1
+
+; TOPN1: created clone E.memprof.1
+; TOPN1: call in clone E marked with memprof allocation attribute notcold
+; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold
+; TOPN1: created clone DB.memprof.1
+; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1
+; TOPN1: created clone CB.memprof.1
+; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1
+; TOPN1: created clone A.memprof.1
+; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1
+; TOPN1: call in clone main assigned to call function clone A.memprof.1
+
+; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges
+; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added
+; TOPN1: 1 memprof-context-disambiguation - Number of important context ids
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @E() {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14
+  ret void
+}
+
+define void @DB() {
+entry:
+  tail call void @E(), !callsite !17
+  ret void
+}
+
+define void @CB() {
+entry:
+  tail call void @DB(), !callsite !22
+  ret void
+}
+
+define void @A() {
+entry:
+  tail call void @CB(), !callsite !20
+  ret void
+}
+
+define i32 @main() {
+entry:
+  tail call void @A(), !callsite !25
+  tail call void @A(), !callsite !27
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!7 = !{!8, !10}
+!8 = !{!9, !"cold", !2}
+!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678}
+!2 = !{i64 12345, i64 200}
+!10 = !{!11, !"notcold", !3}
+!3 = !{i64 23456, i64 200}
+!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789}
+!14 = !{i64 123}
+!17 = !{i64 234, i64 345}
+!22 = !{i64 234, i64 456}
+!20 = !{i64 234, i64 567}
+!25 = !{i64 678}
+!27 = !{i64 789}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
new file mode 100644
index 0000000000000..8a5d0368f16db
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
@@ -0,0 +1,106 @@
+;; Test fixup of the largest cold contexts.
+
+;; This case has multiple recursive cycles in the cold context, which can be
+;; made non-recursive with the inlining in the code.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+;; First try the default, which does not enable any detection of the largest
+;; cold contexts. We will not get any cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --implicit-check-not="created clone" \
+; RUN:  --implicit-check-not="Number of cold static allocations" \
+; RUN:  --implicit-check-not="Number of function clones" \
+; RUN:  --implicit-check-not="Number of important context ids" \
+; RUN:  --implicit-check-not="Number of fixup"
+
+;; Specify detection of the largest cold context, but disable fixup.
+;; We should find 1 important context, but still not get cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=1 \
+; RUN:  -memprof-fixup-important=false \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:  --implicit-check-not="Number of cold static allocations" \
+; RUN:  --implicit-check-not="Number of function clones" \
+; RUN:  --implicit-check-not="Number of fixup"
+
+; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids
+
+;; Specify detection of the largest cold context; fixup is enabled by default.
+;; This case should get fixup and cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=1 \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1
+
+; TOPN1: created clone E.memprof.1
+; TOPN1: created clone DB.memprof.1
+; TOPN1: created clone CB.memprof.1
+; TOPN1: created clone A.memprof.1
+; TOPN1: call in clone main assigned to call function clone A.memprof.1
+; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1
+; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1
+; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1
+; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold
+; TOPN1: call in clone E marked with memprof allocation attribute notcold
+
+; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges
+; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added
+; TOPN1: 1 memprof-context-disambiguation - Number of important context ids
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @E() {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14
+  ret void
+}
+
+define void @DB() {
+entry:
+  tail call void @E(), !callsite !17
+  ret void
+}
+
+define void @CB() {
+entry:
+  tail call void @DB(), !callsite !22
+  ret void
+}
+
+define void @A() {
+entry:
+  tail call void @CB(), !callsite !20
+  ret void
+}
+
+define i32 @main() {
+entry:
+  tail call void @A(), !callsite !25
+  tail call void @A(), !callsite !27
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!7 = !{!8, !10}
+!8 = !{!9, !"cold", !2}
+!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678}
+!2 = !{i64 12345, i64 200}
+!10 = !{!11, !"notcold", !3}
+!3 = !{i64 23456, i64 200}
+!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789}
+!14 = !{i64 123}
+!17 = !{i64 234, i64 345}
+!22 = !{i64 234, i64 456}
+!20 = !{i64 234, i64 567}
+!25 = !{i64 678}
+!27 = !{i64 789}

From e956921fed64b0fd4bf7a101bd559975c7d2b514 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Mon, 17 Nov 2025 14:25:44 -0800
Subject: [PATCH 2/2] Address comments and change option default

---
 .../IPO/MemProfContextDisambiguation.cpp      | 30 ++++++++----------
 llvm/test/ThinLTO/X86/memprof-fixup.ll        | 11 ++++---
 .../MemProfContextDisambiguation/fixup.ll     | 11 ++++---
 3 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 8c693801be2b6..0f4bc649df720 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -228,7 +228,7 @@ extern cl::opt<bool> MemProfReportHintedSizes;
 extern cl::opt<unsigned> MinClonedColdBytePercent;

 cl::opt<unsigned> MemProfTopNImportant(
-    "memprof-top-n-important", cl::init(0), cl::Hidden,
+    "memprof-top-n-important", cl::init(10), cl::Hidden,
     cl::desc("Number of largest cold contexts to consider important"));

 cl::opt<bool> MemProfFixupImportant(
@@ -885,7 +885,7 @@ class CallsiteContextGraph {
   /// Saves information for the contexts identified as important (the largest
   /// cold contexts up to MemProfTopNImportant).
   struct ImportantContextInfo {
-    // The original list of stack ids corresponding to this context.
+    // The original list of leaf-first stack ids corresponding to this context.
     std::vector<uint64_t> StackIds;
     // Max length of stack ids corresponding to a single stack ContextNode for
@@ -1436,9 +1436,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
     // contexts, see if this is a candidate.
     if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
       uint64_t TotalCold = 0;
-      for (auto &CSI : ContextSizeInfo) {
+      for (auto &CSI : ContextSizeInfo)
         TotalCold += CSI.TotalSize;
-      }
       // Record this context if either we haven't found the first top-n largest
       // yet, or if it is larger than the smallest already recorded.
       if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
@@ -1937,7 +1936,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::fixupImportantContexts()
       auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
       if (CurEdge) {
         // We already have an edge. Make sure it contains this context id.
-        if (!CurEdge->getContextIds().contains(CurId)) {
-          CurEdge->getContextIds().insert(CurId);
+        if (CurEdge->getContextIds().insert(CurContextId).second) {
           NumFixupEdgeIdsInserted++;
           Changed = true;
         }
       } else {
         // No edge exists - add one.
         NumFixupEdgesAdded++;
-        DenseSet<uint32_t> ContextIds({CurId});
+        DenseSet<uint32_t> ContextIds({CurContextId});
+        auto AllocType = computeAllocType(ContextIds);
         auto NewEdge = std::make_shared<ContextEdge>(
-            PrevNode, CurNode, computeAllocType(ContextIds), ContextIds);
+            PrevNode, CurNode, AllocType, std::move(ContextIds));
         PrevNode->CallerEdges.push_back(NewEdge);
         CurNode->CalleeEdges.push_back(NewEdge);
         // Save the new edge for the below handling.
@@ -2004,13 +2003,10 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::fixupImportantContexts()
       // Now remove this context id from any other caller edges calling
       // PrevNode.
       for (auto &Edge : PrevNode->CallerEdges) {
-        // Skip the edge updated/created above, and edges we have already
-        // visited (due to recursion), or edges that we don't need to update
-        // because they don't contain the context id we're working on.
-        if (Edge.get() == CurEdge || VisitedEdges.contains(Edge.get()) ||
-            !Edge->getContextIds().contains(CurId))
-          continue;
-        Edge->getContextIds().erase(CurId);
+        // Skip the edge updated/created above and edges we have already
+        // visited (due to recursion).
+        if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
+          Edge->getContextIds().erase(CurContextId);
       }
     }
     if (Changed)
diff --git a/llvm/test/ThinLTO/X86/memprof-fixup.ll b/llvm/test/ThinLTO/X86/memprof-fixup.ll
index e22cc52a8d784..afed80fc562c1 100644
--- a/llvm/test/ThinLTO/X86/memprof-fixup.ll
+++ b/llvm/test/ThinLTO/X86/memprof-fixup.ll
@@ -9,10 +9,11 @@
 ;; Need context sizes in the summary, so enable reporting.
 ; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o

-;; First try the default, which does not enable any detection of the largest
-;; cold contexts. We will not get any cloning.
+;; First try disabling detection of the largest cold contexts.
+;; We will not get any cloning.
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=0 \
 ; RUN:  -r=%t.o,E,plx \
 ; RUN:  -r=%t.o,DB,plx \
 ; RUN:  -r=%t.o,CB,plx \
@@ -27,11 +28,10 @@
 ; RUN:  --implicit-check-not="Number of important context ids" \
 ; RUN:  --implicit-check-not="Number of fixup"

-;; Specify detection of the largest cold context, but disable fixup.
+;; Allow default detection of the largest cold contexts, but disable fixup.
 ;; We should find 1 important context, but still not get cloning.
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
-; RUN:  -memprof-top-n-important=1 \
 ; RUN:  -memprof-fixup-important=false \
 ; RUN:  -r=%t.o,E,plx \
 ; RUN:  -r=%t.o,DB,plx \
@@ -49,11 +49,10 @@
 ; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids

-;; Specify detection of the largest cold context; fixup is enabled by default.
+;; Allow default detection of the largest cold contexts; fixup is enabled by default.
 ;; This case should get fixup and cloning.
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
-; RUN:  -memprof-top-n-important=1 \
 ; RUN:  -r=%t.o,E,plx \
 ; RUN:  -r=%t.o,DB,plx \
 ; RUN:  -r=%t.o,CB,plx \
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
index 8a5d0368f16db..a08f89b5bbe97 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
@@ -6,9 +6,10 @@
 ;; -stats requires asserts
 ; REQUIRES: asserts

-;; First try the default, which does not enable any detection of the largest
-;; cold contexts. We will not get any cloning.
+;; First try disabling detection of the largest cold contexts.
+;; We will not get any cloning.
 ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-top-n-important=0 \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
 ; RUN:  %s -S 2>&1 | FileCheck %s --implicit-check-not="created clone" \
@@ -17,10 +18,9 @@
 ; RUN:  --implicit-check-not="Number of important context ids" \
 ; RUN:  --implicit-check-not="Number of fixup"

-;; Specify detection of the largest cold context, but disable fixup.
+;; Allow default detection of the largest cold contexts, but disable fixup.
 ;; We should find 1 important context, but still not get cloning.
 ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
-; RUN:  -memprof-top-n-important=1 \
 ; RUN:  -memprof-fixup-important=false \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
 ; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \
@@ -32,10 +32,9 @@
 ; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids

-;; Specify detection of the largest cold context; fixup is enabled by default.
+;; Allow default detection of the largest cold contexts; fixup is enabled by default.
 ;; This case should get fixup and cloning.
 ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
-; RUN:  -memprof-top-n-important=1 \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
 ; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1
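
---

Note for readers (not part of the patches): below is a minimal standalone
sketch of the greedy longest-match walk that fixupImportantContexts() performs
over an important context's saved stack ids. The Context values, the recorded
slices, and main() are hypothetical stand-ins for the patch's ContextNode
bookkeeping; only the matching loop mirrors the patch's logic.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      using StackIds = std::vector<uint64_t>;
      // Stack ids A=1 B=2 C=3 D=4 E=5; the context {A B A C A D E} from the
      // patch's worked example, where A was inlined into B, C, and D.
      StackIds Context = {1, 2, 1, 3, 1, 4, 5};
      // Slices recorded while matching nodes onto the IR/summary, giving
      // MaxLength = 2 as in the patch's comment.
      std::map<StackIds, std::string> StackIdsToNode = {
          {{1, 2}, "{AB}"}, {{1, 3}, "{AC}"}, {{1, 4}, "{AD}"}, {{5}, "E"}};
      unsigned MaxLength = 2;

      // Walk leaf-first, preferring the longest recorded slice at each
      // position; consecutive matches are the node pairs whose edges the
      // patch fixes up ({AB} <- {AC} <- {AD} <- E).
      for (unsigned I = 0; I < Context.size(); I++) {
        unsigned Max = std::min<size_t>(MaxLength, Context.size() - I);
        for (unsigned Len = Max; Len > 0; Len--) {
          StackIds Slice(Context.begin() + I, Context.begin() + I + Len);
          auto It = StackIdsToNode.find(Slice);
          if (It == StackIdsToNode.end())
            continue;
          std::cout << It->second << "\n"; // prints {AB} {AC} {AD} E
          I += Len - 1; // skip past the ids we just matched
          break;
        }
      }
    }

Run as-is, this prints the matched nodes leaf-first, which is the order the
pass needs them in so that cloning can proceed from the allocation upwards.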