From f09807ca9dda2f588298d8733e89a81105c88120 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 4 May 2023 09:37:25 -0700 Subject: [PATCH] Revert "Restore "[MemProf] Context disambiguation cloning pass [patch 3/4]"" This reverts commit bfe7205975a63a605ff3faacd97fe4c1bf4c19b3, and follow on fix e3e6bc699574550f2ed1de07f4e5bcdddaa65557, due to some remaining instability exposed by the bot enabling expensive checks: https://lab.llvm.org/buildbot/#/builders/42/builds/9842 --- .../IPO/MemProfContextDisambiguation.h | 5 +- .../IPO/MemProfContextDisambiguation.cpp | 701 +----------------- llvm/test/ThinLTO/X86/memprof-basic.ll | 35 +- .../X86/memprof-duplicate-context-ids.ll | 47 +- .../ThinLTO/X86/memprof-funcassigncloning.ll | 235 ------ llvm/test/ThinLTO/X86/memprof-indirectcall.ll | 31 +- llvm/test/ThinLTO/X86/memprof-inlined.ll | 28 +- .../MemProfContextDisambiguation/basic.ll | 48 +- .../duplicate-context-ids.ll | 40 +- .../funcassigncloning.ll | 247 ------ .../indirectcall.ll | 41 +- .../MemProfContextDisambiguation/inlined.ll | 41 +- 12 files changed, 31 insertions(+), 1468 deletions(-) delete mode 100644 llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll delete mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h index 13f3a7eb7ce3f..475ea48cca932 100644 --- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h +++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h @@ -25,14 +25,11 @@ namespace llvm { class GlobalValueSummary; class Module; class ModuleSummaryIndex; -class OptimizationRemarkEmitter; class MemProfContextDisambiguation : public PassInfoMixin { /// Run the context disambiguator on \p M, returns true if any changes made. - bool processModule( - Module &M, - function_ref OREGetter); + bool processModule(Module &M); public: MemProfContextDisambiguation() {} diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 3fff7e55cfe3f..5c8aaddfe3bb3 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -27,10 +27,8 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -41,7 +39,6 @@ #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Utils/Cloning.h" #include #include using namespace llvm; @@ -49,13 +46,6 @@ using namespace llvm::memprof; #define DEBUG_TYPE "memprof-context-disambiguation" -STATISTIC(FunctionClonesAnalysis, - "Number of function clones created during whole program analysis"); -STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly " - "cloned) during whole program analysis"); -STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) " - "during whole program analysis"); - static cl::opt DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), @@ -105,13 +95,6 @@ class CallsiteContextGraph { /// behavior of an allocation based on its context. void identifyClones(); - /// Assign callsite clones to functions, cloning functions as needed to - /// accommodate the combinations of their callsite clones reached by callers. - /// For regular LTO this clones functions and callsites in the IR, but for - /// ThinLTO the cloning decisions are noted in the summaries and applied - /// later. - bool assignFunctions(); - void dump() const; void print(raw_ostream &OS) const; @@ -392,28 +375,6 @@ class CallsiteContextGraph { return static_cast(this)->getLastStackId(Call); } - /// Update the allocation call to record type of allocated memory. - void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { - AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; - static_cast(this)->updateAllocationCall(Call, AllocType); - } - - /// Update non-allocation call to invoke (possibly cloned) function - /// CalleeFunc. - void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { - static_cast(this)->updateCall(CallerCall, CalleeFunc); - } - - /// Clone the given function for the given callsite, recording mapping of all - /// of the functions tracked calls to their new versions in the CallMap. - /// Assigns new clones to clone number CloneNo. - FuncInfo cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map &CallMap, - std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { - return static_cast(this)->cloneFunctionForCallsite( - Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); - } - /// Gets a label to use in the dot graph for the given call clone in the given /// function. std::string getLabel(const FuncTy *Func, const CallTy Call, @@ -508,9 +469,7 @@ class ModuleCallsiteContextGraph : public CallsiteContextGraph { public: - ModuleCallsiteContextGraph( - Module &M, - function_ref OREGetter); + ModuleCallsiteContextGraph(Module &M); private: friend CallsiteContextGraph getStackIdsWithContextNodesForCall(Instruction *Call); - void updateAllocationCall(CallInfo &Call, AllocationType AllocType); - void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); - CallsiteContextGraph::FuncInfo - cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, - std::map &CallMap, - std::vector &CallsWithMetadataInFunc, - unsigned CloneNo); std::string getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const; const Module &Mod; - function_ref OREGetter; }; /// Represents a call in the summary index graph, which can either be an @@ -577,14 +527,6 @@ class IndexCallsiteContextGraph bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); uint64_t getLastStackId(IndexCall &Call); std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); - void updateAllocationCall(CallInfo &Call, AllocationType AllocType); - void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); - CallsiteContextGraph::FuncInfo - cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, - std::map &CallMap, - std::vector &CallsWithMetadataInFunc, - unsigned CloneNo); std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, unsigned CloneNo) const; @@ -1340,14 +1282,10 @@ uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { return Index.getStackIdAtIndex(CallsiteContext.back()); } -static const std::string MemProfCloneSuffix = ".memprof."; - static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { - // We use CloneNo == 0 to refer to the original version, which doesn't get - // renamed with a suffix. if (!CloneNo) return Base.str(); - return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); + return (Base + ".memprof." + Twine(CloneNo)).str(); } std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, @@ -1409,9 +1347,7 @@ CallsiteContextGraph::getStackIdsWithContextNodes( return StackIds; } -ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( - Module &M, function_ref OREGetter) - : Mod(M), OREGetter(OREGetter) { +ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) { for (auto &F : M) { std::vector CallsWithMetadata; for (auto &BB : F) { @@ -1725,7 +1661,7 @@ static void checkEdge( template static void checkNode(const ContextNode *Node, - bool CheckEdges = true) { + bool CheckEdges = false) { if (Node->isRemoved()) return; // Node's context ids should be the union of both its callee and caller edge @@ -1765,7 +1701,7 @@ template void CallsiteContextGraph::check() const { using GraphType = const CallsiteContextGraph *; for (const auto Node : nodes(this)) { - checkNode(Node, /*CheckEdges=*/false); + checkNode(Node); for (auto &Edge : Node->CallerEdges) checkEdge(Edge); } @@ -1989,14 +1925,12 @@ void CallsiteContextGraph:: NewEdge->Callee->CallerEdges.push_back(NewEdge); } if (VerifyCCG) { - checkNode(OldCallee, /*CheckEdges=*/false); - checkNode(NewCallee, /*CheckEdges=*/false); + checkNode(OldCallee); + checkNode(NewCallee); for (const auto &OldCalleeEdge : OldCallee->CalleeEdges) - checkNode(OldCalleeEdge->Callee, - /*CheckEdges=*/false); + checkNode(OldCalleeEdge->Callee); for (const auto &NewCalleeEdge : NewCallee->CalleeEdges) - checkNode(NewCalleeEdge->Callee, - /*CheckEdges=*/false); + checkNode(NewCalleeEdge->Callee); } } @@ -2011,7 +1945,7 @@ template void CallsiteContextGraph::identifyClones( ContextNode *Node, DenseSet &Visited) { if (VerifyNodes) - checkNode(Node); + checkNode(Node, /*CheckEdges=*/true); assert(!Node->CloneOf); // If Node as a null call, then either it wasn't found in the module (regular @@ -2165,7 +2099,7 @@ void CallsiteContextGraph::identifyClones( for (auto *Clone : Node->Clones) { removeNoneTypeCalleeEdges(Clone); if (VerifyNodes) - checkNode(Clone); + checkNode(Clone, /*CheckEdges=*/true); } // We should still have some context ids on the original Node. assert(!Node->ContextIds.empty()); @@ -2186,595 +2120,7 @@ void CallsiteContextGraph::identifyClones( })); if (VerifyNodes) - checkNode(Node); -} - -static std::string getAllocTypeAttributeString(AllocationType Type) { - switch (Type) { - case AllocationType::NotCold: - return "notcold"; - break; - case AllocationType::Cold: - return "cold"; - break; - default: - dbgs() << "Unexpected alloc type " << (uint8_t)Type; - assert(false); - } - llvm_unreachable("invalid alloc type"); -} - -void ModuleCallsiteContextGraph::updateAllocationCall( - CallInfo &Call, AllocationType AllocType) { - std::string AllocTypeString = getAllocTypeAttributeString(AllocType); - auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), - "memprof", AllocTypeString); - cast(Call.call())->addFnAttr(A); - OREGetter(Call.call()->getFunction()) - .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) - << ore::NV("AllocationCall", Call.call()) << " in clone " - << ore::NV("Caller", Call.call()->getFunction()) - << " marked with memprof allocation attribute " - << ore::NV("Attribute", AllocTypeString)); -} - -void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, - AllocationType AllocType) { - auto *AI = Call.call().dyn_cast(); - assert(AI); - assert(AI->Versions.size() > Call.cloneNo()); - AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; -} - -void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, - FuncInfo CalleeFunc) { - if (CalleeFunc.cloneNo() > 0) - cast(CallerCall.call())->setCalledFunction(CalleeFunc.func()); - OREGetter(CallerCall.call()->getFunction()) - .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) - << ore::NV("Call", CallerCall.call()) << " in clone " - << ore::NV("Caller", CallerCall.call()->getFunction()) - << " assigned to call function clone " - << ore::NV("Callee", CalleeFunc.func())); -} - -void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, - FuncInfo CalleeFunc) { - auto *CI = CallerCall.call().dyn_cast(); - assert(CI && - "Caller cannot be an allocation which should not have profiled calls"); - assert(CI->Clones.size() > CallerCall.cloneNo()); - CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); -} - -CallsiteContextGraph::FuncInfo -ModuleCallsiteContextGraph::cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map &CallMap, - std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { - // Use existing LLVM facilities for cloning and obtaining Call in clone - ValueToValueMapTy VMap; - auto *NewFunc = CloneFunction(Func.func(), VMap); - std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); - assert(!Func.func()->getParent()->getFunction(Name)); - NewFunc->setName(Name); - for (auto &Inst : CallsWithMetadataInFunc) { - // This map always has the initial version in it. - assert(Inst.cloneNo() == 0); - CallMap[Inst] = {cast(VMap[Inst.call()]), CloneNo}; - } - OREGetter(Func.func()) - .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) - << "created clone " << ore::NV("NewFunction", NewFunc)); - return {NewFunc, CloneNo}; -} - -CallsiteContextGraph::FuncInfo -IndexCallsiteContextGraph::cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map &CallMap, - std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { - // Check how many clones we have of Call (and therefore function). - // The next clone number is the current size of versions array. - // Confirm this matches the CloneNo provided by the caller, which is based on - // the number of function clones we have. - assert(CloneNo == - (Call.call().is() - ? Call.call().dyn_cast()->Versions.size() - : Call.call().dyn_cast()->Clones.size())); - // Walk all the instructions in this function. Create a new version for - // each (by adding an entry to the Versions/Clones summary array), and copy - // over the version being called for the function clone being cloned here. - // Additionally, add an entry to the CallMap for the new function clone, - // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) - // to the new call clone. - for (auto &Inst : CallsWithMetadataInFunc) { - // This map always has the initial version in it. - assert(Inst.cloneNo() == 0); - if (auto *AI = Inst.call().dyn_cast()) { - assert(AI->Versions.size() == CloneNo); - // We assign the allocation type later (in updateAllocationCall), just add - // an entry for it here. - AI->Versions.push_back(0); - } else { - auto *CI = Inst.call().dyn_cast(); - assert(CI && CI->Clones.size() == CloneNo); - // We assign the clone number later (in updateCall), just add an entry for - // it here. - CI->Clones.push_back(0); - } - CallMap[Inst] = {Inst.call(), CloneNo}; - } - return {Func.func(), CloneNo}; -} - -// This method assigns cloned callsites to functions, cloning the functions as -// needed. The assignment is greedy and proceeds roughly as follows: -// -// For each function Func: -// For each call with graph Node having clones: -// Initialize ClonesWorklist to Node and its clones -// Initialize NodeCloneCount to 0 -// While ClonesWorklist is not empty: -// Clone = pop front ClonesWorklist -// NodeCloneCount++ -// If Func has been cloned less than NodeCloneCount times: -// If NodeCloneCount is 1: -// Assign Clone to original Func -// Continue -// Create a new function clone -// If other callers not assigned to call a function clone yet: -// Assign them to call new function clone -// Continue -// Assign any other caller calling the cloned version to new clone -// -// For each caller of Clone: -// If caller is assigned to call a specific function clone: -// If we cannot assign Clone to that function clone: -// Create new callsite Clone NewClone -// Add NewClone to ClonesWorklist -// Continue -// Assign Clone to existing caller's called function clone -// Else: -// If Clone not already assigned to a function clone: -// Assign to first function clone without assignment -// Assign caller to selected function clone -template -bool CallsiteContextGraph::assignFunctions() { - bool Changed = false; - - // Keep track of the assignment of nodes (callsites) to function clones they - // call. - DenseMap CallsiteToCalleeFuncCloneMap; - - // Update caller node to call function version CalleeFunc, by recording the - // assignment in CallsiteToCalleeFuncCloneMap. - auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, - const FuncInfo &CalleeFunc) { - assert(Caller->hasCall()); - CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; - }; - - // Walk all functions for which we saw calls with memprof metadata, and handle - // cloning for each of its calls. - for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { - FuncInfo OrigFunc(Func); - // Map from each clone of OrigFunc to a map of remappings of each call of - // interest (from original uncloned call to the corresponding cloned call in - // that function clone). - std::map> FuncClonesToCallMap; - for (auto &Call : CallsWithMetadata) { - ContextNode *Node = getNodeForInst(Call); - // Skip call if we do not have a node for it (all uses of its stack ids - // were either on inlined chains or pruned from the MIBs), or if we did - // not create any clones for it. - if (!Node || Node->Clones.empty()) - continue; - assert(Node->hasCall() && - "Not having a call should have prevented cloning"); - - // Track the assignment of function clones to clones of the current - // callsite Node being handled. - std::map FuncCloneToCurNodeCloneMap; - - // Assign callsite version CallsiteClone to function version FuncClone, - // and also assign (possibly cloned) Call to CallsiteClone. - auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, - CallInfo &Call, - ContextNode *CallsiteClone, - bool IsAlloc) { - // Record the clone of callsite node assigned to this function clone. - FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; - - assert(FuncClonesToCallMap.count(FuncClone)); - std::map &CallMap = FuncClonesToCallMap[FuncClone]; - CallInfo CallClone(Call); - if (CallMap.count(Call)) - CallClone = CallMap[Call]; - CallsiteClone->setCall(CallClone); - }; - - // Keep track of the clones of callsite Node that need to be assigned to - // function clones. This list may be expanded in the loop body below if we - // find additional cloning is required. - std::deque ClonesWorklist; - // Ignore original Node if we moved all of its contexts to clones. - if (!Node->ContextIds.empty()) - ClonesWorklist.push_back(Node); - ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), - Node->Clones.end()); - - // Now walk through all of the clones of this callsite Node that we need, - // and determine the assignment to a corresponding clone of the current - // function (creating new function clones as needed). - unsigned NodeCloneCount = 0; - while (!ClonesWorklist.empty()) { - ContextNode *Clone = ClonesWorklist.front(); - ClonesWorklist.pop_front(); - NodeCloneCount++; - if (VerifyNodes) - checkNode(Clone); - - // Need to create a new function clone if we have more callsite clones - // than existing function clones, which would have been assigned to an - // earlier clone in the list (we assign callsite clones to function - // clones greedily). - if (FuncClonesToCallMap.size() < NodeCloneCount) { - // If this is the first callsite copy, assign to original function. - if (NodeCloneCount == 1) { - // Since FuncClonesToCallMap is empty in this case, no clones have - // been created for this function yet, and no callers should have - // been assigned a function clone for this callee node yet. - assert(llvm::none_of( - Clone->CallerEdges, [&](const std::shared_ptr &E) { - return CallsiteToCalleeFuncCloneMap.count(E->Caller); - })); - // Initialize with empty call map, assign Clone to original function - // and its callers, and skip to the next clone. - FuncClonesToCallMap[OrigFunc] = {}; - AssignCallsiteCloneToFuncClone( - OrigFunc, Call, Clone, - AllocationCallToContextNodeMap.count(Call)); - for (auto &CE : Clone->CallerEdges) { - // Ignore any caller that does not have a recorded callsite Call. - if (!CE->Caller->hasCall()) - continue; - RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); - } - continue; - } - - // First locate which copy of OrigFunc to clone again. If a caller - // of this callsite clone was already assigned to call a particular - // function clone, we need to redirect all of those callers to the - // new function clone, and update their other callees within this - // function. - FuncInfo PreviousAssignedFuncClone; - auto EI = llvm::find_if( - Clone->CallerEdges, [&](const std::shared_ptr &E) { - return CallsiteToCalleeFuncCloneMap.count(E->Caller); - }); - bool CallerAssignedToCloneOfFunc = false; - if (EI != Clone->CallerEdges.end()) { - const std::shared_ptr &Edge = *EI; - PreviousAssignedFuncClone = - CallsiteToCalleeFuncCloneMap[Edge->Caller]; - CallerAssignedToCloneOfFunc = true; - } - - // Clone function and save it along with the CallInfo map created - // during cloning in the FuncClonesToCallMap. - std::map NewCallMap; - unsigned CloneNo = FuncClonesToCallMap.size(); - assert(CloneNo > 0 && "Clone 0 is the original function, which " - "should already exist in the map"); - FuncInfo NewFuncClone = cloneFunctionForCallsite( - OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); - FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); - FunctionClonesAnalysis++; - Changed = true; - - // If no caller callsites were already assigned to a clone of this - // function, we can simply assign this clone to the new func clone - // and update all callers to it, then skip to the next clone. - if (!CallerAssignedToCloneOfFunc) { - AssignCallsiteCloneToFuncClone( - NewFuncClone, Call, Clone, - AllocationCallToContextNodeMap.count(Call)); - for (auto &CE : Clone->CallerEdges) { - // Ignore any caller that does not have a recorded callsite Call. - if (!CE->Caller->hasCall()) - continue; - RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); - } - continue; - } - - // We may need to do additional node cloning in this case. - // Reset the CallsiteToCalleeFuncCloneMap entry for any callers - // that were previously assigned to call PreviousAssignedFuncClone, - // to record that they now call NewFuncClone. - for (auto CE : Clone->CallerEdges) { - // Ignore any caller that does not have a recorded callsite Call. - if (!CE->Caller->hasCall()) - continue; - - if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || - // We subsequently fall through to later handling that - // will perform any additional cloning required for - // callers that were calling other function clones. - CallsiteToCalleeFuncCloneMap[CE->Caller] != - PreviousAssignedFuncClone) - continue; - - RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); - - // If we are cloning a function that was already assigned to some - // callers, then essentially we are creating new callsite clones - // of the other callsites in that function that are reached by those - // callers. Clone the other callees of the current callsite's caller - // that were already assigned to PreviousAssignedFuncClone - // accordingly. This is important since we subsequently update the - // calls from the nodes in the graph and their assignments to callee - // functions recorded in CallsiteToCalleeFuncCloneMap. - for (auto CalleeEdge : CE->Caller->CalleeEdges) { - // Skip any that have been removed on an earlier iteration when - // cleaning up newly None type callee edges. - if (!CalleeEdge) - continue; - ContextNode *Callee = CalleeEdge->Callee; - // Skip the current callsite, we are looking for other - // callsites Caller calls, as well as any that does not have a - // recorded callsite Call. - if (Callee == Clone || !Callee->hasCall()) - continue; - ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); - removeNoneTypeCalleeEdges(NewClone); - // Moving the edge may have resulted in some none type - // callee edges on the original Callee. - removeNoneTypeCalleeEdges(Callee); - assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); - // If the Callee node was already assigned to call a specific - // function version, make sure its new clone is assigned to call - // that same function clone. - if (CallsiteToCalleeFuncCloneMap.count(Callee)) - RecordCalleeFuncOfCallsite( - NewClone, CallsiteToCalleeFuncCloneMap[Callee]); - // Update NewClone with the new Call clone of this callsite's Call - // created for the new function clone created earlier. - // Recall that we have already ensured when building the graph - // that each caller can only call callsites within the same - // function, so we are guaranteed that Callee Call is in the - // current OrigFunc. - // CallMap is set up as indexed by original Call at clone 0. - CallInfo OrigCall(Callee->getOrigNode()->Call); - OrigCall.setCloneNo(0); - std::map &CallMap = - FuncClonesToCallMap[NewFuncClone]; - assert(CallMap.count(OrigCall)); - CallInfo NewCall(CallMap[OrigCall]); - assert(NewCall); - NewClone->setCall(NewCall); - } - } - // Fall through to handling below to perform the recording of the - // function for this callsite clone. This enables handling of cases - // where the callers were assigned to different clones of a function. - } - - // See if we can use existing function clone. Walk through - // all caller edges to see if any have already been assigned to - // a clone of this callsite's function. If we can use it, do so. If not, - // because that function clone is already assigned to a different clone - // of this callsite, then we need to clone again. - // Basically, this checking is needed to handle the case where different - // caller functions/callsites may need versions of this function - // containing different mixes of callsite clones across the different - // callsites within the function. If that happens, we need to create - // additional function clones to handle the various combinations. - // - // Keep track of any new clones of this callsite created by the - // following loop, as well as any existing clone that we decided to - // assign this clone to. - std::map FuncCloneToNewCallsiteCloneMap; - FuncInfo FuncCloneAssignedToCurCallsiteClone; - // We need to be able to remove Edge from CallerEdges, so need to adjust - // iterator in the loop. - for (auto EI = Clone->CallerEdges.begin(); - EI != Clone->CallerEdges.end();) { - auto Edge = *EI; - // Ignore any caller that does not have a recorded callsite Call. - if (!Edge->Caller->hasCall()) { - EI++; - continue; - } - // If this caller already assigned to call a version of OrigFunc, need - // to ensure we can assign this callsite clone to that function clone. - if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { - FuncInfo FuncCloneCalledByCaller = - CallsiteToCalleeFuncCloneMap[Edge->Caller]; - // First we need to confirm that this function clone is available - // for use by this callsite node clone. - // - // While FuncCloneToCurNodeCloneMap is built only for this Node and - // its callsite clones, one of those callsite clones X could have - // been assigned to the same function clone called by Edge's caller - // - if Edge's caller calls another callsite within Node's original - // function, and that callsite has another caller reaching clone X. - // We need to clone Node again in this case. - if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && - FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != - Clone) || - // Detect when we have multiple callers of this callsite that - // have already been assigned to specific, and different, clones - // of OrigFunc (due to other unrelated callsites in Func they - // reach via call contexts). Is this Clone of callsite Node - // assigned to a different clone of OrigFunc? If so, clone Node - // again. - (FuncCloneAssignedToCurCallsiteClone && - FuncCloneAssignedToCurCallsiteClone != - FuncCloneCalledByCaller)) { - // We need to use a different newly created callsite clone, in - // order to assign it to another new function clone on a - // subsequent iteration over the Clones array (adjusted below). - // Note we specifically do not reset the - // CallsiteToCalleeFuncCloneMap entry for this caller, so that - // when this new clone is processed later we know which version of - // the function to copy (so that other callsite clones we have - // assigned to that function clone are properly cloned over). See - // comments in the function cloning handling earlier. - - // Check if we already have cloned this callsite again while - // walking through caller edges, for a caller calling the same - // function clone. If so, we can move this edge to that new clone - // rather than creating yet another new clone. - if (FuncCloneToNewCallsiteCloneMap.count( - FuncCloneCalledByCaller)) { - ContextNode *NewClone = - FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; - moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); - // Cleanup any none type edges cloned over. - removeNoneTypeCalleeEdges(NewClone); - } else { - // Create a new callsite clone. - ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); - removeNoneTypeCalleeEdges(NewClone); - FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = - NewClone; - // Add to list of clones and process later. - ClonesWorklist.push_back(NewClone); - assert(EI == Clone->CallerEdges.end() || - Clone->AllocTypes != (uint8_t)AllocationType::None); - assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); - } - // Moving the caller edge may have resulted in some none type - // callee edges. - removeNoneTypeCalleeEdges(Clone); - // We will handle the newly created callsite clone in a subsequent - // iteration over this Node's Clones. Continue here since we - // already adjusted iterator EI while moving the edge. - continue; - } - - // Otherwise, we can use the function clone already assigned to this - // caller. - if (!FuncCloneAssignedToCurCallsiteClone) { - FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; - // Assign Clone to FuncCloneCalledByCaller - AssignCallsiteCloneToFuncClone( - FuncCloneCalledByCaller, Call, Clone, - AllocationCallToContextNodeMap.count(Call)); - } else - // Don't need to do anything - callsite is already calling this - // function clone. - assert(FuncCloneAssignedToCurCallsiteClone == - FuncCloneCalledByCaller); - - } else { - // We have not already assigned this caller to a version of - // OrigFunc. Do the assignment now. - - // First check if we have already assigned this callsite clone to a - // clone of OrigFunc for another caller during this iteration over - // its caller edges. - if (!FuncCloneAssignedToCurCallsiteClone) { - // Find first function in FuncClonesToCallMap without an assigned - // clone of this callsite Node. We should always have one - // available at this point due to the earlier cloning when the - // FuncClonesToCallMap size was smaller than the clone number. - for (auto &CF : FuncClonesToCallMap) { - if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { - FuncCloneAssignedToCurCallsiteClone = CF.first; - break; - } - } - assert(FuncCloneAssignedToCurCallsiteClone); - // Assign Clone to FuncCloneAssignedToCurCallsiteClone - AssignCallsiteCloneToFuncClone( - FuncCloneAssignedToCurCallsiteClone, Call, Clone, - AllocationCallToContextNodeMap.count(Call)); - } else - assert(FuncCloneToCurNodeCloneMap - [FuncCloneAssignedToCurCallsiteClone] == Clone); - // Update callers to record function version called. - RecordCalleeFuncOfCallsite(Edge->Caller, - FuncCloneAssignedToCurCallsiteClone); - } - - EI++; - } - } - if (VerifyCCG) { - checkNode(Node); - for (const auto &PE : Node->CalleeEdges) - checkNode(PE->Callee); - for (const auto &CE : Node->CallerEdges) - checkNode(CE->Caller); - for (auto *Clone : Node->Clones) { - checkNode(Clone); - for (const auto &PE : Clone->CalleeEdges) - checkNode(PE->Callee); - for (const auto &CE : Clone->CallerEdges) - checkNode(CE->Caller); - } - } - } - } - - auto UpdateCalls = [&](ContextNode *Node, - DenseSet &Visited, - auto &&UpdateCalls) { - auto Inserted = Visited.insert(Node); - if (!Inserted.second) - return; - - for (auto *Clone : Node->Clones) - UpdateCalls(Clone, Visited, UpdateCalls); - - for (auto &Edge : Node->CallerEdges) - UpdateCalls(Edge->Caller, Visited, UpdateCalls); - - // Skip if either no call to update, or if we ended up with no context ids - // (we moved all edges onto other clones). - if (!Node->hasCall() || Node->ContextIds.empty()) - return; - - if (Node->IsAllocation) { - updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); - return; - } - - if (!CallsiteToCalleeFuncCloneMap.count(Node)) - return; - - auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; - updateCall(Node->Call, CalleeFunc); - }; - - // Sort the allocation nodes based on the OrigStackOrAllocId, which increase - // in insertion order, so that the following loop is deterministic (since the - // AllocationCallToContextNodeMap is keyed by a pointer). Specifically this - // can affect the order of the remarks emitted for regular LTO IR updates - // during the call updating. - std::vector AllocationNodes; - AllocationNodes.reserve(AllocationCallToContextNodeMap.size()); - for (auto &Entry : AllocationCallToContextNodeMap) - AllocationNodes.push_back(Entry.second); - std::sort(AllocationNodes.begin(), AllocationNodes.end(), - [](const ContextNode *A, const ContextNode *B) { - return A->OrigStackOrAllocId < B->OrigStackOrAllocId; - }); - - // Performs DFS traversal starting from allocation nodes to update calls to - // reflect cloning decisions recorded earlier. For regular LTO this will - // update the actual calls in the IR to call the appropriate function clone - // (and add attributes to allocation calls), whereas for ThinLTO the decisions - // are recorded in the summary entries. - DenseSet Visited; - for (auto *AllocNode : AllocationNodes) - UpdateCalls(AllocNode, Visited, UpdateCalls); - - return Changed; + checkNode(Node, /*CheckEdges=*/true); } template @@ -2803,24 +2149,13 @@ bool CallsiteContextGraph::process() { if (ExportToDot) exportToDot("cloned"); - bool Changed = assignFunctions(); - - if (DumpCCG) { - dbgs() << "CCG after assigning function clones:\n"; - dbgs() << *this; - } - if (ExportToDot) - exportToDot("clonefuncassign"); - - return Changed; + return false; } -bool MemProfContextDisambiguation::processModule( - Module &M, - function_ref OREGetter) { +bool MemProfContextDisambiguation::processModule(Module &M) { bool Changed = false; - ModuleCallsiteContextGraph CCG(M, OREGetter); + ModuleCallsiteContextGraph CCG(M); Changed = CCG.process(); return Changed; @@ -2828,11 +2163,7 @@ bool MemProfContextDisambiguation::processModule( PreservedAnalyses MemProfContextDisambiguation::run(Module &M, ModuleAnalysisManager &AM) { - auto &FAM = AM.getResult(M).getManager(); - auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { - return FAM.getResult(*F); - }; - if (!processModule(M, OREGetter)) + if (!processModule(M)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll index 11378cf5bef47..4df89cdb12afd 100644 --- a/llvm/test/ThinLTO/X86/memprof-basic.ll +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -42,35 +42,13 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -;; Try again but with distributed ThinLTO -; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ -; RUN: -thinlto-distributed-indexes \ -; RUN: -r=%t.o,main,plx \ -; RUN: -r=%t.o,_ZdaPv, \ -; RUN: -r=%t.o,sleep, \ -; RUN: -r=%t.o,_Znam, \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS - -; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT -;; We should have cloned bar, baz, and foo, for the cold memory allocation. -; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED - -;; Check distributed index -; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB - source_filename = "memprof-basic.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -252,11 +230,6 @@ uselistorder ptr @_Z3foov, { 1, 0 } ; DUMP: Clone of [[BAR]] -; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis - - ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; @@ -288,9 +261,3 @@ uselistorder ptr @_Z3foov, { 1, 0 } ; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; ; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; ; DOTCLONED: } - - -; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 5878270615442837395, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1) -; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6731117468105397038, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1) -; DISTRIB: ^[[BAR]] = gv: (guid: 9832687305761716512, {{.*}} allocs: ((versions: (notcold, cold) -; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1) diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll index 7f7447eaf58e4..12e2fc39b5f5e 100644 --- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll @@ -1,8 +1,7 @@ ;; Test callsite context graph generation for call graph with with MIBs ;; that have pruned contexts that partially match multiple inlined ;; callsite contexts, requiring duplication of context ids and nodes -;; while matching callsite nodes onto the graph. Also tests graph and IR -;; cloning. +;; while matching callsite nodes onto the graph. ;; ;; Original code looks like: ;; @@ -64,9 +63,7 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP ; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST @@ -74,27 +71,6 @@ ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -;; Try again but with distributed ThinLTO -; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ -; RUN: -thinlto-distributed-indexes \ -; RUN: -r=%t.o,main,plx \ -; RUN: -r=%t.o,_ZdaPv, \ -; RUN: -r=%t.o,sleep, \ -; RUN: -r=%t.o,_Znam, \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS - -; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE -; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST -;; We should clone D once for the cold allocations via C. -; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED - -;; Check distributed index -; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB - source_filename = "duplicate-context-ids.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -131,13 +107,7 @@ entry: ret ptr null } -define i32 @main() { -entry: - call ptr @_Z1Bv() - call ptr @_Z1Ev() - call ptr @_Z1Fv() - ret i32 0 -} +declare i32 @main() declare void @_ZdaPv() @@ -301,11 +271,6 @@ declare i32 @sleep() ; DUMP: Clone of [[D]] -; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis - - ; DOTPRE: digraph "prestackupdate" { ; DOTPRE: label="prestackupdate"; ; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"]; @@ -343,9 +308,3 @@ declare i32 @sleep() ; DOTCLONED: Node[[E]] -> Node[[D2]][tooltip="ContextIds: 1",fillcolor="cyan"]; ; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"]; ; DOTCLONED: } - -; DISTRIB: ^[[C:[0-9]+]] = gv: (guid: 1643923691937891493, {{.*}} callsites: ((callee: ^[[D:[0-9]+]], clones: (1) -; DISTRIB: ^[[D]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold) -; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1) -; DISTRIB: ^[[F:[0-9]+]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0) -; DISTRIB: ^[[E:[0-9]+]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1) diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll deleted file mode 100644 index 54aad0dc94ac0..0000000000000 --- a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll +++ /dev/null @@ -1,235 +0,0 @@ -;; Test context disambiguation for a callgraph containing multiple memprof -;; contexts and no inlining, where we need to perform additional cloning -;; during function assignment/cloning to handle the combination of contexts -;; to 2 different allocations. -;; -;; void E(char **buf1, char **buf2) { -;; *buf1 = new char[10]; -;; *buf2 = new char[10]; -;; } -;; -;; void B(char **buf1, char **buf2) { -;; E(buf1, buf2); -;; } -;; -;; void C(char **buf1, char **buf2) { -;; E(buf1, buf2); -;; } -;; -;; void D(char **buf1, char **buf2) { -;; E(buf1, buf2); -;; } -;; int main(int argc, char **argv) { -;; char *cold1, *cold2, *default1, *default2, *default3, *default4; -;; B(&default1, &default2); -;; C(&default3, &cold1); -;; D(&cold2, &default4); -;; memset(cold1, 0, 10); -;; memset(cold2, 0, 10); -;; memset(default1, 0, 10); -;; memset(default2, 0, 10); -;; memset(default3, 0, 10); -;; memset(default4, 0, 10); -;; delete[] default1; -;; delete[] default2; -;; delete[] default3; -;; delete[] default4; -;; sleep(10); -;; delete[] cold1; -;; delete[] cold2; -;; return 0; -;; } -;; -;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the -;; memory freed after sleep(10) results in cold lifetimes. -;; -;; The IR was then reduced using llvm-reduce with the expected FileCheck input. - -;; -stats requires asserts -; REQUIRES: asserts - - -; RUN: opt -thinlto-bc %s >%t.o -; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ -; RUN: -r=%t.o,main,plx \ -; RUN: -r=%t.o,_ZdaPv, \ -; RUN: -r=%t.o,sleep, \ -; RUN: -r=%t.o,_Znam, \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS - - -;; Try again but with distributed ThinLTO -; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ -; RUN: -thinlto-distributed-indexes \ -; RUN: -r=%t.o,main,plx \ -; RUN: -r=%t.o,_ZdaPv, \ -; RUN: -r=%t.o,sleep, \ -; RUN: -r=%t.o,_Znam, \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS - - -source_filename = "funcassigncloning.ll" -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: noinline optnone -define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) { -entry: - %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7 - %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15 - ret void -} - -declare ptr @_Znam(i64) - -define internal void @_Z1BPPcS0_() { -entry: - call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16 - ret void -} - -define internal void @_Z1CPPcS0_() { -entry: - call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17 - ret void -} - -define internal void @_Z1DPPcS0_() { -entry: - call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18 - ret void -} - -; Function Attrs: noinline optnone -define i32 @main() { -entry: - call void @_Z1BPPcS0_() - call void @_Z1CPPcS0_() - call void @_Z1DPPcS0_() - ret i32 0 -} - -declare void @_ZdaPv() - -declare i32 @sleep() - -; uselistorder directives -uselistorder ptr @_Znam, { 1, 0 } - -!0 = !{!1, !3, !5} -!1 = !{!2, !"cold"} -!2 = !{i64 -3461278137325233666, i64 -7799663586031895603} -!3 = !{!4, !"notcold"} -!4 = !{i64 -3461278137325233666, i64 -3483158674395044949} -!5 = !{!6, !"notcold"} -!6 = !{i64 -3461278137325233666, i64 -2441057035866683071} -!7 = !{i64 -3461278137325233666} -!8 = !{!9, !11, !13} -!9 = !{!10, !"notcold"} -!10 = !{i64 -1415475215210681400, i64 -2441057035866683071} -!11 = !{!12, !"cold"} -!12 = !{i64 -1415475215210681400, i64 -3483158674395044949} -!13 = !{!14, !"notcold"} -!14 = !{i64 -1415475215210681400, i64 -7799663586031895603} -!15 = !{i64 -1415475215210681400} -!16 = !{i64 -2441057035866683071} -!17 = !{i64 -3483158674395044949} -!18 = !{i64 -7799663586031895603} - - -;; Originally we create a single clone of each call to new from E, since each -;; allocates cold memory for a single caller. - -; DUMP: CCG after cloning: -; DUMP: Callsite Context Graph: -; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]] -; DUMP: Versions: 1 MIB: -; DUMP: AllocType 2 StackIds: 0 -; DUMP: AllocType 1 StackIds: 1 -; DUMP: AllocType 1 StackIds: 2 -; DUMP: (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 2 3 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 -; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 -; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]] - -; DUMP: Node [[D:0x[a-z0-9]+]] -; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 6 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 -; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 -; DUMP: CallerEdges: - -; DUMP: Node [[C]] -; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 2 5 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2 -; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 -; DUMP: CallerEdges: - -; DUMP: Node [[B]] -; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 3 4 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3 -; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 -; DUMP: CallerEdges: - -; DUMP: Node [[ENEW2ORIG]] -; DUMP: Versions: 1 MIB: -; DUMP: AllocType 1 StackIds: 2 -; DUMP: AllocType 2 StackIds: 1 -; DUMP: AllocType 1 StackIds: 0 -; DUMP: (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 4 6 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 -; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 -; DUMP: Clones: [[ENEW2CLONE]] - -; DUMP: Node [[ENEW1CLONE]] -; DUMP: Versions: 1 MIB: -; DUMP: AllocType 2 StackIds: 0 -; DUMP: AllocType 1 StackIds: 1 -; DUMP: AllocType 1 StackIds: 2 -; DUMP: (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 1 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 -; DUMP: Clone of [[ENEW1ORIG]] - -; DUMP: Node [[ENEW2CLONE]] -; DUMP: Versions: 1 MIB: -; DUMP: AllocType 1 StackIds: 2 -; DUMP: AllocType 2 StackIds: 1 -; DUMP: AllocType 1 StackIds: 0 -; DUMP: (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 5 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 -; DUMP: Clone of [[ENEW2ORIG]] - - -; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll index c311d6243688f..bd9f5e9250592 100644 --- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll +++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll @@ -1,7 +1,7 @@ ;; Tests callsite context graph generation for call graph containing indirect ;; calls. Currently this should result in conservative behavior, such that the ;; indirect call receives a null call in its graph node, to prevent subsequent -;; cloning. Also tests graph and IR cloning. +;; cloning. ;; ;; Original code looks like: ;; @@ -64,9 +64,7 @@ ; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should only create a single clone of foo, for the direct call @@ -74,26 +72,6 @@ ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -;; Try again but with distributed ThinLTO -; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ -; RUN: -thinlto-distributed-indexes \ -; RUN: -r=%t.o,main,plx \ -; RUN: -r=%t.o,_ZdaPv, \ -; RUN: -r=%t.o,sleep, \ -; RUN: -r=%t.o,_Znam, \ -; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \ -; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS - -; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT -;; We should only create a single clone of foo, for the direct call -;; from main allocating cold memory. -; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED - source_filename = "indirectcall.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -384,11 +362,6 @@ uselistorder ptr @_Z3foov, { 3, 2, 1, 0 } ; DUMP: Clone of [[FOO]] -; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis - - ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"]; diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll index 27eab8a5bcd20..e87168b4e3f92 100644 --- a/llvm/test/ThinLTO/X86/memprof-inlined.ll +++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll @@ -1,7 +1,6 @@ ;; Test callsite context graph generation for call graph with two memprof ;; contexts and partial inlining, requiring generation of a new fused node to ;; represent the inlined sequence while matching callsite nodes onto the graph. -;; Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -52,9 +51,7 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should create clones for foo and bar for the call from main to allocate @@ -62,24 +59,6 @@ ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -;; Try again but with distributed ThinLTO -; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ -; RUN: -thinlto-distributed-indexes \ -; RUN: -r=%t.o,main,plx \ -; RUN: -r=%t.o,_ZdaPv, \ -; RUN: -r=%t.o,sleep, \ -; RUN: -r=%t.o,_Znam, \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS - -; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT -;; We should create clones for foo and bar for the call from main to allocate -;; cold memory. -; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED - source_filename = "inlined.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -281,11 +260,6 @@ declare i32 @sleep() ; DUMP: Clone of [[BAR]] -; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis - - ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"]; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll index bd938754ce9d0..99a8d68a5b1d2 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll @@ -1,5 +1,5 @@ ;; Test callsite context graph generation for simple call graph with -;; two memprof contexts and no inlining, as well as graph and IR cloning. +;; two memprof contexts and no inlining. ;; ;; Original code looks like: ;; @@ -37,9 +37,7 @@ ; RUN: opt -passes=memprof-context-disambiguation \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ -; RUN: --check-prefix=STATS --check-prefix=REMARKS +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. @@ -227,48 +225,6 @@ attributes #6 = { builtin } ; DUMP: Clone of [[BAR]] -; REMARKS: created clone _Z3barv.memprof.1 -; REMARKS: created clone _Z3bazv.memprof.1 -; REMARKS: created clone _Z3foov.memprof.1 -; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 -; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 -; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 -; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold -; REMARKS: call in clone main assigned to call function clone _Z3foov -; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv -; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv -; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold - - -; IR: define {{.*}} @main -;; The first call to foo does not allocate cold memory. It should call the -;; original functions, which ultimately call the original allocation decorated -;; with a "notcold" attribute. -; IR: call {{.*}} @_Z3foov() -;; The second call to foo allocates cold memory. It should call cloned functions -;; which ultimately call a cloned allocation decorated with a "cold" attribute. -; IR: call {{.*}} @_Z3foov.memprof.1() -; IR: define internal {{.*}} @_Z3barv() -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv() -; IR: call {{.*}} @_Z3barv() -; IR: define internal {{.*}} @_Z3foov() -; IR: call {{.*}} @_Z3bazv() -; IR: define internal {{.*}} @_Z3barv.memprof.1() -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv.memprof.1() -; IR: call {{.*}} @_Z3barv.memprof.1() -; IR: define internal {{.*}} @_Z3foov.memprof.1() -; IR: call {{.*}} @_Z3bazv.memprof.1() -; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } -; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } - - -; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis - - ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll index 1f23ad3c6a51b..143f892c18950 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll @@ -1,8 +1,7 @@ ;; Test callsite context graph generation for call graph with with MIBs ;; that have pruned contexts that partially match multiple inlined ;; callsite contexts, requiring duplication of context ids and nodes -;; while matching callsite nodes onto the graph. Also tests graph and IR -;; cloning. +;; while matching callsite nodes onto the graph. ;; ;; Original code looks like: ;; @@ -59,9 +58,7 @@ ; RUN: opt -passes=memprof-context-disambiguation \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ -; RUN: --check-prefix=STATS --check-prefix=REMARKS +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP ; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST @@ -269,39 +266,6 @@ attributes #6 = { builtin } ; DUMP: Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 ; DUMP: Clone of [[D]] -; REMARKS: created clone _Z1Dv.memprof.1 -; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1 -; REMARKS: call in clone _Z1Cv assigned to call function clone _Z1Dv.memprof.1 -; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1 -; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold -; REMARKS: call in clone _Z1Fv assigned to call function clone _Z1Dv -; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold - - -;; The allocation via F does not allocate cold memory. It should call the -;; original D, which ultimately call the original allocation decorated -;; with a "notcold" attribute. -; IR: define internal {{.*}} @_Z1Dv() -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] -; IR: define internal {{.*}} @_Z1Fv() -; IR: call {{.*}} @_Z1Dv() -;; The allocations via B and E allocate cold memory. They should call the -;; cloned D, which ultimately call the cloned allocation decorated with a -;; "cold" attribute. -; IR: define internal {{.*}} @_Z1Bv() -; IR: call {{.*}} @_Z1Dv.memprof.1() -; IR: define internal {{.*}} @_Z1Ev() -; IR: call {{.*}} @_Z1Dv.memprof.1() -; IR: define internal {{.*}} @_Z1Dv.memprof.1() -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] -; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } -; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } - - -; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis - ; DOTPRE: digraph "prestackupdate" { ; DOTPRE: label="prestackupdate"; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll deleted file mode 100644 index b94e9b855b747..0000000000000 --- a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll +++ /dev/null @@ -1,247 +0,0 @@ -;; Test context disambiguation for a callgraph containing multiple memprof -;; contexts and no inlining, where we need to perform additional cloning -;; during function assignment/cloning to handle the combination of contexts -;; to 2 different allocations. -;; -;; void E(char **buf1, char **buf2) { -;; *buf1 = new char[10]; -;; *buf2 = new char[10]; -;; } -;; -;; void B(char **buf1, char **buf2) { -;; E(buf1, buf2); -;; } -;; -;; void C(char **buf1, char **buf2) { -;; E(buf1, buf2); -;; } -;; -;; void D(char **buf1, char **buf2) { -;; E(buf1, buf2); -;; } -;; int main(int argc, char **argv) { -;; char *cold1, *cold2, *default1, *default2, *default3, *default4; -;; B(&default1, &default2); -;; C(&default3, &cold1); -;; D(&cold2, &default4); -;; memset(cold1, 0, 10); -;; memset(cold2, 0, 10); -;; memset(default1, 0, 10); -;; memset(default2, 0, 10); -;; memset(default3, 0, 10); -;; memset(default4, 0, 10); -;; delete[] default1; -;; delete[] default2; -;; delete[] default3; -;; delete[] default4; -;; sleep(10); -;; delete[] cold1; -;; delete[] cold2; -;; return 0; -;; } -;; -;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the -;; memory freed after sleep(10) results in cold lifetimes. -;; -;; The IR was then reduced using llvm-reduce with the expected FileCheck input. - -;; -stats requires asserts -; REQUIRES: asserts - -; RUN: opt -passes=memprof-context-disambiguation \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ -; RUN: --check-prefix=STATS --check-prefix=REMARKS - - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 { -entry: - %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7 - %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15 - ret void -} - -declare ptr @_Znam(i64) #1 - -define internal void @_Z1BPPcS0_(ptr %0, ptr %1) { -entry: - call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16 - ret void -} - -; Function Attrs: noinline -define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 { -entry: - call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17 - ret void -} - -define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 { -entry: - call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18 - ret void -} - -; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 - -declare i32 @sleep() #5 - -; uselistorder directives -uselistorder ptr @_Znam, { 1, 0 } - -attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } -attributes #1 = { "no-trapping-math"="true" } -attributes #2 = { noinline } -attributes #3 = { "frame-pointer"="all" } -attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } -attributes #5 = { "disable-tail-calls"="true" } -attributes #6 = { builtin } - -!0 = !{!1, !3, !5} -!1 = !{!2, !"cold"} -!2 = !{i64 -3461278137325233666, i64 -7799663586031895603} -!3 = !{!4, !"notcold"} -!4 = !{i64 -3461278137325233666, i64 -3483158674395044949} -!5 = !{!6, !"notcold"} -!6 = !{i64 -3461278137325233666, i64 -2441057035866683071} -!7 = !{i64 -3461278137325233666} -!8 = !{!9, !11, !13} -!9 = !{!10, !"notcold"} -!10 = !{i64 -1415475215210681400, i64 -2441057035866683071} -!11 = !{!12, !"cold"} -!12 = !{i64 -1415475215210681400, i64 -3483158674395044949} -!13 = !{!14, !"notcold"} -!14 = !{i64 -1415475215210681400, i64 -7799663586031895603} -!15 = !{i64 -1415475215210681400} -!16 = !{i64 -2441057035866683071} -!17 = !{i64 -3483158674395044949} -!18 = !{i64 -7799663586031895603} - - -;; Originally we create a single clone of each call to new from E, since each -;; allocates cold memory for a single caller. - -; DUMP: CCG after cloning: -; DUMP: Callsite Context Graph: -; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 2 3 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 -; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 -; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]] - -; DUMP: Node [[D:0x[a-z0-9]+]] -; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 6 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 -; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 -; DUMP: CallerEdges: - -; DUMP: Node [[C]] -; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 2 5 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2 -; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 -; DUMP: CallerEdges: - -; DUMP: Node [[B]] -; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 3 4 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3 -; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 -; DUMP: CallerEdges: - -; DUMP: Node [[ENEW2ORIG]] -; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 4 6 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 -; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 -; DUMP: Clones: [[ENEW2CLONE]] - -; DUMP: Node [[ENEW1CLONE]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 1 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 -; DUMP: Clone of [[ENEW1ORIG]] - -; DUMP: Node [[ENEW2CLONE]] -; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 5 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 -; DUMP: Clone of [[ENEW2ORIG]] - - -;; We greedily create a clone of E that is initially used by the clones of the -;; first call to new. However, we end up with an incompatible set of callers -;; given the second call to new which has clones with a different combination of -;; callers. Eventually, we create 2 more clones, and the first clone becomes dead. -; REMARKS: created clone _Z1EPPcS0_.memprof.1 -; REMARKS: created clone _Z1EPPcS0_.memprof.2 -; REMARKS: created clone _Z1EPPcS0_.memprof.3 -; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2 -; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold -; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3 -; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold -; REMARKS: call in clone _Z1BPPcS0_ assigned to call function clone _Z1EPPcS0_ -; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold -; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold -; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold -; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold - - -;; Original version of E is used for the non-cold allocations, both from B. -; IR: define internal {{.*}} @_Z1EPPcS0_( -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] -; IR: define internal {{.*}} @_Z1BPPcS0_( -; IR: call {{.*}} @_Z1EPPcS0_( -;; C calls a clone of E with the first new allocating cold memory and the -;; second allocating non-cold memory. -; IR: define internal {{.*}} @_Z1CPPcS0_( -; IR: call {{.*}} @_Z1EPPcS0_.memprof.3( -;; D calls a clone of E with the first new allocating non-cold memory and the -;; second allocating cold memory. -; IR: define internal {{.*}} @_Z1DPPcS0_( -; IR: call {{.*}} @_Z1EPPcS0_.memprof.2( -;; Transient clone that will get removed as it ends up with no callers. -;; Its calls to new never get updated with a memprof attribute as a result. -; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.1( -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT:[0-9]+]] -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT]] -; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2( -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] -; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3( -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] -; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } -; IR: attributes #[[DEFAULT]] = { builtin } -; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } - - -; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll index f3216aa13d88f..49ca9407d9250 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll @@ -1,7 +1,7 @@ ;; Tests callsite context graph generation for call graph containing indirect ;; calls. Currently this should result in conservative behavior, such that the ;; indirect call receives a null call in its graph node, to prevent subsequent -;; cloning. Also tests graph and IR cloning. +;; cloning. ;; ;; Original code looks like: ;; @@ -57,9 +57,7 @@ ; RUN: opt -passes=memprof-context-disambiguation \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ -; RUN: --check-prefix=STATS --check-prefix=REMARKS +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should only create a single clone of foo, for the direct call @@ -345,41 +343,6 @@ attributes #7 = { builtin } ; DUMP: Clone of [[FOO]] -; REMARKS: created clone _Z3foov.memprof.1 -; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 -; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold -; REMARKS: call in clone _ZN1A1xEv assigned to call function clone _Z3foov -; REMARKS: call in clone _ZN1B1xEv assigned to call function clone _Z3foov -; REMARKS: call in clone main assigned to call function clone _Z3foov -; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold - - -; IR: define {{.*}} @main( -; IR: call {{.*}} @_Z3foov() -;; Only the second call to foo, which allocates cold memory via direct calls, -;; is replaced with a call to a clone that calls a cold allocation. -; IR: call {{.*}} @_Z3foov.memprof.1() -; IR: call {{.*}} @_Z3barP1A( -; IR: call {{.*}} @_Z3barP1A( -; IR: call {{.*}} @_Z3barP1A( -; IR: call {{.*}} @_Z3barP1A( -; IR: define internal {{.*}} @_ZN1A1xEv( -; IR: call {{.*}} @_Z3foov() -; IR: define internal {{.*}} @_ZN1B1xEv( -; IR: call {{.*}} @_Z3foov() -; IR: define internal {{.*}} @_Z3foov() -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3foov.memprof.1() -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] -; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } -; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } - - -; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis - - ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"]; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll index f1b74f13fb148..70a6f39980ede 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll @@ -1,7 +1,6 @@ ;; Test callsite context graph generation for call graph with two memprof ;; contexts and partial inlining, requiring generation of a new fused node to ;; represent the inlined sequence while matching callsite nodes onto the graph. -;; Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -47,9 +46,7 @@ ; RUN: opt -passes=memprof-context-disambiguation \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -stats -pass-remarks=memprof-context-disambiguation \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ -; RUN: --check-prefix=STATS --check-prefix=REMARKS +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should create clones for foo and bar for the call from main to allocate @@ -257,42 +254,6 @@ attributes #7 = { builtin } ; DUMP: Clone of [[BAR]] -; REMARKS: created clone _Z3barv.memprof.1 -; REMARKS: created clone _Z3foov.memprof.1 -; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 -; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1 -; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold -; REMARKS: call in clone main assigned to call function clone _Z3foov -; REMARKS: call in clone _Z3foov assigned to call function clone _Z3barv -; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold -; REMARKS: call in clone _Z3bazv marked with memprof allocation attribute notcold - - -; IR: define internal {{.*}} @_Z3barv() -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3foov() -; IR: call {{.*}} @_Z3barv() -; IR: define {{.*}} @main() -;; The first call to foo does not allocate cold memory. It should call the -;; original functions, which ultimately call the original allocation decorated -;; with a "notcold" attribute. -; IR: call {{.*}} @_Z3foov() -;; The second call to foo allocates cold memory. It should call cloned functions -;; which ultimately call a cloned allocation decorated with a "cold" attribute. -; IR: call {{.*}} @_Z3foov.memprof.1() -; IR: define internal {{.*}} @_Z3barv.memprof.1() -; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3foov.memprof.1() -; IR: call {{.*}} @_Z3barv.memprof.1() -; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } -; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } - - -; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) -; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) -; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis - - ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];