-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[MemProf] Suppress duplicate clones in the LTO backend #161551
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
teresajohnson
merged 2 commits into
llvm:main
from
teresajohnson:memprof_duplicate_clones
Oct 3, 2025
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
#include "llvm/ADT/SmallSet.h" | ||
#include "llvm/ADT/SmallVector.h" | ||
#include "llvm/ADT/Statistic.h" | ||
#include "llvm/ADT/StringExtras.h" | ||
#include "llvm/Analysis/MemoryProfileInfo.h" | ||
#include "llvm/Analysis/ModuleSummaryAnalysis.h" | ||
#include "llvm/Analysis/OptimizationRemarkEmitter.h" | ||
|
@@ -40,6 +41,7 @@ | |
#include "llvm/Support/CommandLine.h" | ||
#include "llvm/Support/GraphWriter.h" | ||
#include "llvm/Support/InterleavedRange.h" | ||
#include "llvm/Support/SHA1.h" | ||
#include "llvm/Support/raw_ostream.h" | ||
#include "llvm/Transforms/IPO.h" | ||
#include "llvm/Transforms/Utils/CallPromotionUtils.h" | ||
|
@@ -60,6 +62,9 @@ STATISTIC(FunctionClonesThinBackend, | |
"Number of function clones created during ThinLTO backend"); | ||
STATISTIC(FunctionsClonedThinBackend, | ||
"Number of functions that had clones created during ThinLTO backend"); | ||
STATISTIC( | ||
FunctionCloneDuplicatesThinBackend, | ||
"Number of function clone duplicates detected during ThinLTO backend"); | ||
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly " | ||
"cloned) during whole program analysis"); | ||
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) " | ||
|
@@ -5186,19 +5191,127 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { | |
return Changed; | ||
} | ||
|
||
// Compute a SHA1 hash of the callsite and alloc version information of clone I
// in the summary, to use in detection of duplicate clones.
//
// \param FS Function summary whose callsite/alloc clone assignments are hashed.
// \param I  Clone index (0 is the original function version).
// \returns  A 64-bit hash of clone I's non-default callsite/alloc assignments.
uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
  SHA1 Hasher;
  // Mix a running record index into every update so the hash depends on
  // *which* callsite/alloc carries a non-default value, not just on the
  // multiset of values. Without the index, e.g. clone A = {callsite0 -> 1,
  // callsite1 -> 0} and clone B = {callsite0 -> 0, callsite1 -> 1} would hash
  // identically, and since duplicates are folded purely by hash (no equality
  // check on collision), two functionally different clones would be
  // incorrectly aliased together.
  unsigned Index = 0;
  auto AddRecord = [&](uint32_t Value) {
    // Fixed-width 8-byte record: 4 bytes of index, 4 bytes of value. This
    // also avoids sizing the buffer from the summary's element type, which
    // silently assumed it was exactly 4 bytes wide.
    uint8_t Data[8];
    support::endian::write32le(Data, Index);
    support::endian::write32le(Data + 4, Value);
    Hasher.update(Data);
  };
  // Update hash with any callsites that call non-default (non-zero) callee
  // versions.
  for (auto &SN : FS->callsites()) {
    // In theory all callsites and allocs in this function should have the same
    // number of clone entries, but handle any discrepancies gracefully below
    // for NDEBUG builds.
    assert(
        SN.Clones.size() > I &&
        "Callsite summary has fewer entries than other summaries in function");
    ++Index;
    if (SN.Clones.size() <= I || !SN.Clones[I])
      continue;
    AddRecord(SN.Clones[I]);
  }
  // Update hash with any allocs that have non-default (non-None) hints. Note
  // the record index keeps counting across both loops, so a callsite value and
  // an alloc value can never occupy the same position in the hash stream.
  for (auto &AN : FS->allocs()) {
    // In theory all callsites and allocs in this function should have the same
    // number of clone entries, but handle any discrepancies gracefully below
    // for NDEBUG builds.
    assert(AN.Versions.size() > I &&
           "Alloc summary has fewer entries than other summaries in function");
    ++Index;
    if (AN.Versions.size() <= I ||
        (AllocationType)AN.Versions[I] == AllocationType::None)
      continue;
    AddRecord(AN.Versions[I]);
  }
  // Fold the 160-bit SHA1 digest down to a 64-bit map key.
  return support::endian::read64le(Hasher.result().data());
}
|
||
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( | ||
Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, | ||
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> | ||
&FuncToAliasMap) { | ||
&FuncToAliasMap, | ||
FunctionSummary *FS) { | ||
auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) { | ||
// We might have created this when adjusting callsite in another | ||
// function. It should be a declaration. | ||
assert(DeclGV->isDeclaration()); | ||
NewGV->takeName(DeclGV); | ||
DeclGV->replaceAllUsesWith(NewGV); | ||
DeclGV->eraseFromParent(); | ||
}; | ||
|
||
// Handle aliases to this function, and create analogous alias clones to the | ||
// provided clone of this function. | ||
auto CloneFuncAliases = [&](Function *NewF, unsigned I) { | ||
if (!FuncToAliasMap.count(&F)) | ||
return; | ||
for (auto *A : FuncToAliasMap[&F]) { | ||
std::string AliasName = getMemProfFuncName(A->getName(), I); | ||
auto *PrevA = M.getNamedAlias(AliasName); | ||
auto *NewA = GlobalAlias::create(A->getValueType(), | ||
A->getType()->getPointerAddressSpace(), | ||
A->getLinkage(), AliasName, NewF); | ||
NewA->copyAttributesFrom(A); | ||
if (PrevA) | ||
TakeDeclNameAndReplace(PrevA, NewA); | ||
} | ||
}; | ||
|
||
// The first "clone" is the original copy, we should only call this if we | ||
// needed to create new clones. | ||
assert(NumClones > 1); | ||
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; | ||
VMaps.reserve(NumClones - 1); | ||
FunctionsClonedThinBackend++; | ||
|
||
// Map of hash of callsite/alloc versions to the instantiated function clone | ||
// (possibly the original) implementing those calls. Used to avoid | ||
// instantiating duplicate function clones. | ||
// FIXME: Ideally the thin link would not generate such duplicate clones to | ||
// start with, but right now it happens due to phase ordering in the function | ||
// assignment and possible new clones that produces. We simply make each | ||
// duplicate an alias to the matching instantiated clone recorded in the map | ||
// (except for available_externally which are made declarations as they would | ||
// be aliases in the prevailing module, and available_externally aliases are | ||
// not well supported right now). | ||
DenseMap<uint64_t, Function *> HashToFunc; | ||
|
||
// Save the hash of the original function version. | ||
HashToFunc[ComputeHash(FS, 0)] = &F; | ||
|
||
for (unsigned I = 1; I < NumClones; I++) { | ||
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>()); | ||
std::string Name = getMemProfFuncName(F.getName(), I); | ||
auto Hash = ComputeHash(FS, I); | ||
// If this clone would duplicate a previously seen clone, don't generate the | ||
// duplicate clone body, just make an alias to satisfy any (potentially | ||
// cross-module) references. | ||
if (HashToFunc.contains(Hash)) { | ||
FunctionCloneDuplicatesThinBackend++; | ||
auto *Func = HashToFunc[Hash]; | ||
if (Func->hasAvailableExternallyLinkage()) { | ||
// Skip these as EliminateAvailableExternallyPass does not handle | ||
// available_externally aliases correctly and we end up with an | ||
// available_externally alias to a declaration. Just create a | ||
// declaration for now as we know we will have a definition in another | ||
// module. | ||
auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType()); | ||
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) | ||
<< "created clone decl " << ore::NV("Decl", Decl.getCallee())); | ||
continue; | ||
} | ||
auto *PrevF = M.getFunction(Name); | ||
auto *Alias = GlobalAlias::create(Name, Func); | ||
if (PrevF) | ||
TakeDeclNameAndReplace(PrevF, Alias); | ||
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) | ||
<< "created clone alias " << ore::NV("Alias", Alias)); | ||
|
||
// Now handle aliases to this function, and clone those as well. | ||
CloneFuncAliases(Func, I); | ||
continue; | ||
} | ||
auto *NewF = CloneFunction(&F, *VMaps.back()); | ||
HashToFunc[Hash] = NewF; | ||
FunctionClonesThinBackend++; | ||
// Strip memprof and callsite metadata from clone as they are no longer | ||
// needed. | ||
|
@@ -5208,40 +5321,17 @@ static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( | |
Inst.setMetadata(LLVMContext::MD_callsite, nullptr); | ||
} | ||
} | ||
std::string Name = getMemProfFuncName(F.getName(), I); | ||
auto *PrevF = M.getFunction(Name); | ||
if (PrevF) { | ||
// We might have created this when adjusting callsite in another | ||
// function. It should be a declaration. | ||
assert(PrevF->isDeclaration()); | ||
NewF->takeName(PrevF); | ||
PrevF->replaceAllUsesWith(NewF); | ||
PrevF->eraseFromParent(); | ||
} else | ||
if (PrevF) | ||
TakeDeclNameAndReplace(PrevF, NewF); | ||
else | ||
NewF->setName(Name); | ||
updateSubprogramLinkageName(NewF, Name); | ||
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) | ||
<< "created clone " << ore::NV("NewFunction", NewF)); | ||
|
||
// Now handle aliases to this function, and clone those as well. | ||
if (!FuncToAliasMap.count(&F)) | ||
continue; | ||
for (auto *A : FuncToAliasMap[&F]) { | ||
std::string Name = getMemProfFuncName(A->getName(), I); | ||
auto *PrevA = M.getNamedAlias(Name); | ||
auto *NewA = GlobalAlias::create(A->getValueType(), | ||
A->getType()->getPointerAddressSpace(), | ||
A->getLinkage(), Name, NewF); | ||
NewA->copyAttributesFrom(A); | ||
if (PrevA) { | ||
// We might have created this when adjusting callsite in another | ||
// function. It should be a declaration. | ||
assert(PrevA->isDeclaration()); | ||
NewA->takeName(PrevA); | ||
PrevA->replaceAllUsesWith(NewA); | ||
PrevA->eraseFromParent(); | ||
} | ||
} | ||
CloneFuncAliases(NewF, I); | ||
} | ||
return VMaps; | ||
} | ||
|
@@ -5401,7 +5491,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; | ||
bool ClonesCreated = false; | ||
unsigned NumClonesCreated = 0; | ||
auto CloneFuncIfNeeded = [&](unsigned NumClones) { | ||
auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) { | ||
// We should at least have version 0 which is the original copy. | ||
assert(NumClones > 0); | ||
// If only one copy needed use original. | ||
|
@@ -5415,7 +5505,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
assert(NumClonesCreated == NumClones); | ||
return; | ||
} | ||
VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); | ||
VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS); | ||
// The first "clone" is the original copy, which doesn't have a VMap. | ||
assert(VMaps.size() == NumClones - 1); | ||
Changed = true; | ||
|
@@ -5424,9 +5514,9 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
}; | ||
|
||
auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, | ||
Function *CalledFunction) { | ||
Function *CalledFunction, FunctionSummary *FS) { | ||
// Perform cloning if not yet done. | ||
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); | ||
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS); | ||
|
||
assert(!isMemProfClone(*CalledFunction)); | ||
|
||
|
@@ -5448,6 +5538,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
// below. | ||
auto CalleeOrigName = CalledFunction->getName(); | ||
for (unsigned J = 0; J < StackNode.Clones.size(); J++) { | ||
// If the VMap is empty, this clone was a duplicate of another and was | ||
// created as an alias or a declaration. | ||
if (J > 0 && VMaps[J - 1]->empty()) | ||
continue; | ||
Comment on lines
+5543
to
+5544
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm wondering if this small block of code warrants a comment:
The same applies to the three more instances of this check below. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done |
||
// Do nothing if this version calls the original version of its | ||
// callee. | ||
if (!StackNode.Clones[J]) | ||
|
@@ -5591,7 +5685,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
#endif | ||
|
||
// Perform cloning if not yet done. | ||
CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); | ||
CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS); | ||
|
||
OrigAllocsThinBackend++; | ||
AllocVersionsThinBackend += AllocNode.Versions.size(); | ||
|
@@ -5624,6 +5718,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
|
||
// Update the allocation types per the summary info. | ||
for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { | ||
// If the VMap is empty, this clone was a duplicate of another and | ||
// was created as an alias or a declaration. | ||
if (J > 0 && VMaps[J - 1]->empty()) | ||
continue; | ||
// Ignore any that didn't get an assigned allocation type. | ||
if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) | ||
continue; | ||
|
@@ -5671,7 +5769,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
// we don't need to do ICP, but might need to clone this | ||
// function as it is the target of other cloned calls. | ||
if (NumClones) | ||
CloneFuncIfNeeded(NumClones); | ||
CloneFuncIfNeeded(NumClones, FS); | ||
} | ||
|
||
else { | ||
|
@@ -5691,7 +5789,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
} | ||
#endif | ||
|
||
CloneCallsite(StackNode, CB, CalledFunction); | ||
CloneCallsite(StackNode, CB, CalledFunction, FS); | ||
} | ||
} else if (CB->isTailCall() && CalledFunction) { | ||
// Locate the synthesized callsite info for the callee VI, if any was | ||
|
@@ -5701,7 +5799,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { | |
if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) { | ||
auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI); | ||
assert(Callsite != MapTailCallCalleeVIToCallsite.end()); | ||
CloneCallsite(Callsite->second, CB, CalledFunction); | ||
CloneCallsite(Callsite->second, CB, CalledFunction, FS); | ||
} | ||
} | ||
} | ||
|
@@ -5847,6 +5945,10 @@ void MemProfContextDisambiguation::performICP( | |
// check. | ||
CallBase *CBClone = CB; | ||
for (unsigned J = 0; J < NumClones; J++) { | ||
// If the VMap is empty, this clone was a duplicate of another and was | ||
// created as an alias or a declaration. | ||
if (J > 0 && VMaps[J - 1]->empty()) | ||
continue; | ||
// Copy 0 is the original function. | ||
if (J > 0) | ||
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); | ||
|
@@ -5892,6 +5994,10 @@ void MemProfContextDisambiguation::performICP( | |
// TotalCount and the number promoted. | ||
CallBase *CBClone = CB; | ||
for (unsigned J = 0; J < NumClones; J++) { | ||
// If the VMap is empty, this clone was a duplicate of another and was | ||
// created as an alias or a declaration. | ||
if (J > 0 && VMaps[J - 1]->empty()) | ||
continue; | ||
// Copy 0 is the original function. | ||
if (J > 0) | ||
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This lambda and the one below were refactored out of existing code to enable reuse