diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index fd35de571a0d9..4b05bae9da007 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -4526,6 +4526,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
 //     If Clone not already assigned to a function clone:
 //       Assign to first function clone without assignment
 //     Assign caller to selected function clone
+// For each call with graph Node having clones:
+//   If number func clones > number call's callsite Node clones:
+//     Record func CallInfo clones without Node clone in UnassignedCallClones
+// For callsite Nodes in DFS order from allocations:
+//   If IsAllocation:
+//     Update allocation with alloc type
+//   Else:
+//     For Call, all MatchingCalls, and associated UnassignedCallClones:
+//       Update call to call recorded callee clone
+//
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
   bool Changed = false;
@@ -4553,6 +4563,34 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
     DenseMap<CallInfo, CallInfo> CallMap;
   };
 
+  // Map to keep track of information needed to update calls in function clones
+  // when their corresponding callsite node was not itself cloned for that
+  // function clone. Because of call context pruning (i.e. we only keep as much
+  // caller information as needed to distinguish hot vs cold), we may not have
+  // caller edges coming to each callsite node from all possible function
+  // callers. A function clone may get created for other callsites in the
+  // function for which there are caller edges that were not pruned. Any other
+  // callsites in that function clone, which were not themselves cloned for
+  // that function clone, should get updated the same way as the corresponding
+  // callsite in the original function (which may call a clone of its callee).
+  //
+  // We build this map after completing function cloning for each function, so
+  // that we can record the information from its call maps before they are
+  // destructed. The map is used as we update calls, to update any still
+  // unassigned call clones. Note that we may create new node clones as we
+  // clone other functions, so later on we check which node clones were still
+  // not created. To this end, the inner map is a map from function clone
+  // number to the list of calls cloned for that function (there can be more
+  // than one due to the Node's MatchingCalls array).
+  //
+  // The alternative is creating new callsite clone nodes below as we clone the
+  // function, but that is trickier to get right and likely more overhead.
+  //
+  // Inner map is a std::map so sorted by key (clone number), in order to get
+  // ordered remarks in the full LTO case.
+  DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
+      UnassignedCallClones;
+
   // Walk all functions for which we saw calls with memprof metadata, and handle
   // cloning for each of its calls.
   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
@@ -4986,6 +5024,63 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
         }
       }
     }
+
+    if (FuncCloneInfos.size() < 2)
+      continue;
+
+    // In this case there is more than just the original function copy.
+    // Record call clones of any callsite nodes in the function that did not
+    // themselves get cloned for all of the function clones.
+    for (auto &Call : CallsWithMetadata) {
+      ContextNode *Node = getNodeForInst(Call);
+      if (!Node || !Node->hasCall() || Node->emptyContextIds())
+        continue;
+      // If Node has enough clones already to cover all function clones, we can
+      // skip it. Need to add one for the original copy.
+      // Use >= in case there were clones that were skipped due to having empty
+      // context ids.
+      if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
+        continue;
+      // First collect all function clones we cloned this callsite node for.
+      // They may not be sequential, e.g. due to skipped empty clones.
+      DenseSet<unsigned> NodeCallClones;
+      for (auto *C : Node->Clones)
+        NodeCallClones.insert(C->Call.cloneNo());
+      unsigned I = 0;
+      // Now check all the function clones.
+      for (auto &FC : FuncCloneInfos) {
+        // Function clones should be sequential.
+        assert(FC.FuncClone.cloneNo() == I);
+        // Skip the first clone which got the original call.
+        // Also skip any other clones created for this Node.
+        if (++I == 1 || NodeCallClones.contains(I)) {
+          continue;
+        }
+        // Record the call clones created for this callsite in this function
+        // clone.
+        auto &CallVector = UnassignedCallClones[Node][I];
+        DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
+        if (auto It = CallMap.find(Call); It != CallMap.end()) {
+          CallInfo CallClone = It->second;
+          CallVector.push_back(CallClone);
+        } else {
+          // All but the original clone (skipped earlier) should have an entry
+          // for all calls.
+          assert(false && "Expected to find call in CallMap");
+        }
+        // Need to do the same for all matching calls.
+        for (auto &MatchingCall : Node->MatchingCalls) {
+          if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
+            CallInfo CallClone = It->second;
+            CallVector.push_back(CallClone);
+          } else {
+            // All but the original clone (skipped earlier) should have an
+            // entry for all calls.
+            assert(false && "Expected to find call in CallMap");
+          }
+        }
+      }
+    }
   }
 
   uint8_t BothTypes =
@@ -5047,6 +5142,26 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
     // Update all the matching calls as well.
     for (auto &Call : Node->MatchingCalls)
       updateCall(Call, CalleeFunc);
+
+    // Now update all calls recorded earlier that are still in function clones
+    // which don't have a clone of this callsite node.
+    if (!UnassignedCallClones.contains(Node))
+      return;
+    DenseSet<unsigned> NodeCallClones;
+    for (auto *C : Node->Clones)
+      NodeCallClones.insert(C->Call.cloneNo());
+    // Note that we already confirmed Node is in this map a few lines above.
+    auto &ClonedCalls = UnassignedCallClones[Node];
+    for (auto &[CloneNo, CallVector] : ClonedCalls) {
+      // Should start at 1 as we never create an entry for the original node.
+      assert(CloneNo > 0);
+      // If we subsequently created a clone, skip this one.
+      if (NodeCallClones.contains(CloneNo))
+        continue;
+      // Use the original Node's CalleeFunc.
+      for (auto &Call : CallVector)
+        updateCall(Call, CalleeFunc);
+    }
   };
 
   // Performs DFS traversal starting from allocation nodes to update calls to
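To make the new bookkeeping above easier to follow, here is a minimal standalone sketch of the same record-then-consume pattern: while the per-clone call maps are still alive, we record, per callsite node and per function clone number, the call copies that did not get their own node clone; later, when the original node's call is updated, the recorded copies get the same update unless a node clone has appeared in the meantime. The types and numbering below (CallInfo, ContextNode, FuncCloneNo) are simplified stand-ins for illustration, not the pass's actual interfaces.

    // Illustrative sketch only; CallInfo/ContextNode are simplified stand-ins.
    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <vector>

    struct CallInfo {
      unsigned FuncCloneNo; // function clone this call copy lives in
    };
    struct ContextNode {
      std::vector<unsigned> CloneNos; // function clones that got a node clone
    };

    static bool hasNodeClone(const ContextNode &N, unsigned CloneNo) {
      return std::find(N.CloneNos.begin(), N.CloneNos.end(), CloneNo) !=
             N.CloneNos.end();
    }

    int main() {
      // A callsite node cloned only for function clone 1, in a function that
      // has the original copy (0) plus clones 1, 2 and 3.
      ContextNode Node{{1}};
      const unsigned NumFuncClones = 4;

      // Record phase: runs while the per-clone call maps still exist. For each
      // function clone lacking its own clone of this node, remember the call
      // copies to fix up later; std::map keeps clone numbers ordered,
      // mirroring the patch's comment about ordered remarks.
      std::map<unsigned, std::vector<CallInfo>> Unassigned;
      for (unsigned I = 1; I < NumFuncClones; ++I)
        if (!hasNodeClone(Node, I))
          Unassigned[I].push_back(CallInfo{I});

      // Consume phase: runs during the later DFS call update. Each recorded
      // call gets the same update as the original node's call (just printed
      // here), unless a node clone was created for it in the meantime.
      for (const auto &[CloneNo, Calls] : Unassigned) {
        if (hasNodeClone(Node, CloneNo))
          continue;
        for (const CallInfo &CI : Calls)
          std::printf("update call copy in function clone %u\n",
                      CI.FuncCloneNo);
      }
      return 0;
    }

The two phases mirror the patch: recording happens right after a function's cloning completes (before its call maps are destructed), and consumption happens during the allocation-rooted DFS, where clones created in the interim are skipped.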
diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning2.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning2.ll
new file mode 100644
index 0000000000000..bcd3cea5b7ff1
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning2.ll
@@ -0,0 +1,143 @@
+;; Similar to funcassigncloning.ll but hand modified to add another allocation
+;; whose pruned cold context only includes an immediate caller node that itself
+;; doesn't need cloning, but calls a cloned allocating function, and is in a
+;; function that gets cloned multiple times for a different callsite. This test
+;; makes sure the non-cloned callsite is correctly updated in all function
+;; clones. This case was missed because, due to context pruning, we don't have
+;; any caller edges for the first callsite, so the handling that kicks in to
+;; "reclone" other callsites in cloned functions was not being triggered.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKS
+
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out
+
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=REMARKS
+
+
+source_filename = "funcassigncloning.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; Eventually this function will be cloned several times (for the calls to new
+;; for the various callers). However, function blah() includes an allocation
+;; whose cold context was trimmed above here. We should therefore assume that
+;; every caller of this function should call the same version of blah (which
+;; will be the cloned ".memprof.1" version).
+; Function Attrs: noinline optnone
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
+entry:
+  call void @blah(), !callsite !19
+  %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
+  %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
+  ret void
+}
+
+; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1
+
+; IR: define {{.*}} @_Z1EPPcS0_
+; IR: call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.2
+; IR: call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.3
+; IR: call {{.*}} @blah.memprof.1()
+
+declare ptr @_Znam(i64)
+
+define internal void @_Z1BPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16
+  ret void
+}
+
+define internal void @_Z1CPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17
+  ret void
+}
+
+define internal void @_Z1DPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18
+  ret void
+}
+
+; Function Attrs: noinline optnone
+define i32 @main() #0 {
+entry:
+  call void @_Z1BPPcS0_()
+  call void @_Z1CPPcS0_()
+  call void @_Z1DPPcS0_()
+  ret i32 0
+}
+
+define internal void @blah() #0 {
+entry:
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21
+  ret void
+}
+
+define internal void @foo() #0 {
+entry:
+  call void @blah(), !callsite !20
+  ret void
+}
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0, 2 }
+
+attributes #0 = { noinline optnone }
+attributes #6 = { builtin }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+!19 = !{i64 123}
+!20 = !{i64 234}
+!21 = !{i64 345}
+!22 = !{!23, !25}
+!23 = !{!24, !"cold"}
+!24 = !{i64 345, i64 123}
+!25 = !{!26, !"notcold"}
+!26 = !{i64 345, i64 234}
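For orientation, the IR in this test (and in the non-ThinLTO variant below) roughly models a C++ program of the following shape. This is a hypothetical reconstruction: the function names mirror the test's mangled symbols, but the bodies and signatures are assumptions made for illustration.

    // Hypothetical source shape; allocations intentionally leak, since only
    // the profiled allocation contexts matter here.
    void blah() {
      // Allocation whose cold context is pruned just above this frame: the
      // profile only needs the immediate caller to tell cold (via E) apart
      // from notcold (via foo), so every clone of E must call the same
      // clone of blah.
      char *P = new char[10];
      (void)P;
    }
    void foo() { blah(); }

    void E(char **Buf1, char **Buf2) {
      blah();                // callsite !19: not itself cloned
      *Buf1 = new char[10];  // cold only when reached from D
      *Buf2 = new char[10];  // cold only when reached from C
    }

    // Three distinct callers force E to be cloned for the two allocations,
    // while the blah() callsite stays un-cloned in each function clone.
    void B(char **B1, char **B2) { E(B1, B2); }
    void C(char **B1, char **B2) { E(B1, B2); }
    void D(char **B1, char **B2) { E(B1, B2); }

    int main() {
      char *X, *Y;
      B(&X, &Y);
      C(&X, &Y);
      D(&X, &Y);
      return 0;
    }

The REMARKS lines then verify that the blah() callsite is redirected to blah.memprof.1 in the original _Z1EPPcS0_ and in every one of its clones, which is exactly the case the new UnassignedCallClones handling covers.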
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning2.ll
new file mode 100644
index 0000000000000..18def1d41c30c
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning2.ll
@@ -0,0 +1,122 @@
+;; Similar to funcassigncloning.ll but hand modified to add another allocation
+;; whose pruned cold context only includes an immediate caller node that itself
+;; doesn't need cloning, but calls a cloned allocating function, and is in a
+;; function that gets cloned multiple times for a different callsite. This test
+;; makes sure the non-cloned callsite is correctly updated in all function
+;; clones. This case was missed because, due to context pruning, we don't have
+;; any caller edges for the first callsite, so the handling that kicks in to
+;; "reclone" other callsites in cloned functions was not being triggered.
+
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=IR --check-prefix=REMARKS
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; Eventually this function will be cloned several times (for the calls to new
+;; for the various callers). However, function blah() includes an allocation
+;; whose cold context was trimmed above here. We should therefore assume that
+;; every caller of this function should call the same version of blah (which
+;; will be the cloned ".memprof.1" version).
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
+entry:
+  call void @blah(), !callsite !19
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7
+  %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15
+  ret void
+}
+
+; REMARKS: created clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1
+
+; IR: define {{.*}} @_Z1EPPcS0_
+; IR: call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.1
+; IR: call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.2
+; IR: call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.3
+; IR: call {{.*}} @blah.memprof.1()
+
+declare ptr @_Znam(i64) #1
+
+define internal void @_Z1BPPcS0_(ptr %0, ptr %1) {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16
+  ret void
+}
+
+; Function Attrs: noinline
+define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17
+  ret void
+}
+
+define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18
+  ret void
+}
+
+define internal void @blah() #0 {
+entry:
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21
+  ret void
+}
+
+define internal void @foo() #0 {
+entry:
+  call void @blah(), !callsite !20
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4
+
+declare i32 @sleep() #5
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0, 2 }
+
+attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" }
+attributes #1 = { "no-trapping-math"="true" }
+attributes #2 = { noinline }
+attributes #3 = { "frame-pointer"="all" }
+attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #5 = { "disable-tail-calls"="true" }
+attributes #6 = { builtin }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+!19 = !{i64 123}
+!20 = !{i64 234}
+!21 = !{i64 345}
+!22 = !{!23, !25}
+!23 = !{!24, !"cold"}
+!24 = !{i64 345, i64 123}
+!25 = !{!26, !"notcold"}
+!26 = !{i64 345, i64 234}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
index e301fa03ea099..0bf622276b328 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
@@ -47,7 +47,7 @@
 ; RUN:  -memprof-allow-recursive-callsites=true \
 ; RUN:  -memprof-clone-recursive-contexts=false \
 ; RUN:  %s -S 2>&1 | FileCheck %s \
-; RUN:  --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof" \
 ; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
 
 ;; Skipping recursive callsites should result in no cloning.
@@ -56,7 +56,7 @@
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
 ; RUN:  -memprof-allow-recursive-callsites=false \
 ; RUN:  %s -S 2>&1 | FileCheck %s \
-; RUN:  --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof" \
 ; RUN:  --implicit-check-not="created clone" \
 ; RUN:  --implicit-check-not="marked with memprof allocation attribute cold" \
 ; RUN:  --check-prefix=ALL
@@ -87,7 +87,7 @@
 ; RUN:  -memprof-allow-recursive-contexts=false \
 ; RUN:  -memprof-clone-recursive-contexts=false \
 ; RUN:  %s -S 2>&1 | FileCheck %s \
-; RUN:  --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof" \
 ; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS
 
 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1